path: root/drivers/acpi/acpica/exresolv.c
Diffstat (limited to 'drivers/acpi/acpica/exresolv.c')
-rw-r--r--  drivers/acpi/acpica/exresolv.c | 4
1 file changed, 2 insertions, 2 deletions
diff --git a/drivers/acpi/acpica/exresolv.c b/drivers/acpi/acpica/exresolv.c
index ac04278ad28..3cde553bcbe 100644
--- a/drivers/acpi/acpica/exresolv.c
+++ b/drivers/acpi/acpica/exresolv.c
@@ -5,7 +5,7 @@
*****************************************************************************/
/*
- * Copyright (C) 2000 - 2013, Intel Corp.
+ * Copyright (C) 2000 - 2014, Intel Corp.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -521,7 +521,7 @@ acpi_ex_resolve_multiple(struct acpi_walk_state *walk_state,
*/
type = obj_desc->common.type;
- exit:
+exit:
/* Convert internal types to external types */
switch (type) {
-rw-r--r--fs/9p/fid.h22
-rw-r--r--fs/9p/v9fs.c212
-rw-r--r--fs/9p/v9fs.h43
-rw-r--r--fs/9p/v9fs_vfs.h11
-rw-r--r--fs/9p/vfs_addr.c29
-rw-r--r--fs/9p/vfs_dentry.c53
-rw-r--r--fs/9p/vfs_dir.c178
-rw-r--r--fs/9p/vfs_file.c280
-rw-r--r--fs/9p/vfs_inode.c659
-rw-r--r--fs/9p/vfs_inode_dotl.c498
-rw-r--r--fs/9p/vfs_super.c110
-rw-r--r--fs/9p/xattr.c53
-rw-r--r--fs/9p/xattr.h4
-rw-r--r--fs/9p/xattr_security.c80
-rw-r--r--fs/9p/xattr_trusted.c80
-rw-r--r--fs/Kconfig72
-rw-r--r--fs/Kconfig.binfmt27
-rw-r--r--fs/Makefile27
-rw-r--r--fs/adfs/Kconfig4
-rw-r--r--fs/adfs/adfs.h13
-rw-r--r--fs/adfs/dir.c50
-rw-r--r--fs/adfs/file.c8
-rw-r--r--fs/adfs/inode.c22
-rw-r--r--fs/adfs/map.c2
-rw-r--r--fs/adfs/super.c41
-rw-r--r--fs/affs/Changes2
-rw-r--r--fs/affs/Kconfig4
-rw-r--r--fs/affs/affs.h65
-rw-r--r--fs/affs/amigaffs.c69
-rw-r--r--fs/affs/bitmap.c53
-rw-r--r--fs/affs/dir.c93
-rw-r--r--fs/affs/file.c84
-rw-r--r--fs/affs/inode.c51
-rw-r--r--fs/affs/namei.c109
-rw-r--r--fs/affs/super.c207
-rw-r--r--fs/affs/symlink.c2
-rw-r--r--fs/afs/Kconfig7
-rw-r--r--fs/afs/afs.h11
-rw-r--r--fs/afs/afs_vl.h2
-rw-r--r--fs/afs/cache.c12
-rw-r--r--fs/afs/callback.c4
-rw-r--r--fs/afs/cell.c4
-rw-r--r--fs/afs/cmservice.c19
-rw-r--r--fs/afs/dir.c170
-rw-r--r--fs/afs/file.c20
-rw-r--r--fs/afs/flock.c15
-rw-r--r--fs/afs/fsclient.c27
-rw-r--r--fs/afs/inode.c26
-rw-r--r--fs/afs/internal.h11
-rw-r--r--fs/afs/main.c4
-rw-r--r--fs/afs/mntpt.c12
-rw-r--r--fs/afs/proc.c130
-rw-r--r--fs/afs/rxrpc.c87
-rw-r--r--fs/afs/security.c6
-rw-r--r--fs/afs/server.c10
-rw-r--r--fs/afs/super.c95
-rw-r--r--fs/afs/vlocation.c17
-rw-r--r--fs/afs/volume.c2
-rw-r--r--fs/afs/write.c58
-rw-r--r--fs/aio.c2232
-rw-r--r--fs/anon_inodes.c83
-rw-r--r--fs/attr.c71
-rw-r--r--fs/autofs4/autofs_i.h53
-rw-r--r--fs/autofs4/dev-ioctl.c90
-rw-r--r--fs/autofs4/expire.c74
-rw-r--r--fs/autofs4/init.c7
-rw-r--r--fs/autofs4/inode.c107
-rw-r--r--fs/autofs4/root.c121
-rw-r--r--fs/autofs4/symlink.c4
-rw-r--r--fs/autofs4/waitq.c69
-rw-r--r--fs/bad_inode.c25
-rw-r--r--fs/befs/ChangeLog10
-rw-r--r--fs/befs/Kconfig4
-rw-r--r--fs/befs/Makefile2
-rw-r--r--fs/befs/befs.h7
-rw-r--r--fs/befs/befs_fs_types.h2
-rw-r--r--fs/befs/btree.c109
-rw-r--r--fs/befs/datastream.c89
-rw-r--r--fs/befs/debug.c74
-rw-r--r--fs/befs/inode.c10
-rw-r--r--fs/befs/io.c24
-rw-r--r--fs/befs/linuxvfs.c264
-rw-r--r--fs/bfs/Kconfig4
-rw-r--r--fs/bfs/dir.c45
-rw-r--r--fs/bfs/file.c24
-rw-r--r--fs/bfs/inode.c28
-rw-r--r--fs/binfmt_aout.c153
-rw-r--r--fs/binfmt_elf.c484
-rw-r--r--fs/binfmt_elf_fdpic.c230
-rw-r--r--fs/binfmt_em86.c10
-rw-r--r--fs/binfmt_flat.c85
-rw-r--r--fs/binfmt_misc.c68
-rw-r--r--fs/binfmt_script.c15
-rw-r--r--fs/binfmt_som.c21
-rw-r--r--fs/bio-integrity.c804
-rw-r--r--fs/bio.c1694
-rw-r--r--fs/block_dev.c482
-rw-r--r--fs/btrfs/Kconfig73
-rw-r--r--fs/btrfs/Makefile13
-rw-r--r--fs/btrfs/acl.c227
-rw-r--r--fs/btrfs/async-thread.c852
-rw-r--r--fs/btrfs/async-thread.h119
-rw-r--r--fs/btrfs/backref.c1883
-rw-r--r--fs/btrfs/backref.h77
-rw-r--r--fs/btrfs/btrfs_inode.h193
-rw-r--r--fs/btrfs/check-integrity.c3295
-rw-r--r--fs/btrfs/check-integrity.h38
-rw-r--r--fs/btrfs/compat.h7
-rw-r--r--fs/btrfs/compression.c208
-rw-r--r--fs/btrfs/compression.h6
-rw-r--r--fs/btrfs/ctree.c3355
-rw-r--r--fs/btrfs/ctree.h2290
-rw-r--r--fs/btrfs/delayed-inode.c1980
-rw-r--r--fs/btrfs/delayed-inode.h156
-rw-r--r--fs/btrfs/delayed-ref.c675
-rw-r--r--fs/btrfs/delayed-ref.h107
-rw-r--r--fs/btrfs/dev-replace.c932
-rw-r--r--fs/btrfs/dev-replace.h44
-rw-r--r--fs/btrfs/dir-item.c191
-rw-r--r--fs/btrfs/disk-io.c3657
-rw-r--r--fs/btrfs/disk-io.h117
-rw-r--r--fs/btrfs/export.c54
-rw-r--r--fs/btrfs/extent-tree.c7117
-rw-r--r--fs/btrfs/extent_io.c3925
-rw-r--r--fs/btrfs/extent_io.h283
-rw-r--r--fs/btrfs/extent_map.c345
-rw-r--r--fs/btrfs/extent_map.h39
-rw-r--r--fs/btrfs/file-item.c545
-rw-r--r--fs/btrfs/file.c2156
-rw-r--r--fs/btrfs/free-space-cache.c3221
-rw-r--r--fs/btrfs/free-space-cache.h71
-rw-r--r--fs/btrfs/hash.c50
-rw-r--r--fs/btrfs/hash.h19
-rw-r--r--fs/btrfs/inode-item.c243
-rw-r--r--fs/btrfs/inode-map.c500
-rw-r--r--fs/btrfs/inode-map.h13
-rw-r--r--fs/btrfs/inode.c6324
-rw-r--r--fs/btrfs/ioctl.c4190
-rw-r--r--fs/btrfs/ioctl.h206
-rw-r--r--fs/btrfs/locking.c361
-rw-r--r--fs/btrfs/locking.h41
-rw-r--r--fs/btrfs/lzo.c30
-rw-r--r--fs/btrfs/math.h (renamed from fs/ocfs2/cluster/ver.h)41
-rw-r--r--fs/btrfs/ordered-data.c692
-rw-r--r--fs/btrfs/ordered-data.h100
-rw-r--r--fs/btrfs/orphan.c22
-rw-r--r--fs/btrfs/print-tree.c140
-rw-r--r--fs/btrfs/print-tree.h2
-rw-r--r--fs/btrfs/props.c427
-rw-r--r--fs/btrfs/props.h42
-rw-r--r--fs/btrfs/qgroup.c2651
-rw-r--r--fs/btrfs/qgroup.h107
-rw-r--r--fs/btrfs/raid56.c2097
-rw-r--r--fs/btrfs/raid56.h51
-rw-r--r--fs/btrfs/rcu-string.h56
-rw-r--r--fs/btrfs/reada.c994
-rw-r--r--fs/btrfs/ref-cache.c232
-rw-r--r--fs/btrfs/ref-cache.h76
-rw-r--r--fs/btrfs/relocation.c917
-rw-r--r--fs/btrfs/root-tree.c416
-rw-r--r--fs/btrfs/scrub.c3489
-rw-r--r--fs/btrfs/send.c5791
-rw-r--r--fs/btrfs/send.h134
-rw-r--r--fs/btrfs/struct-funcs.c183
-rw-r--r--fs/btrfs/super.c1439
-rw-r--r--fs/btrfs/sysfs.c776
-rw-r--r--fs/btrfs/sysfs.h73
-rw-r--r--fs/btrfs/tests/btrfs-tests.c171
-rw-r--r--fs/btrfs/tests/btrfs-tests.h68
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c229
-rw-r--r--fs/btrfs/tests/extent-io-tests.c276
-rw-r--r--fs/btrfs/tests/free-space-tests.c395
-rw-r--r--fs/btrfs/tests/inode-tests.c928
-rw-r--r--fs/btrfs/tests/qgroup-tests.c470
-rw-r--r--fs/btrfs/transaction.c1894
-rw-r--r--fs/btrfs/transaction.h115
-rw-r--r--fs/btrfs/tree-defrag.c24
-rw-r--r--fs/btrfs/tree-log.c2197
-rw-r--r--fs/btrfs/tree-log.h40
-rw-r--r--fs/btrfs/ulist.c251
-rw-r--r--fs/btrfs/ulist.h65
-rw-r--r--fs/btrfs/uuid-tree.c355
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/version.sh43
-rw-r--r--fs/btrfs/volumes.c4839
-rw-r--r--fs/btrfs/volumes.h263
-rw-r--r--fs/btrfs/xattr.c206
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zlib.c38
-rw-r--r--fs/buffer.c567
-rw-r--r--fs/cachefiles/bind.c20
-rw-r--r--fs/cachefiles/daemon.c31
-rw-r--r--fs/cachefiles/interface.c106
-rw-r--r--fs/cachefiles/internal.h33
-rw-r--r--fs/cachefiles/key.c2
-rw-r--r--fs/cachefiles/main.c9
-rw-r--r--fs/cachefiles/namei.c82
-rw-r--r--fs/cachefiles/rdwr.c171
-rw-r--r--fs/cachefiles/security.c10
-rw-r--r--fs/cachefiles/xattr.c55
-rw-r--r--fs/ceph/Kconfig26
-rw-r--r--fs/ceph/Makefile2
-rw-r--r--fs/ceph/acl.c194
-rw-r--r--fs/ceph/addr.c671
-rw-r--r--fs/ceph/cache.c402
-rw-r--r--fs/ceph/cache.h182
-rw-r--r--fs/ceph/caps.c1021
-rw-r--r--fs/ceph/debugfs.c12
-rw-r--r--fs/ceph/dir.c534
-rw-r--r--fs/ceph/export.c266
-rw-r--r--fs/ceph/file.c1020
-rw-r--r--fs/ceph/inode.c821
-rw-r--r--fs/ceph/ioctl.c180
-rw-r--r--fs/ceph/ioctl.h58
-rw-r--r--fs/ceph/locks.c195
-rw-r--r--fs/ceph/mds_client.c730
-rw-r--r--fs/ceph/mds_client.h36
-rw-r--r--fs/ceph/mdsmap.c62
-rw-r--r--fs/ceph/snap.c74
-rw-r--r--fs/ceph/strings.c7
-rw-r--r--fs/ceph/super.c254
-rw-r--r--fs/ceph/super.h232
-rw-r--r--fs/ceph/xattr.c578
-rw-r--r--fs/char_dev.c33
-rw-r--r--fs/cifs/AUTHORS55
-rw-r--r--fs/cifs/CHANGES1065
-rw-r--r--fs/cifs/Kconfig74
-rw-r--r--fs/cifs/Makefile7
-rw-r--r--fs/cifs/README773
-rw-r--r--fs/cifs/TODO129
-rw-r--r--fs/cifs/asn1.c93
-rw-r--r--fs/cifs/cache.c14
-rw-r--r--fs/cifs/cifs_debug.c345
-rw-r--r--fs/cifs/cifs_debug.h80
-rw-r--r--fs/cifs/cifs_dfs_ref.c190
-rw-r--r--fs/cifs/cifs_fs_sb.h21
-rw-r--r--fs/cifs/cifs_spnego.c25
-rw-r--r--fs/cifs/cifs_spnego.h2
-rw-r--r--fs/cifs/cifs_unicode.c177
-rw-r--r--fs/cifs/cifs_unicode.h35
-rw-r--r--fs/cifs/cifsacl.c843
-rw-r--r--fs/cifs/cifsacl.h51
-rw-r--r--fs/cifs/cifsencrypt.c569
-rw-r--r--fs/cifs/cifsfs.c821
-rw-r--r--fs/cifs/cifsfs.h65
-rw-r--r--fs/cifs/cifsglob.h997
-rw-r--r--fs/cifs/cifspdu.h200
-rw-r--r--fs/cifs/cifsproto.h533
-rw-r--r--fs/cifs/cifssmb.c2819
-rw-r--r--fs/cifs/connect.c3974
-rw-r--r--fs/cifs/dir.c811
-rw-r--r--fs/cifs/dns_resolve.c21
-rw-r--r--fs/cifs/export.c6
-rw-r--r--fs/cifs/file.c3568
-rw-r--r--fs/cifs/fscache.c83
-rw-r--r--fs/cifs/fscache.h21
-rw-r--r--fs/cifs/inode.c1601
-rw-r--r--fs/cifs/ioctl.c194
-rw-r--r--fs/cifs/link.c467
-rw-r--r--fs/cifs/misc.c475
-rw-r--r--fs/cifs/netmisc.c69
-rw-r--r--fs/cifs/nterr.c6
-rw-r--r--fs/cifs/nterr.h22
-rw-r--r--fs/cifs/ntlmssp.h10
-rw-r--r--fs/cifs/readdir.c914
-rw-r--r--fs/cifs/sess.c427
-rw-r--r--fs/cifs/smb1ops.c1107
-rw-r--r--fs/cifs/smb2file.c265
-rw-r--r--fs/cifs/smb2glob.h63
-rw-r--r--fs/cifs/smb2inode.c273
-rw-r--r--fs/cifs/smb2maperror.c2479
-rw-r--r--fs/cifs/smb2misc.c609
-rw-r--r--fs/cifs/smb2ops.c1413
-rw-r--r--fs/cifs/smb2pdu.c2616
-rw-r--r--fs/cifs/smb2pdu.h1053
-rw-r--r--fs/cifs/smb2proto.h168
-rw-r--r--fs/cifs/smb2status.h1782
-rw-r--r--fs/cifs/smb2transport.c607
-rw-r--r--fs/cifs/smbdes.c418
-rw-r--r--fs/cifs/smbencrypt.c323
-rw-r--r--fs/cifs/smbfsctl.h41
-rw-r--r--fs/cifs/transport.c945
-rw-r--r--fs/cifs/winucase.c663
-rw-r--r--fs/cifs/xattr.c200
-rw-r--r--fs/coda/cache.c14
-rw-r--r--fs/coda/cnode.c44
-rw-r--r--fs/coda/coda_fs_i.h6
-rw-r--r--fs/coda/coda_int.h4
-rw-r--r--fs/coda/coda_linux.c10
-rw-r--r--fs/coda/coda_linux.h17
-rw-r--r--fs/coda/dir.c158
-rw-r--r--fs/coda/file.c28
-rw-r--r--fs/coda/inode.c94
-rw-r--r--fs/coda/pioctl.c8
-rw-r--r--fs/coda/psdev.c47
-rw-r--r--fs/coda/sysctl.c13
-rw-r--r--fs/coda/upcall.c25
-rw-r--r--fs/compat.c1158
-rw-r--r--fs/compat_binfmt_elf.c12
-rw-r--r--fs/compat_ioctl.c100
-rw-r--r--fs/configfs/configfs_internal.h15
-rw-r--r--fs/configfs/dir.c249
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/configfs/inode.c80
-rw-r--r--fs/configfs/item.c60
-rw-r--r--fs/configfs/mount.c55
-rw-r--r--fs/configfs/symlink.c12
-rw-r--r--fs/coredump.c743
-rw-r--r--fs/cramfs/Kconfig5
-rw-r--r--fs/cramfs/inode.c119
-rw-r--r--fs/cramfs/internal.h4
-rw-r--r--fs/cramfs/uncompress.c2
-rw-r--r--fs/dcache.c2533
-rw-r--r--fs/dcookies.c20
-rw-r--r--fs/debugfs/file.c299
-rw-r--r--fs/debugfs/inode.c342
-rw-r--r--fs/devpts/inode.c233
-rw-r--r--fs/direct-io.c924
-rw-r--r--fs/dlm/Kconfig2
-rw-r--r--fs/dlm/ast.c262
-rw-r--r--fs/dlm/ast.h15
-rw-r--r--fs/dlm/config.c315
-rw-r--r--fs/dlm/config.h23
-rw-r--r--fs/dlm/debug_fs.c158
-rw-r--r--fs/dlm/dir.c297
-rw-r--r--fs/dlm/dir.h7
-rw-r--r--fs/dlm/dlm_internal.h216
-rw-r--r--fs/dlm/lock.c2239
-rw-r--r--fs/dlm/lock.h14
-rw-r--r--fs/dlm/lockspace.c355
-rw-r--r--fs/dlm/lowcomms.c466
-rw-r--r--fs/dlm/lowcomms.h2
-rw-r--r--fs/dlm/main.c4
-rw-r--r--fs/dlm/member.c506
-rw-r--r--fs/dlm/member.h10
-rw-r--r--fs/dlm/memory.c28
-rw-r--r--fs/dlm/memory.h2
-rw-r--r--fs/dlm/netlink.c18
-rw-r--r--fs/dlm/plock.c91
-rw-r--r--fs/dlm/rcom.c267
-rw-r--r--fs/dlm/rcom.h3
-rw-r--r--fs/dlm/recover.c448
-rw-r--r--fs/dlm/recover.h2
-rw-r--r--fs/dlm/recoverd.c115
-rw-r--r--fs/dlm/recoverd.h1
-rw-r--r--fs/dlm/requestqueue.c43
-rw-r--r--fs/dlm/user.c50
-rw-r--r--fs/drop_caches.c24
-rw-r--r--fs/ecryptfs/Kconfig12
-rw-r--r--fs/ecryptfs/Makefile7
-rw-r--r--fs/ecryptfs/crypto.c659
-rw-r--r--fs/ecryptfs/dentry.c51
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h329
-rw-r--r--fs/ecryptfs/file.c186
-rw-r--r--fs/ecryptfs/inode.c650
-rw-r--r--fs/ecryptfs/keystore.c382
-rw-r--r--fs/ecryptfs/kthread.c87
-rw-r--r--fs/ecryptfs/main.c233
-rw-r--r--fs/ecryptfs/messaging.c149
-rw-r--r--fs/ecryptfs/miscdev.c258
-rw-r--r--fs/ecryptfs/mmap.c46
-rw-r--r--fs/ecryptfs/read_write.c137
-rw-r--r--fs/ecryptfs/super.c55
-rw-r--r--fs/efivarfs/Kconfig12
-rw-r--r--fs/efivarfs/Makefile7
-rw-r--r--fs/efivarfs/file.c111
-rw-r--r--fs/efivarfs/inode.c162
-rw-r--r--fs/efivarfs/internal.h22
-rw-r--r--fs/efivarfs/super.c260
-rw-r--r--fs/efs/Kconfig4
-rw-r--r--fs/efs/dir.c89
-rw-r--r--fs/efs/efs.h8
-rw-r--r--fs/efs/file.c14
-rw-r--r--fs/efs/inode.c51
-rw-r--r--fs/efs/namei.c18
-rw-r--r--fs/efs/super.c94
-rw-r--r--fs/eventfd.c47
-rw-r--r--fs/eventpoll.c756
-rw-r--r--fs/exec.c1293
-rw-r--r--fs/exofs/Kbuild6
-rw-r--r--fs/exofs/Kconfig.ore14
-rw-r--r--fs/exofs/common.h22
-rw-r--r--fs/exofs/dir.c77
-rw-r--r--fs/exofs/exofs.h189
-rw-r--r--fs/exofs/file.c34
-rw-r--r--fs/exofs/inode.c475
-rw-r--r--fs/exofs/ios.c803
-rw-r--r--fs/exofs/namei.c30
-rw-r--r--fs/exofs/ore.c1164
-rw-r--r--fs/exofs/ore_raid.c721
-rw-r--r--fs/exofs/ore_raid.h62
-rw-r--r--fs/exofs/pnfs.h45
-rw-r--r--fs/exofs/super.c538
-rw-r--r--fs/exofs/sys.c205
-rw-r--r--fs/exportfs/expfs.c352
-rw-r--r--fs/ext2/acl.c262
-rw-r--r--fs/ext2/acl.h11
-rw-r--r--fs/ext2/balloc.c61
-rw-r--r--fs/ext2/dir.c33
-rw-r--r--fs/ext2/ext2.h648
-rw-r--r--fs/ext2/file.c17
-rw-r--r--fs/ext2/ialloc.c17
-rw-r--r--fs/ext2/inode.c79
-rw-r--r--fs/ext2/ioctl.c36
-rw-r--r--fs/ext2/namei.c71
-rw-r--r--fs/ext2/super.c131
-rw-r--r--fs/ext2/xattr.c26
-rw-r--r--fs/ext2/xattr.h2
-rw-r--r--fs/ext2/xattr_security.c40
-rw-r--r--fs/ext2/xattr_trusted.c6
-rw-r--r--fs/ext2/xattr_user.c1
-rw-r--r--fs/ext2/xip.c3
-rw-r--r--fs/ext3/acl.c300
-rw-r--r--fs/ext3/acl.h13
-rw-r--r--fs/ext3/balloc.c162
-rw-r--r--fs/ext3/bitmap.c16
-rw-r--r--fs/ext3/dir.c360
-rw-r--r--fs/ext3/ext3.h1326
-rw-r--r--fs/ext3/ext3_jbd.c2
-rw-r--r--fs/ext3/file.c20
-rw-r--r--fs/ext3/fsync.c38
-rw-r--r--fs/ext3/hash.c8
-rw-r--r--fs/ext3/ialloc.c94
-rw-r--r--fs/ext3/inode.c504
-rw-r--r--fs/ext3/ioctl.c63
-rw-r--r--fs/ext3/namei.c249
-rw-r--r--fs/ext3/namei.h19
-rw-r--r--fs/ext3/resize.c19
-rw-r--r--fs/ext3/super.c279
-rw-r--r--fs/ext3/symlink.c4
-rw-r--r--fs/ext3/xattr.c31
-rw-r--r--fs/ext3/xattr.h2
-rw-r--r--fs/ext3/xattr_security.c44
-rw-r--r--fs/ext3/xattr_trusted.c7
-rw-r--r--fs/ext3/xattr_user.c6
-rw-r--r--fs/ext4/Kconfig22
-rw-r--r--fs/ext4/Makefile5
-rw-r--r--fs/ext4/acl.c291
-rw-r--r--fs/ext4/acl.h13
-rw-r--r--fs/ext4/balloc.c814
-rw-r--r--fs/ext4/bitmap.c95
-rw-r--r--fs/ext4/block_validity.c55
-rw-r--r--fs/ext4/dir.c394
-rw-r--r--fs/ext4/ext4.h1134
-rw-r--r--fs/ext4/ext4_extents.h98
-rw-r--r--fs/ext4/ext4_jbd2.c179
-rw-r--r--fs/ext4/ext4_jbd2.h247
-rw-r--r--fs/ext4/extents.c4170
-rw-r--r--fs/ext4/extents_status.c1127
-rw-r--r--fs/ext4/extents_status.h146
-rw-r--r--fs/ext4/file.c523
-rw-r--r--fs/ext4/fsync.c180
-rw-r--r--fs/ext4/hash.c10
-rw-r--r--fs/ext4/ialloc.c741
-rw-r--r--fs/ext4/indirect.c1391
-rw-r--r--fs/ext4/inline.c2000
-rw-r--r--fs/ext4/inode.c5435
-rw-r--r--fs/ext4/ioctl.c430
-rw-r--r--fs/ext4/mballoc.c1875
-rw-r--r--fs/ext4/mballoc.h51
-rw-r--r--fs/ext4/migrate.c208
-rw-r--r--fs/ext4/mmp.c395
-rw-r--r--fs/ext4/move_extent.c648
-rw-r--r--fs/ext4/namei.c1941
-rw-r--r--fs/ext4/page-io.c553
-rw-r--r--fs/ext4/resize.c1706
-rw-r--r--fs/ext4/super.c3217
-rw-r--r--fs/ext4/symlink.c4
-rw-r--r--fs/ext4/truncate.h43
-rw-r--r--fs/ext4/xattr.c334
-rw-r--r--fs/ext4/xattr.h103
-rw-r--r--fs/ext4/xattr_security.c38
-rw-r--r--fs/ext4/xattr_trusted.c1
-rw-r--r--fs/ext4/xattr_user.c1
-rw-r--r--fs/f2fs/Kconfig73
-rw-r--r--fs/f2fs/Makefile7
-rw-r--r--fs/f2fs/acl.c280
-rw-r--r--fs/f2fs/acl.h53
-rw-r--r--fs/f2fs/checkpoint.c969
-rw-r--r--fs/f2fs/data.c1124
-rw-r--r--fs/f2fs/debug.c372
-rw-r--r--fs/f2fs/dir.c701
-rw-r--r--fs/f2fs/f2fs.h1424
-rw-r--r--fs/f2fs/file.c829
-rw-r--r--fs/f2fs/gc.c754
-rw-r--r--fs/f2fs/gc.h110
-rw-r--r--fs/f2fs/hash.c101
-rw-r--r--fs/f2fs/inline.c250
-rw-r--r--fs/f2fs/inode.c300
-rw-r--r--fs/f2fs/namei.c537
-rw-r--r--fs/f2fs/node.c1994
-rw-r--r--fs/f2fs/node.h369
-rw-r--r--fs/f2fs/recovery.c469
-rw-r--r--fs/f2fs/segment.c2041
-rw-r--r--fs/f2fs/segment.h709
-rw-r--r--fs/f2fs/super.c1221
-rw-r--r--fs/f2fs/xattr.c616
-rw-r--r--fs/f2fs/xattr.h150
-rw-r--r--fs/fat/Makefile2
-rw-r--r--fs/fat/cache.c17
-rw-r--r--fs/fat/dir.c473
-rw-r--r--fs/fat/fat.h177
-rw-r--r--fs/fat/fatent.c38
-rw-r--r--fs/fat/file.c62
-rw-r--r--fs/fat/inode.c948
-rw-r--r--fs/fat/misc.c56
-rw-r--r--fs/fat/namei_msdos.c94
-rw-r--r--fs/fat/namei_vfat.c175
-rw-r--r--fs/fat/nfs.c301
-rw-r--r--fs/fcntl.c269
-rw-r--r--fs/fhandle.c32
-rw-r--r--fs/fifo.c154
-rw-r--r--fs/file.c700
-rw-r--r--fs/file_table.c403
-rw-r--r--fs/filesystems.c12
-rw-r--r--fs/freevxfs/vxfs_fshead.c2
-rw-r--r--fs/freevxfs/vxfs_inode.c17
-rw-r--r--fs/freevxfs/vxfs_lookup.c65
-rw-r--r--fs/freevxfs/vxfs_olt.h2
-rw-r--r--fs/freevxfs/vxfs_subr.c1
-rw-r--r--fs/freevxfs/vxfs_super.c12
-rw-r--r--fs/fs-writeback.c980
-rw-r--r--fs/fs_struct.c91
-rw-r--r--fs/fscache/cache.c51
-rw-r--r--fs/fscache/cookie.c369
-rw-r--r--fs/fscache/fsdef.c2
-rw-r--r--fs/fscache/histogram.c6
-rw-r--r--fs/fscache/internal.h58
-rw-r--r--fs/fscache/main.c22
-rw-r--r--fs/fscache/netfs.c9
-rw-r--r--fs/fscache/object-list.c116
-rw-r--r--fs/fscache/object.c1113
-rw-r--r--fs/fscache/operation.c166
-rw-r--r--fs/fscache/page.c381
-rw-r--r--fs/fscache/stats.c19
-rw-r--r--fs/fuse/Kconfig16
-rw-r--r--fs/fuse/control.c21
-rw-r--r--fs/fuse/cuse.c81
-rw-r--r--fs/fuse/dev.c384
-rw-r--r--fs/fuse/dir.c875
-rw-r--r--fs/fuse/file.c1527
-rw-r--r--fs/fuse/fuse_i.h182
-rw-r--r--fs/fuse/inode.c265
-rw-r--r--fs/generic_acl.c225
-rw-r--r--fs/gfs2/Kconfig10
-rw-r--r--fs/gfs2/Makefile4
-rw-r--r--fs/gfs2/acl.c303
-rw-r--r--fs/gfs2/acl.h8
-rw-r--r--fs/gfs2/aops.c349
-rw-r--r--fs/gfs2/bmap.c489
-rw-r--r--fs/gfs2/bmap.h2
-rw-r--r--fs/gfs2/dentry.c18
-rw-r--r--fs/gfs2/dir.c742
-rw-r--r--fs/gfs2/dir.h29
-rw-r--r--fs/gfs2/export.c38
-rw-r--r--fs/gfs2/file.c618
-rw-r--r--fs/gfs2/glock.c780
-rw-r--r--fs/gfs2/glock.h67
-rw-r--r--fs/gfs2/glops.c353
-rw-r--r--fs/gfs2/glops.h4
-rw-r--r--fs/gfs2/incore.h377
-rw-r--r--fs/gfs2/inode.c2059
-rw-r--r--fs/gfs2/inode.h19
-rw-r--r--fs/gfs2/lock_dlm.c1151
-rw-r--r--fs/gfs2/log.c841
-rw-r--r--fs/gfs2/log.h32
-rw-r--r--fs/gfs2/lops.c784
-rw-r--r--fs/gfs2/lops.h27
-rw-r--r--fs/gfs2/main.c55
-rw-r--r--fs/gfs2/meta_io.c125
-rw-r--r--fs/gfs2/meta_io.h36
-rw-r--r--fs/gfs2/ops_fstype.c398
-rw-r--r--fs/gfs2/ops_inode.c1344
-rw-r--r--fs/gfs2/quota.c883
-rw-r--r--fs/gfs2/quota.h23
-rw-r--r--fs/gfs2/recovery.c51
-rw-r--r--fs/gfs2/recovery.h6
-rw-r--r--fs/gfs2/rgrp.c2237
-rw-r--r--fs/gfs2/rgrp.h67
-rw-r--r--fs/gfs2/super.c511
-rw-r--r--fs/gfs2/super.h3
-rw-r--r--fs/gfs2/sys.c175
-rw-r--r--fs/gfs2/sys.h2
-rw-r--r--fs/gfs2/trace_gfs2.h178
-rw-r--r--fs/gfs2/trans.c216
-rw-r--r--fs/gfs2/trans.h22
-rw-r--r--fs/gfs2/util.c126
-rw-r--r--fs/gfs2/util.h51
-rw-r--r--fs/gfs2/xattr.c233
-rw-r--r--fs/hfs/Kconfig4
-rw-r--r--fs/hfs/bfind.c10
-rw-r--r--fs/hfs/bitmap.c4
-rw-r--r--fs/hfs/bnode.c45
-rw-r--r--fs/hfs/brec.c19
-rw-r--r--fs/hfs/btree.c45
-rw-r--r--fs/hfs/btree.h5
-rw-r--r--fs/hfs/catalog.c24
-rw-r--r--fs/hfs/dir.c83
-rw-r--r--fs/hfs/extent.c70
-rw-r--r--fs/hfs/hfs_fs.h50
-rw-r--r--fs/hfs/inode.c103
-rw-r--r--fs/hfs/mdb.c40
-rw-r--r--fs/hfs/string.c6
-rw-r--r--fs/hfs/super.c151
-rw-r--r--fs/hfs/sysdep.c4
-rw-r--r--fs/hfs/trans.c2
-rw-r--r--fs/hfsplus/Kconfig18
-rw-r--r--fs/hfsplus/Makefile4
-rw-r--r--fs/hfsplus/acl.h27
-rw-r--r--fs/hfsplus/attributes.c371
-rw-r--r--fs/hfsplus/bfind.c103
-rw-r--r--fs/hfsplus/bitmap.c26
-rw-r--r--fs/hfsplus/bnode.c85
-rw-r--r--fs/hfsplus/brec.c39
-rw-r--r--fs/hfsplus/btree.c154
-rw-r--r--fs/hfsplus/catalog.c110
-rw-r--r--fs/hfsplus/dir.c188
-rw-r--r--fs/hfsplus/extents.c152
-rw-r--r--fs/hfsplus/hfsplus_fs.h277
-rw-r--r--fs/hfsplus/hfsplus_raw.h88
-rw-r--r--fs/hfsplus/inode.c183
-rw-r--r--fs/hfsplus/ioctl.c155
-rw-r--r--fs/hfsplus/options.c46
-rw-r--r--fs/hfsplus/part_tbl.c32
-rw-r--r--fs/hfsplus/posix_acl.c140
-rw-r--r--fs/hfsplus/super.c257
-rw-r--r--fs/hfsplus/unicode.c49
-rw-r--r--fs/hfsplus/wrapper.c128
-rw-r--r--fs/hfsplus/xattr.c875
-rw-r--r--fs/hfsplus/xattr.h49
-rw-r--r--fs/hfsplus/xattr_security.c124
-rw-r--r--fs/hfsplus/xattr_trusted.c71
-rw-r--r--fs/hfsplus/xattr_user.c71
-rw-r--r--fs/hostfs/hostfs.h7
-rw-r--r--fs/hostfs/hostfs_kern.c169
-rw-r--r--fs/hostfs/hostfs_user.c6
-rw-r--r--fs/hpfs/Kconfig1
-rw-r--r--fs/hpfs/alloc.c184
-rw-r--r--fs/hpfs/anode.c179
-rw-r--r--fs/hpfs/buffer.c160
-rw-r--r--fs/hpfs/dentry.c7
-rw-r--r--fs/hpfs/dir.c102
-rw-r--r--fs/hpfs/dnode.c220
-rw-r--r--fs/hpfs/ea.c190
-rw-r--r--fs/hpfs/file.c137
-rw-r--r--fs/hpfs/hpfs.h504
-rw-r--r--fs/hpfs/hpfs_fn.h123
-rw-r--r--fs/hpfs/inode.c88
-rw-r--r--fs/hpfs/map.c113
-rw-r--r--fs/hpfs/name.c44
-rw-r--r--fs/hpfs/namei.c140
-rw-r--r--fs/hpfs/super.c273
-rw-r--r--fs/hppfs/hppfs.c144
-rw-r--r--fs/hugetlbfs/inode.c403
-rw-r--r--fs/inode.c1193
-rw-r--r--fs/internal.h89
-rw-r--r--fs/ioctl.c45
-rw-r--r--fs/ioprio.c250
-rw-r--r--fs/isofs/compress.c2
-rw-r--r--fs/isofs/dir.c45
-rw-r--r--fs/isofs/export.c20
-rw-r--r--fs/isofs/inode.c137
-rw-r--r--fs/isofs/isofs.h13
-rw-r--r--fs/isofs/namei.c18
-rw-r--r--fs/isofs/rock.c11
-rw-r--r--fs/jbd/checkpoint.c67
-rw-r--r--fs/jbd/commit.c183
-rw-r--r--fs/jbd/journal.c405
-rw-r--r--fs/jbd/recovery.c7
-rw-r--r--fs/jbd/revoke.c48
-rw-r--r--fs/jbd/transaction.c212
-rw-r--r--fs/jbd2/Kconfig8
-rw-r--r--fs/jbd2/checkpoint.c197
-rw-r--r--fs/jbd2/commit.c512
-rw-r--r--fs/jbd2/journal.c1054
-rw-r--r--fs/jbd2/recovery.c161
-rw-r--r--fs/jbd2/revoke.c112
-rw-r--r--fs/jbd2/transaction.c888
-rw-r--r--fs/jffs2/Kconfig10
-rw-r--r--fs/jffs2/TODO2
-rw-r--r--fs/jffs2/acl.c204
-rw-r--r--fs/jffs2/acl.h13
-rw-r--r--fs/jffs2/background.c41
-rw-r--r--fs/jffs2/build.c6
-rw-r--r--fs/jffs2/compr.c158
-rw-r--r--fs/jffs2/compr.h2
-rw-r--r--fs/jffs2/compr_lzo.c1
-rw-r--r--fs/jffs2/compr_rtime.c4
-rw-r--r--fs/jffs2/compr_rubin.c2
-rw-r--r--fs/jffs2/compr_zlib.c45
-rw-r--r--fs/jffs2/debug.c22
-rw-r--r--fs/jffs2/debug.h50
-rw-r--r--fs/jffs2/dir.c131
-rw-r--r--fs/jffs2/erase.c91
-rw-r--r--fs/jffs2/file.c98
-rw-r--r--fs/jffs2/fs.c138
-rw-r--r--fs/jffs2/gc.c324
-rw-r--r--fs/jffs2/jffs2_fs_sb.h17
-rw-r--r--fs/jffs2/malloc.c6
-rw-r--r--fs/jffs2/nodelist.c58
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/nodemgmt.c276
-rw-r--r--fs/jffs2/os-linux.h23
-rw-r--r--fs/jffs2/read.c70
-rw-r--r--fs/jffs2/readinode.c86
-rw-r--r--fs/jffs2/scan.c250
-rw-r--r--fs/jffs2/security.c35
-rw-r--r--fs/jffs2/summary.c20
-rw-r--r--fs/jffs2/super.c180
-rw-r--r--fs/jffs2/symlink.c8
-rw-r--r--fs/jffs2/wbuf.c260
-rw-r--r--fs/jffs2/write.c113
-rw-r--r--fs/jffs2/writev.c32
-rw-r--r--fs/jffs2/xattr.c36
-rw-r--r--fs/jffs2/xattr.h2
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/acl.c140
-rw-r--r--fs/jfs/file.c43
-rw-r--r--fs/jfs/inode.c40
-rw-r--r--fs/jfs/ioctl.c49
-rw-r--r--fs/jfs/jfs_acl.h9
-rw-r--r--fs/jfs/jfs_discard.c121
-rw-r--r--fs/jfs/jfs_discard.h26
-rw-r--r--fs/jfs/jfs_dmap.c206
-rw-r--r--fs/jfs/jfs_dmap.h2
-rw-r--r--fs/jfs/jfs_dtree.c125
-rw-r--r--fs/jfs/jfs_dtree.h2
-rw-r--r--fs/jfs/jfs_extent.c8
-rw-r--r--fs/jfs/jfs_filsys.h3
-rw-r--r--fs/jfs/jfs_imap.c125
-rw-r--r--fs/jfs/jfs_incore.h12
-rw-r--r--fs/jfs/jfs_inode.c21
-rw-r--r--fs/jfs/jfs_inode.h4
-rw-r--r--fs/jfs/jfs_logmgr.c30
-rw-r--r--fs/jfs/jfs_logmgr.h2
-rw-r--r--fs/jfs/jfs_metapage.c20
-rw-r--r--fs/jfs/jfs_metapage.h2
-rw-r--r--fs/jfs/jfs_superblock.h1
-rw-r--r--fs/jfs/jfs_txnmgr.c23
-rw-r--r--fs/jfs/jfs_umount.c4
-rw-r--r--fs/jfs/jfs_xattr.h2
-rw-r--r--fs/jfs/jfs_xtree.c62
-rw-r--r--fs/jfs/namei.c113
-rw-r--r--fs/jfs/resize.c8
-rw-r--r--fs/jfs/super.c229
-rw-r--r--fs/jfs/xattr.c186
-rw-r--r--fs/kernfs/Kconfig7
-rw-r--r--fs/kernfs/Makefile5
-rw-r--r--fs/kernfs/dir.c1432
-rw-r--r--fs/kernfs/file.c952
-rw-r--r--fs/kernfs/inode.c383
-rw-r--r--fs/kernfs/kernfs-internal.h120
-rw-r--r--fs/kernfs/mount.c250
-rw-r--r--fs/kernfs/symlink.c147
-rw-r--r--fs/libfs.c318
-rw-r--r--fs/lockd/clnt4xdr.c14
-rw-r--r--fs/lockd/clntlock.c46
-rw-r--r--fs/lockd/clntproc.c47
-rw-r--r--fs/lockd/clntxdr.c22
-rw-r--r--fs/lockd/grace.c16
-rw-r--r--fs/lockd/host.c182
-rw-r--r--fs/lockd/mon.c110
-rw-r--r--fs/lockd/netns.h23
-rw-r--r--fs/lockd/svc.c293
-rw-r--r--fs/lockd/svc4proc.c14
-rw-r--r--fs/lockd/svclock.c129
-rw-r--r--fs/lockd/svcproc.c19
-rw-r--r--fs/lockd/svcsubs.c47
-rw-r--r--fs/lockd/xdr.c2
-rw-r--r--fs/locks.c1174
-rw-r--r--fs/logfs/Kconfig4
-rw-r--r--fs/logfs/dev_bdev.c76
-rw-r--r--fs/logfs/dev_mtd.c80
-rw-r--r--fs/logfs/dir.c106
-rw-r--r--fs/logfs/file.c26
-rw-r--r--fs/logfs/gc.c2
-rw-r--r--fs/logfs/inode.c39
-rw-r--r--fs/logfs/journal.c3
-rw-r--r--fs/logfs/logfs.h10
-rw-r--r--fs/logfs/readwrite.c118
-rw-r--r--fs/logfs/segment.c63
-rw-r--r--fs/logfs/super.c50
-rw-r--r--fs/mbcache.c590
-rw-r--r--fs/minix/Kconfig2
-rw-r--r--fs/minix/bitmap.c57
-rw-r--r--fs/minix/dir.c46
-rw-r--r--fs/minix/file.c14
-rw-r--r--fs/minix/inode.c100
-rw-r--r--fs/minix/itree_v2.c3
-rw-r--r--fs/minix/minix.h14
-rw-r--r--fs/minix/namei.c37
-rw-r--r--fs/mount.h114
-rw-r--r--fs/mpage.c112
-rw-r--r--fs/namei.c3616
-rw-r--r--fs/namespace.c2241
-rw-r--r--fs/ncpfs/dir.c234
-rw-r--r--fs/ncpfs/file.c39
-rw-r--r--fs/ncpfs/getopt.c23
-rw-r--r--fs/ncpfs/inode.c213
-rw-r--r--fs/ncpfs/ioctl.c48
-rw-r--r--fs/ncpfs/mmap.c11
-rw-r--r--fs/ncpfs/ncp_fs.h30
-rw-r--r--fs/ncpfs/ncp_fs_sb.h18
-rw-r--r--fs/ncpfs/ncplib_kernel.c28
-rw-r--r--fs/ncpfs/ncplib_kernel.h2
-rw-r--r--fs/ncpfs/sock.c53
-rw-r--r--fs/ncpfs/symlink.c4
-rw-r--r--fs/nfs/Kconfig95
-rw-r--r--fs/nfs/Makefile39
-rw-r--r--fs/nfs/blocklayout/Makefile5
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1458
-rw-r--r--fs/nfs/blocklayout/blocklayout.h211
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c384
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c108
-rw-r--r--fs/nfs/blocklayout/extents.c908
-rw-r--r--fs/nfs/cache_lib.c55
-rw-r--r--fs/nfs/cache_lib.h10
-rw-r--r--fs/nfs/callback.c360
-rw-r--r--fs/nfs/callback.h32
-rw-r--r--fs/nfs/callback_proc.c264
-rw-r--r--fs/nfs/callback_xdr.c218
-rw-r--r--fs/nfs/client.c1188
-rw-r--r--fs/nfs/delegation.c394
-rw-r--r--fs/nfs/delegation.h30
-rw-r--r--fs/nfs/dir.c1144
-rw-r--r--fs/nfs/direct.c1196
-rw-r--r--fs/nfs/dns_resolve.c158
-rw-r--r--fs/nfs/dns_resolve.h14
-rw-r--r--fs/nfs/file.c465
-rw-r--r--fs/nfs/filelayout/Makefile5
-rw-r--r--fs/nfs/filelayout/filelayout.c1406
-rw-r--r--fs/nfs/filelayout/filelayout.h (renamed from fs/nfs/nfs4filelayout.h)79
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c843
-rw-r--r--fs/nfs/fscache-index.c4
-rw-r--r--fs/nfs/fscache.c218
-rw-r--r--fs/nfs/fscache.h49
-rw-r--r--fs/nfs/getroot.c144
-rw-r--r--fs/nfs/idmap.c894
-rw-r--r--fs/nfs/inode.c857
-rw-r--r--fs/nfs/internal.h394
-rw-r--r--fs/nfs/mount_clnt.c39
-rw-r--r--fs/nfs/namespace.c111
-rw-r--r--fs/nfs/netns.h36
-rw-r--r--fs/nfs/nfs.h29
-rw-r--r--fs/nfs/nfs2super.c31
-rw-r--r--fs/nfs/nfs2xdr.c72
-rw-r--r--fs/nfs/nfs3acl.c336
-rw-r--r--fs/nfs/nfs3client.c65
-rw-r--r--fs/nfs/nfs3proc.c236
-rw-r--r--fs/nfs/nfs3super.c34
-rw-r--r--fs/nfs/nfs3xdr.c200
-rw-r--r--fs/nfs/nfs4_fs.h361
-rw-r--r--fs/nfs/nfs4client.c1221
-rw-r--r--fs/nfs/nfs4file.c135
-rw-r--r--fs/nfs/nfs4filelayout.c581
-rw-r--r--fs/nfs/nfs4filelayoutdev.c653
-rw-r--r--fs/nfs/nfs4getroot.c50
-rw-r--r--fs/nfs/nfs4namespace.c286
-rw-r--r--fs/nfs/nfs4proc.c5654
-rw-r--r--fs/nfs/nfs4renewd.c17
-rw-r--r--fs/nfs/nfs4session.c565
-rw-r--r--fs/nfs/nfs4session.h153
-rw-r--r--fs/nfs/nfs4state.c1624
-rw-r--r--fs/nfs/nfs4super.c358
-rw-r--r--fs/nfs/nfs4sysctl.c69
-rw-r--r--fs/nfs/nfs4trace.c17
-rw-r--r--fs/nfs/nfs4trace.h1148
-rw-r--r--fs/nfs/nfs4xdr.c2791
-rw-r--r--fs/nfs/nfsroot.c4
-rw-r--r--fs/nfs/nfstrace.c9
-rw-r--r--fs/nfs/nfstrace.h730
-rw-r--r--fs/nfs/objlayout/Kbuild5
-rw-r--r--fs/nfs/objlayout/objio_osd.c706
-rw-r--r--fs/nfs/objlayout/objlayout.c779
-rw-r--r--fs/nfs/objlayout/objlayout.h189
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c415
-rw-r--r--fs/nfs/pagelist.c962
-rw-r--r--fs/nfs/pnfs.c1543
-rw-r--r--fs/nfs/pnfs.h378
-rw-r--r--fs/nfs/pnfs_dev.c302
-rw-r--r--fs/nfs/proc.c181
-rw-r--r--fs/nfs/read.c561
-rw-r--r--fs/nfs/super.c1845
-rw-r--r--fs/nfs/sysctl.c34
-rw-r--r--fs/nfs/unlink.c184
-rw-r--r--fs/nfs/write.c1801
-rw-r--r--fs/nfs_common/nfsacl.c44
-rw-r--r--fs/nfsctl.c100
-rw-r--r--fs/nfsd/Kconfig43
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/acl.h30
-rw-r--r--fs/nfsd/auth.c23
-rw-r--r--fs/nfsd/auth.h6
-rw-r--r--fs/nfsd/cache.h28
-rw-r--r--fs/nfsd/current_stateid.h28
-rw-r--r--fs/nfsd/export.c756
-rw-r--r--fs/nfsd/export.h110
-rw-r--r--fs/nfsd/fault_inject.c164
-rw-r--r--fs/nfsd/idmap.h16
-rw-r--r--fs/nfsd/lockd.c3
-rw-r--r--fs/nfsd/netns.h112
-rw-r--r--fs/nfsd/nfs2acl.c106
-rw-r--r--fs/nfsd/nfs3acl.c70
-rw-r--r--fs/nfsd/nfs3proc.c15
-rw-r--r--fs/nfsd/nfs3xdr.c138
-rw-r--r--fs/nfsd/nfs4acl.c231
-rw-r--r--fs/nfsd/nfs4callback.c167
-rw-r--r--fs/nfsd/nfs4idmap.c242
-rw-r--r--fs/nfsd/nfs4proc.c966
-rw-r--r--fs/nfsd/nfs4recover.c1240
-rw-r--r--fs/nfsd/nfs4state.c4950
-rw-r--r--fs/nfsd/nfs4xdr.c3059
-rw-r--r--fs/nfsd/nfscache.c474
-rw-r--r--fs/nfsd/nfsctl.c723
-rw-r--r--fs/nfsd/nfsd.h137
-rw-r--r--fs/nfsd/nfsfh.c111
-rw-r--r--fs/nfsd/nfsfh.h79
-rw-r--r--fs/nfsd/nfsproc.c21
-rw-r--r--fs/nfsd/nfssvc.c351
-rw-r--r--fs/nfsd/nfsxdr.c49
-rw-r--r--fs/nfsd/state.h317
-rw-r--r--fs/nfsd/stats.c8
-rw-r--r--fs/nfsd/stats.h43
-rw-r--r--fs/nfsd/vfs.c931
-rw-r--r--fs/nfsd/vfs.h85
-rw-r--r--fs/nfsd/xdr.h2
-rw-r--r--fs/nfsd/xdr3.h5
-rw-r--r--fs/nfsd/xdr4.h137
-rw-r--r--fs/nfsd/xdr4cb.h23
-rw-r--r--fs/nilfs2/Kconfig3
-rw-r--r--fs/nilfs2/alloc.c77
-rw-r--r--fs/nilfs2/alloc.h16
-rw-r--r--fs/nilfs2/bmap.c4
-rw-r--r--fs/nilfs2/bmap.h7
-rw-r--r--fs/nilfs2/btnode.c24
-rw-r--r--fs/nilfs2/btnode.h12
-rw-r--r--fs/nilfs2/btree.c77
-rw-r--r--fs/nilfs2/cpfile.c140
-rw-r--r--fs/nilfs2/dat.c60
-rw-r--r--fs/nilfs2/dir.c54
-rw-r--r--fs/nilfs2/export.h8
-rw-r--r--fs/nilfs2/file.c73
-rw-r--r--fs/nilfs2/gcinode.c28
-rw-r--r--fs/nilfs2/ifile.c36
-rw-r--r--fs/nilfs2/ifile.h2
-rw-r--r--fs/nilfs2/inode.c157
-rw-r--r--fs/nilfs2/ioctl.c626
-rw-r--r--fs/nilfs2/mdt.c36
-rw-r--r--fs/nilfs2/mdt.h16
-rw-r--r--fs/nilfs2/namei.c56
-rw-r--r--fs/nilfs2/nilfs.h53
-rw-r--r--fs/nilfs2/page.c166
-rw-r--r--fs/nilfs2/page.h13
-rw-r--r--fs/nilfs2/recovery.c19
-rw-r--r--fs/nilfs2/segbuf.c31
-rw-r--r--fs/nilfs2/segment.c226
-rw-r--r--fs/nilfs2/segment.h2
-rw-r--r--fs/nilfs2/sufile.c635
-rw-r--r--fs/nilfs2/sufile.h6
-rw-r--r--fs/nilfs2/super.c222
-rw-r--r--fs/nilfs2/the_nilfs.c46
-rw-r--r--fs/nilfs2/the_nilfs.h14
-rw-r--r--fs/nls/Kconfig157
-rw-r--r--fs/nls/Makefile11
-rw-r--r--fs/nls/mac-celtic.c601
-rw-r--r--fs/nls/mac-centeuro.c531
-rw-r--r--fs/nls/mac-croatian.c601
-rw-r--r--fs/nls/mac-cyrillic.c496
-rw-r--r--fs/nls/mac-gaelic.c566
-rw-r--r--fs/nls/mac-greek.c496
-rw-r--r--fs/nls/mac-iceland.c601
-rw-r--r--fs/nls/mac-inuit.c531
-rw-r--r--fs/nls/mac-roman.c636
-rw-r--r--fs/nls/mac-romanian.c601
-rw-r--r--fs/nls/mac-turkish.c601
-rw-r--r--fs/nls/nls_ascii.c1
-rw-r--r--fs/nls/nls_base.c78
-rw-r--r--fs/nls/nls_cp1250.c1
-rw-r--r--fs/nls/nls_cp1251.c1
-rw-r--r--fs/nls/nls_cp1255.c1
-rw-r--r--fs/nls/nls_cp437.c1
-rw-r--r--fs/nls/nls_cp737.c1
-rw-r--r--fs/nls/nls_cp775.c1
-rw-r--r--fs/nls/nls_cp850.c1
-rw-r--r--fs/nls/nls_cp852.c1
-rw-r--r--fs/nls/nls_cp855.c1
-rw-r--r--fs/nls/nls_cp857.c1
-rw-r--r--fs/nls/nls_cp860.c1
-rw-r--r--fs/nls/nls_cp861.c1
-rw-r--r--fs/nls/nls_cp862.c1
-rw-r--r--fs/nls/nls_cp863.c1
-rw-r--r--fs/nls/nls_cp864.c1
-rw-r--r--fs/nls/nls_cp865.c1
-rw-r--r--fs/nls/nls_cp866.c1
-rw-r--r--fs/nls/nls_cp869.c1
-rw-r--r--fs/nls/nls_cp874.c1
-rw-r--r--fs/nls/nls_cp932.c1
-rw-r--r--fs/nls/nls_cp936.c1
-rw-r--r--fs/nls/nls_cp949.c1
-rw-r--r--fs/nls/nls_cp950.c1
-rw-r--r--fs/nls/nls_euc-jp.c1
-rw-r--r--fs/nls/nls_iso8859-1.c1
-rw-r--r--fs/nls/nls_iso8859-13.c1
-rw-r--r--fs/nls/nls_iso8859-14.c1
-rw-r--r--fs/nls/nls_iso8859-15.c1
-rw-r--r--fs/nls/nls_iso8859-2.c1
-rw-r--r--fs/nls/nls_iso8859-3.c1
-rw-r--r--fs/nls/nls_iso8859-4.c1
-rw-r--r--fs/nls/nls_iso8859-5.c1
-rw-r--r--fs/nls/nls_iso8859-6.c1
-rw-r--r--fs/nls/nls_iso8859-7.c1
-rw-r--r--fs/nls/nls_iso8859-9.c1
-rw-r--r--fs/nls/nls_koi8-r.c1
-rw-r--r--fs/nls/nls_koi8-ru.c1
-rw-r--r--fs/nls/nls_koi8-u.c1
-rw-r--r--fs/nls/nls_utf8.c1
-rw-r--r--fs/notify/Makefile2
-rw-r--r--fs/notify/dnotify/dnotify.c63
-rw-r--r--fs/notify/fanotify/Kconfig2
-rw-r--r--fs/notify/fanotify/fanotify.c258
-rw-r--r--fs/notify/fanotify/fanotify.h50
-rw-r--r--fs/notify/fanotify/fanotify_user.c506
-rw-r--r--fs/notify/fdinfo.c179
-rw-r--r--fs/notify/fdinfo.h27
-rw-r--r--fs/notify/fsnotify.c57
-rw-r--r--fs/notify/group.c56
-rw-r--r--fs/notify/inode_mark.c41
-rw-r--r--fs/notify/inotify/inotify.h21
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c165
-rw-r--r--fs/notify/inotify/inotify_user.c268
-rw-r--r--fs/notify/mark.c146
-rw-r--r--fs/notify/notification.c364
-rw-r--r--fs/notify/vfsmount_mark.c47
-rw-r--r--fs/ntfs/aops.c26
-rw-r--r--fs/ntfs/attrib.c31
-rw-r--r--fs/ntfs/compress.c7
-rw-r--r--fs/ntfs/debug.c58
-rw-r--r--fs/ntfs/debug.h22
-rw-r--r--fs/ntfs/dir.c94
-rw-r--r--fs/ntfs/file.c63
-rw-r--r--fs/ntfs/inode.c57
-rw-r--r--fs/ntfs/inode.h8
-rw-r--r--fs/ntfs/layout.h16
-rw-r--r--fs/ntfs/logfile.c2
-rw-r--r--fs/ntfs/logfile.h2
-rw-r--r--fs/ntfs/mft.c14
-rw-r--r--fs/ntfs/namei.c2
-rw-r--r--fs/ntfs/runlist.c2
-rw-r--r--fs/ntfs/super.c140
-rw-r--r--fs/ntfs/sysctl.c6
-rw-r--r--fs/ntfs/volume.h9
-rw-r--r--fs/ocfs2/Makefile2
-rw-r--r--fs/ocfs2/acl.c310
-rw-r--r--fs/ocfs2/acl.h15
-rw-r--r--fs/ocfs2/alloc.c442
-rw-r--r--fs/ocfs2/alloc.h1
-rw-r--r--fs/ocfs2/aops.c224
-rw-r--r--fs/ocfs2/aops.h13
-rw-r--r--fs/ocfs2/blockcheck.c42
-rw-r--r--fs/ocfs2/buffer_head_io.c55
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c391
-rw-r--r--fs/ocfs2/cluster/masklog.c20
-rw-r--r--fs/ocfs2/cluster/masklog.h108
-rw-r--r--fs/ocfs2/cluster/netdebug.c102
-rw-r--r--fs/ocfs2/cluster/nodemanager.c4
-rw-r--r--fs/ocfs2/cluster/quorum.c8
-rw-r--r--fs/ocfs2/cluster/sys.c11
-rw-r--r--fs/ocfs2/cluster/tcp.c389
-rw-r--r--fs/ocfs2/cluster/tcp.h2
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h2
-rw-r--r--fs/ocfs2/cluster/ver.c42
-rw-r--r--fs/ocfs2/dcache.c132
-rw-r--r--fs/ocfs2/dcache.h12
-rw-r--r--fs/ocfs2/dir.c302
-rw-r--r--fs/ocfs2/dir.h5
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmast.c10
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h83
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c24
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c24
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c256
-rw-r--r--fs/ocfs2/dlm/dlmlock.c76
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c538
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c256
-rw-r--r--fs/ocfs2/dlm/dlmthread.c48
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c26
-rw-r--r--fs/ocfs2/dlm/dlmver.c42
-rw-r--r--fs/ocfs2/dlm/dlmver.h31
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c63
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.c42
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.h31
-rw-r--r--fs/ocfs2/dlmglue.c368
-rw-r--r--fs/ocfs2/dlmglue.h3
-rw-r--r--fs/ocfs2/export.c70
-rw-r--r--fs/ocfs2/extent_map.c110
-rw-r--r--fs/ocfs2/extent_map.h2
-rw-r--r--fs/ocfs2/file.c675
-rw-r--r--fs/ocfs2/file.h2
-rw-r--r--fs/ocfs2/heartbeat.c4
-rw-r--r--fs/ocfs2/inode.c238
-rw-r--r--fs/ocfs2/inode.h20
-rw-r--r--fs/ocfs2/ioctl.c618
-rw-r--r--fs/ocfs2/journal.c289
-rw-r--r--fs/ocfs2/journal.h33
-rw-r--r--fs/ocfs2/localalloc.c171
-rw-r--r--fs/ocfs2/localalloc.h6
-rw-r--r--fs/ocfs2/locks.c3
-rw-r--r--fs/ocfs2/mmap.c74
-rw-r--r--fs/ocfs2/move_extents.c1078
-rw-r--r--fs/ocfs2/move_extents.h22
-rw-r--r--fs/ocfs2/namei.c484
-rw-r--r--fs/ocfs2/ocfs2.h110
-rw-r--r--fs/ocfs2/ocfs2_fs.h6
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h68
-rw-r--r--fs/ocfs2/ocfs2_trace.h2768
-rw-r--r--fs/ocfs2/quota.h2
-rw-r--r--fs/ocfs2/quota_global.c144
-rw-r--r--fs/ocfs2/quota_local.c68
-rw-r--r--fs/ocfs2/refcounttree.c380
-rw-r--r--fs/ocfs2/refcounttree.h13
-rw-r--r--fs/ocfs2/reservations.c57
-rw-r--r--fs/ocfs2/reservations.h2
-rw-r--r--fs/ocfs2/resize.c45
-rw-r--r--fs/ocfs2/slot_map.c20
-rw-r--r--fs/ocfs2/stack_o2cb.c76
-rw-r--r--fs/ocfs2/stack_user.c308
-rw-r--r--fs/ocfs2/stackglue.c50
-rw-r--r--fs/ocfs2/stackglue.h17
-rw-r--r--fs/ocfs2/suballoc.c280
-rw-r--r--fs/ocfs2/suballoc.h18
-rw-r--r--fs/ocfs2/super.c262
-rw-r--r--fs/ocfs2/super.h14
-rw-r--r--fs/ocfs2/symlink.c121
-rw-r--r--fs/ocfs2/symlink.h2
-rw-r--r--fs/ocfs2/sysfile.c7
-rw-r--r--fs/ocfs2/uptodate.c75
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/ocfs2/xattr.c324
-rw-r--r--fs/ocfs2/xattr.h10
-rw-r--r--fs/omfs/dir.c110
-rw-r--r--fs/omfs/file.c37
-rw-r--r--fs/omfs/inode.c21
-rw-r--r--fs/omfs/omfs.h6
-rw-r--r--fs/open.c847
-rw-r--r--fs/openpromfs/inode.c114
-rw-r--r--fs/partitions/Kconfig251
-rw-r--r--fs/partitions/Makefile20
-rw-r--r--fs/partitions/acorn.c556
-rw-r--r--fs/partitions/acorn.h14
-rw-r--r--fs/partitions/amiga.c139
-rw-r--r--fs/partitions/amiga.h6
-rw-r--r--fs/partitions/atari.c149
-rw-r--r--fs/partitions/atari.h34
-rw-r--r--fs/partitions/check.c686
-rw-r--r--fs/partitions/check.h49
-rw-r--r--fs/partitions/efi.c660
-rw-r--r--fs/partitions/efi.h134
-rw-r--r--fs/partitions/ibm.c275
-rw-r--r--fs/partitions/ibm.h1
-rw-r--r--fs/partitions/karma.c57
-rw-r--r--fs/partitions/karma.h8
-rw-r--r--fs/partitions/ldm.c1555
-rw-r--r--fs/partitions/ldm.h215
-rw-r--r--fs/partitions/mac.c134
-rw-r--r--fs/partitions/mac.h44
-rw-r--r--fs/partitions/msdos.c552
-rw-r--r--fs/partitions/msdos.h8
-rw-r--r--fs/partitions/osf.c86
-rw-r--r--fs/partitions/osf.h7
-rw-r--r--fs/partitions/sgi.c82
-rw-r--r--fs/partitions/sgi.h8
-rw-r--r--fs/partitions/sun.c122
-rw-r--r--fs/partitions/sun.h8
-rw-r--r--fs/partitions/sysv68.c95
-rw-r--r--fs/partitions/sysv68.h1
-rw-r--r--fs/partitions/ultrix.c48
-rw-r--r--fs/partitions/ultrix.h5
-rw-r--r--fs/pipe.c849
-rw-r--r--fs/pnode.c303
-rw-r--r--fs/pnode.h49
-rw-r--r--fs/posix_acl.c595
-rw-r--r--fs/proc/Kconfig4
-rw-r--r--fs/proc/Makefile8
-rw-r--r--fs/proc/array.c334
-rw-r--r--fs/proc/base.c2167
-rw-r--r--fs/proc/cmdline.c2
-rw-r--r--fs/proc/consoles.c12
-rw-r--r--fs/proc/cpuinfo.c2
-rw-r--r--fs/proc/devices.c2
-rw-r--r--fs/proc/fd.c351
-rw-r--r--fs/proc/fd.h19
-rw-r--r--fs/proc/generic.c648
-rw-r--r--fs/proc/inode.c389
-rw-r--r--fs/proc/internal.h318
-rw-r--r--fs/proc/interrupts.c2
-rw-r--r--fs/proc/kcore.c31
-rw-r--r--fs/proc/kmsg.c12
-rw-r--r--fs/proc/loadavg.c2
-rw-r--r--fs/proc/meminfo.c62
-rw-r--r--fs/proc/mmu.c60
-rw-r--r--fs/proc/namespaces.c297
-rw-r--r--fs/proc/nommu.c16
-rw-r--r--fs/proc/page.c11
-rw-r--r--fs/proc/proc_devtree.c241
-rw-r--r--fs/proc/proc_net.c33
-rw-r--r--fs/proc/proc_sysctl.c1399
-rw-r--r--fs/proc/root.c147
-rw-r--r--fs/proc/self.c84
-rw-r--r--fs/proc/softirqs.c2
-rw-r--r--fs/proc/stat.c196
-rw-r--r--fs/proc/task_mmu.c964
-rw-r--r--fs/proc/task_nommu.c92
-rw-r--r--fs/proc/uptime.c18
-rw-r--r--fs/proc/version.c2
-rw-r--r--fs/proc/vmcore.c937
-rw-r--r--fs/proc_namespace.c328
-rw-r--r--fs/pstore/Kconfig41
-rw-r--r--fs/pstore/Makefile4
-rw-r--r--fs/pstore/ftrace.c131
-rw-r--r--fs/pstore/inode.c259
-rw-r--r--fs/pstore/internal.h58
-rw-r--r--fs/pstore/platform.c494
-rw-r--r--fs/pstore/ram.c602
-rw-r--r--fs/pstore/ram_core.c526
-rw-r--r--fs/qnx4/bitmap.c24
-rw-r--r--fs/qnx4/dir.c66
-rw-r--r--fs/qnx4/inode.c213
-rw-r--r--fs/qnx4/namei.c15
-rw-r--r--fs/qnx4/qnx4.h6
-rw-r--r--fs/qnx6/Kconfig26
-rw-r--r--fs/qnx6/Makefile7
-rw-r--r--fs/qnx6/README8
-rw-r--r--fs/qnx6/dir.c288
-rw-r--r--fs/qnx6/inode.c704
-rw-r--r--fs/qnx6/namei.c42
-rw-r--r--fs/qnx6/qnx6.h135
-rw-r--r--fs/qnx6/super_mmi.c150
-rw-r--r--fs/quota/Kconfig7
-rw-r--r--fs/quota/Makefile2
-rw-r--r--fs/quota/dquot.c482
-rw-r--r--fs/quota/kqid.c132
-rw-r--r--fs/quota/netlink.c26
-rw-r--r--fs/quota/quota.c123
-rw-r--r--fs/quota/quota_tree.c22
-rw-r--r--fs/quota/quota_v1.c12
-rw-r--r--fs/quota/quota_v2.c26
-rw-r--r--fs/ramfs/file-mmu.c17
-rw-r--r--fs/ramfs/file-nommu.c31
-rw-r--r--fs/ramfs/inode.c82
-rw-r--r--fs/ramfs/internal.h1
-rw-r--r--fs/read_write.c874
-rw-r--r--fs/read_write.h14
-rw-r--r--fs/readdir.c103
-rw-r--r--fs/reiserfs/acl.h76
-rw-r--r--fs/reiserfs/bitmap.c402
-rw-r--r--fs/reiserfs/dir.c222
-rw-r--r--fs/reiserfs/do_balan.c2956
-rw-r--r--fs/reiserfs/file.c179
-rw-r--r--fs/reiserfs/fix_node.c1036
-rw-r--r--fs/reiserfs/hashes.c17
-rw-r--r--fs/reiserfs/ibalance.c273
-rw-r--r--fs/reiserfs/inode.c1453
-rw-r--r--fs/reiserfs/ioctl.c46
-rw-r--r--fs/reiserfs/item_ops.c110
-rw-r--r--fs/reiserfs/journal.c1588
-rw-r--r--fs/reiserfs/lbalance.c505
-rw-r--r--fs/reiserfs/lock.c47
-rw-r--r--fs/reiserfs/namei.c585
-rw-r--r--fs/reiserfs/objectid.c104
-rw-r--r--fs/reiserfs/prints.c183
-rw-r--r--fs/reiserfs/procfs.c132
-rw-r--r--fs/reiserfs/reiserfs.h3404
-rw-r--r--fs/reiserfs/resize.c99
-rw-r--r--fs/reiserfs/stree.c968
-rw-r--r--fs/reiserfs/super.c950
-rw-r--r--fs/reiserfs/tail_conversion.c167
-rw-r--r--fs/reiserfs/xattr.c226
-rw-r--r--fs/reiserfs/xattr.h123
-rw-r--r--fs/reiserfs/xattr_acl.c290
-rw-r--r--fs/reiserfs/xattr_security.c8
-rw-r--r--fs/reiserfs/xattr_trusted.c4
-rw-r--r--fs/reiserfs/xattr_user.c4
-rw-r--r--fs/romfs/mmap-nommu.c43
-rw-r--r--fs/romfs/storage.c2
-rw-r--r--fs/romfs/super.c43
-rw-r--r--fs/select.c155
-rw-r--r--fs/seq_file.c318
-rw-r--r--fs/signalfd.c76
-rw-r--r--fs/splice.c600
-rw-r--r--fs/squashfs/Kconfig130
-rw-r--r--fs/squashfs/Makefile8
-rw-r--r--fs/squashfs/block.c46
-rw-r--r--fs/squashfs/cache.c95
-rw-r--r--fs/squashfs/decompressor.c75
-rw-r--r--fs/squashfs/decompressor.h35
-rw-r--r--fs/squashfs/decompressor_multi.c198
-rw-r--r--fs/squashfs/decompressor_multi_percpu.c97
-rw-r--r--fs/squashfs/decompressor_single.c85
-rw-r--r--fs/squashfs/dir.c69
-rw-r--r--fs/squashfs/export.c42
-rw-r--r--fs/squashfs/file.c152
-rw-r--r--fs/squashfs/file_cache.c38
-rw-r--r--fs/squashfs/file_direct.c176
-rw-r--r--fs/squashfs/fragment.c37
-rw-r--r--fs/squashfs/id.c42
-rw-r--r--fs/squashfs/inode.c32
-rw-r--r--fs/squashfs/lzo_wrapper.c49
-rw-r--r--fs/squashfs/namei.c33
-rw-r--r--fs/squashfs/page_actor.c100
-rw-r--r--fs/squashfs/page_actor.h81
-rw-r--r--fs/squashfs/squashfs.h34
-rw-r--r--fs/squashfs/squashfs_fs.h35
-rw-r--r--fs/squashfs/squashfs_fs_i.h2
-rw-r--r--fs/squashfs/squashfs_fs_sb.h7
-rw-r--r--fs/squashfs/super.c153
-rw-r--r--fs/squashfs/symlink.c6
-rw-r--r--fs/squashfs/xattr.c2
-rw-r--r--fs/squashfs/xattr.h3
-rw-r--r--fs/squashfs/xattr_id.c47
-rw-r--r--fs/squashfs/xz_wrapper.c130
-rw-r--r--fs/squashfs/zlib_wrapper.c74
-rw-r--r--fs/stack.c9
-rw-r--r--fs/stat.c145
-rw-r--r--fs/statfs.c40
-rw-r--r--fs/super.c721
-rw-r--r--fs/sync.c172
-rw-r--r--fs/sysfs/Kconfig1
-rw-r--r--fs/sysfs/Makefile3
-rw-r--r--fs/sysfs/bin.c506
-rw-r--r--fs/sysfs/dir.c961
-rw-r--r--fs/sysfs/file.c851
-rw-r--r--fs/sysfs/group.c276
-rw-r--r--fs/sysfs/inode.c366
-rw-r--r--fs/sysfs/mount.c204
-rw-r--r--fs/sysfs/symlink.c276
-rw-r--r--fs/sysfs/sysfs.h218
-rw-r--r--fs/sysv/balloc.c18
-rw-r--r--fs/sysv/dir.c37
-rw-r--r--fs/sysv/file.c13
-rw-r--r--fs/sysv/ialloc.c16
-rw-r--r--fs/sysv/inode.c43
-rw-r--r--fs/sysv/itree.c20
-rw-r--r--fs/sysv/namei.c23
-rw-r--r--fs/sysv/super.c33
-rw-r--r--fs/sysv/sysv.h5
-rw-r--r--fs/timerfd.c359
-rw-r--r--fs/ubifs/Kconfig32
-rw-r--r--fs/ubifs/Makefile5
-rw-r--r--fs/ubifs/budget.c116
-rw-r--r--fs/ubifs/commit.c34
-rw-r--r--fs/ubifs/compress.c7
-rw-r--r--fs/ubifs/debug.c1791
-rw-r--r--fs/ubifs/debug.h497
-rw-r--r--fs/ubifs/dir.c183
-rw-r--r--fs/ubifs/file.c127
-rw-r--r--fs/ubifs/find.c26
-rw-r--r--fs/ubifs/gc.c82
-rw-r--r--fs/ubifs/io.c241
-rw-r--r--fs/ubifs/ioctl.c6
-rw-r--r--fs/ubifs/journal.c55
-rw-r--r--fs/ubifs/log.c105
-rw-r--r--fs/ubifs/lprops.c211
-rw-r--r--fs/ubifs/lpt.c112
-rw-r--r--fs/ubifs/lpt_commit.c227
-rw-r--r--fs/ubifs/master.c16
-rw-r--r--fs/ubifs/misc.h120
-rw-r--r--fs/ubifs/orphan.c76
-rw-r--r--fs/ubifs/recovery.c517
-rw-r--r--fs/ubifs/replay.c514
-rw-r--r--fs/ubifs/sb.c217
-rw-r--r--fs/ubifs/scan.c31
-rw-r--r--fs/ubifs/shrinker.c25
-rw-r--r--fs/ubifs/super.c493
-rw-r--r--fs/ubifs/tnc.c154
-rw-r--r--fs/ubifs/tnc_commit.c187
-rw-r--r--fs/ubifs/tnc_misc.c46
-rw-r--r--fs/ubifs/ubifs-media.h30
-rw-r--r--fs/ubifs/ubifs.h186
-rw-r--r--fs/ubifs/xattr.c48
-rw-r--r--fs/udf/balloc.c98
-rw-r--r--fs/udf/dir.c63
-rw-r--r--fs/udf/directory.c8
-rw-r--r--fs/udf/file.c94
-rw-r--r--fs/udf/ialloc.c19
-rw-r--r--fs/udf/inode.c328
-rw-r--r--fs/udf/lowlevel.c2
-rw-r--r--fs/udf/misc.c19
-rw-r--r--fs/udf/namei.c90
-rw-r--r--fs/udf/partition.c19
-rw-r--r--fs/udf/super.c1005
-rw-r--r--fs/udf/symlink.c14
-rw-r--r--fs/udf/truncate.c26
-rw-r--r--fs/udf/udf_i.h17
-rw-r--r--fs/udf/udf_sb.h24
-rw-r--r--fs/udf/udfdecl.h43
-rw-r--r--fs/udf/udftime.c3
-rw-r--r--fs/udf/unicode.c6
-rw-r--r--fs/ufs/Kconfig2
-rw-r--r--fs/ufs/balloc.c65
-rw-r--r--fs/ufs/dir.c28
-rw-r--r--fs/ufs/file.c8
-rw-r--r--fs/ufs/ialloc.c29
-rw-r--r--fs/ufs/inode.c49
-rw-r--r--fs/ufs/namei.c34
-rw-r--r--fs/ufs/super.c210
-rw-r--r--fs/ufs/truncate.c3
-rw-r--r--fs/ufs/ufs.h16
-rw-r--r--fs/ufs/ufs_fs.h1
-rw-r--r--fs/ufs/util.c3
-rw-r--r--fs/utimes.c25
-rw-r--r--fs/xattr.c463
-rw-r--r--fs/xattr_acl.c98
-rw-r--r--fs/xfs/Kconfig18
-rw-r--r--fs/xfs/Makefile129
-rw-r--r--fs/xfs/kmem.c (renamed from fs/xfs/linux-2.6/kmem.c)46
-rw-r--r--fs/xfs/kmem.h (renamed from fs/xfs/linux-2.6/kmem.h)50
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c2060
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h383
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c1076
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c96
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c770
-rw-r--r--fs/xfs/linux-2.6/xfs_message.h38
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c1044
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h58
-rw-r--r--fs/xfs/mrlock.h (renamed from fs/xfs/linux-2.6/mrlock.h)12
-rw-r--r--fs/xfs/quota/xfs_dquot.c1496
-rw-r--r--fs/xfs/quota/xfs_qm.c2468
-rw-r--r--fs/xfs/quota/xfs_qm.h177
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c105
-rw-r--r--fs/xfs/quota/xfs_qm_stats.h53
-rw-r--r--fs/xfs/quota/xfs_quota_priv.h53
-rw-r--r--fs/xfs/time.h (renamed from fs/xfs/linux-2.6/time.h)0
-rw-r--r--fs/xfs/uuid.c (renamed from fs/xfs/support/uuid.c)0
-rw-r--r--fs/xfs/uuid.h (renamed from fs/xfs/support/uuid.h)6
-rw-r--r--fs/xfs/xfs.h9
-rw-r--r--fs/xfs/xfs_acl.c (renamed from fs/xfs/linux-2.6/xfs_acl.c)270
-rw-r--r--fs/xfs/xfs_acl.h47
-rw-r--r--fs/xfs/xfs_ag.h166
-rw-r--r--fs/xfs/xfs_alloc.c874
-rw-r--r--fs/xfs/xfs_alloc.h23
-rw-r--r--fs/xfs/xfs_alloc_btree.c244
-rw-r--r--fs/xfs/xfs_alloc_btree.h51
-rw-r--r--fs/xfs/xfs_aops.c (renamed from fs/xfs/linux-2.6/xfs_aops.c)906
-rw-r--r--fs/xfs/xfs_aops.h (renamed from fs/xfs/linux-2.6/xfs_aops.h)23
-rw-r--r--fs/xfs/xfs_arch.h136
-rw-r--r--fs/xfs/xfs_attr.c1318
-rw-r--r--fs/xfs/xfs_attr.h10
-rw-r--r--fs/xfs/xfs_attr_inactive.c452
-rw-r--r--fs/xfs/xfs_attr_leaf.c2626
-rw-r--r--fs/xfs/xfs_attr_leaf.h213
-rw-r--r--fs/xfs/xfs_attr_list.c653
-rw-r--r--fs/xfs/xfs_attr_remote.c628
-rw-r--r--fs/xfs/xfs_attr_remote.h (renamed from fs/xfs/xfs_utils.h)18
-rw-r--r--fs/xfs/xfs_bit.c4
-rw-r--r--fs/xfs/xfs_bit.h7
-rw-r--r--fs/xfs/xfs_bmap.c6826
-rw-r--r--fs/xfs/xfs_bmap.h332
-rw-r--r--fs/xfs/xfs_bmap_btree.c294
-rw-r--r--fs/xfs/xfs_bmap_btree.h127
-rw-r--r--fs/xfs/xfs_bmap_util.c1897
-rw-r--r--fs/xfs/xfs_bmap_util.h112
-rw-r--r--fs/xfs/xfs_btree.c714
-rw-r--r--fs/xfs/xfs_btree.h167
-rw-r--r--fs/xfs/xfs_btree_trace.c249
-rw-r--r--fs/xfs/xfs_btree_trace.h99
-rw-r--r--fs/xfs/xfs_buf.c1890
-rw-r--r--fs/xfs/xfs_buf.h393
-rw-r--r--fs/xfs/xfs_buf_item.c905
-rw-r--r--fs/xfs/xfs_buf_item.h72
-rw-r--r--fs/xfs/xfs_cksum.h63
-rw-r--r--fs/xfs/xfs_da_btree.c2560
-rw-r--r--fs/xfs/xfs_da_btree.h144
-rw-r--r--fs/xfs/xfs_da_format.c911
-rw-r--r--fs/xfs/xfs_da_format.h861
-rw-r--r--fs/xfs/xfs_dfrag.c461
-rw-r--r--fs/xfs/xfs_dfrag.h53
-rw-r--r--fs/xfs/xfs_dinode.h51
-rw-r--r--fs/xfs/xfs_dir2.c668
-rw-r--r--fs/xfs/xfs_dir2.h162
-rw-r--r--fs/xfs/xfs_dir2_block.c1020
-rw-r--r--fs/xfs/xfs_dir2_block.h92
-rw-r--r--fs/xfs/xfs_dir2_data.c709
-rw-r--r--fs/xfs/xfs_dir2_data.h184
-rw-r--r--fs/xfs/xfs_dir2_leaf.c1786
-rw-r--r--fs/xfs/xfs_dir2_leaf.h253
-rw-r--r--fs/xfs/xfs_dir2_node.c1572
-rw-r--r--fs/xfs/xfs_dir2_node.h100
-rw-r--r--fs/xfs/xfs_dir2_priv.h274
-rw-r--r--fs/xfs/xfs_dir2_readdir.c700
-rw-r--r--fs/xfs/xfs_dir2_sf.c517
-rw-r--r--fs/xfs/xfs_dir2_sf.h171
-rw-r--r--fs/xfs/xfs_discard.c (renamed from fs/xfs/linux-2.6/xfs_discard.c)119
-rw-r--r--fs/xfs/xfs_discard.h (renamed from fs/xfs/linux-2.6/xfs_discard.h)2
-rw-r--r--fs/xfs/xfs_dquot.c1105
-rw-r--r--fs/xfs/xfs_dquot.h (renamed from fs/xfs/quota/xfs_dquot.h)112
-rw-r--r--fs/xfs/xfs_dquot_buf.c290
-rw-r--r--fs/xfs/xfs_dquot_item.c (renamed from fs/xfs/quota/xfs_dquot_item.c)272
-rw-r--r--fs/xfs/xfs_dquot_item.h (renamed from fs/xfs/quota/xfs_dquot_item.h)3
-rw-r--r--fs/xfs/xfs_error.c42
-rw-r--r--fs/xfs/xfs_error.h1
-rw-r--r--fs/xfs/xfs_export.c (renamed from fs/xfs/linux-2.6/xfs_export.c)65
-rw-r--r--fs/xfs/xfs_export.h (renamed from fs/xfs/linux-2.6/xfs_export.h)0
-rw-r--r--fs/xfs/xfs_extent_busy.c605
-rw-r--r--fs/xfs/xfs_extent_busy.h73
-rw-r--r--fs/xfs/xfs_extfree_item.c162
-rw-r--r--fs/xfs/xfs_extfree_item.h102
-rw-r--r--fs/xfs/xfs_file.c1433
-rw-r--r--fs/xfs/xfs_filestream.c706
-rw-r--r--fs/xfs/xfs_filestream.h38
-rw-r--r--fs/xfs/xfs_format.h428
-rw-r--r--fs/xfs/xfs_fs.h78
-rw-r--r--fs/xfs/xfs_fsops.c412
-rw-r--r--fs/xfs/xfs_globals.c (renamed from fs/xfs/linux-2.6/xfs_globals.c)4
-rw-r--r--fs/xfs/xfs_ialloc.c1438
-rw-r--r--fs/xfs/xfs_ialloc.h49
-rw-r--r--fs/xfs/xfs_ialloc_btree.c254
-rw-r--r--fs/xfs/xfs_ialloc_btree.h57
-rw-r--r--fs/xfs/xfs_icache.c1327
-rw-r--r--fs/xfs/xfs_icache.h104
-rw-r--r--fs/xfs/xfs_icreate_item.c189
-rw-r--r--fs/xfs/xfs_icreate_item.h34
-rw-r--r--fs/xfs/xfs_iget.c716
-rw-r--r--fs/xfs/xfs_inode.c4944
-rw-r--r--fs/xfs/xfs_inode.h473
-rw-r--r--fs/xfs/xfs_inode_buf.c479
-rw-r--r--fs/xfs/xfs_inode_buf.h50
-rw-r--r--fs/xfs/xfs_inode_fork.c1906
-rw-r--r--fs/xfs/xfs_inode_fork.h171
-rw-r--r--fs/xfs/xfs_inode_item.c862
-rw-r--r--fs/xfs/xfs_inode_item.h122
-rw-r--r--fs/xfs/xfs_inum.h16
-rw-r--r--fs/xfs/xfs_ioctl.c (renamed from fs/xfs/linux-2.6/xfs_ioctl.c)476
-rw-r--r--fs/xfs/xfs_ioctl.h (renamed from fs/xfs/linux-2.6/xfs_ioctl.h)10
-rw-r--r--fs/xfs/xfs_ioctl32.c (renamed from fs/xfs/linux-2.6/xfs_ioctl32.c)56
-rw-r--r--fs/xfs/xfs_ioctl32.h (renamed from fs/xfs/linux-2.6/xfs_ioctl32.h)1
-rw-r--r--fs/xfs/xfs_iomap.c485
-rw-r--r--fs/xfs/xfs_iomap.h8
-rw-r--r--fs/xfs/xfs_iops.c1299
-rw-r--r--fs/xfs/xfs_iops.h (renamed from fs/xfs/linux-2.6/xfs_iops.h)9
-rw-r--r--fs/xfs/xfs_itable.c114
-rw-r--r--fs/xfs/xfs_linux.h (renamed from fs/xfs/linux-2.6/xfs_linux.h)124
-rw-r--r--fs/xfs/xfs_log.c1798
-rw-r--r--fs/xfs/xfs_log.h175
-rw-r--r--fs/xfs/xfs_log_cil.c736
-rw-r--r--fs/xfs/xfs_log_format.h679
-rw-r--r--fs/xfs/xfs_log_priv.h248
-rw-r--r--fs/xfs/xfs_log_recover.c1805
-rw-r--r--fs/xfs/xfs_log_rlimit.c150
-rw-r--r--fs/xfs/xfs_message.c (renamed from fs/xfs/linux-2.6/xfs_message.c)59
-rw-r--r--fs/xfs/xfs_message.h64
-rw-r--r--fs/xfs/xfs_mount.c890
-rw-r--r--fs/xfs/xfs_mount.h165
-rw-r--r--fs/xfs/xfs_mru_cache.c151
-rw-r--r--fs/xfs/xfs_mru_cache.h31
-rw-r--r--fs/xfs/xfs_qm.c1966
-rw-r--r--fs/xfs/xfs_qm.h180
-rw-r--r--fs/xfs/xfs_qm_bhv.c (renamed from fs/xfs/quota/xfs_qm_bhv.c)71
-rw-r--r--fs/xfs/xfs_qm_syscalls.c (renamed from fs/xfs/quota/xfs_qm_syscalls.c)806
-rw-r--r--fs/xfs/xfs_quota.h287
-rw-r--r--fs/xfs/xfs_quota_defs.h161
-rw-r--r--fs/xfs/xfs_quotaops.c (renamed from fs/xfs/linux-2.6/xfs_quotaops.c)70
-rw-r--r--fs/xfs/xfs_rename.c358
-rw-r--r--fs/xfs/xfs_rtalloc.c1601
-rw-r--r--fs/xfs/xfs_rtalloc.h77
-rw-r--r--fs/xfs/xfs_rtbitmap.c973
-rw-r--r--fs/xfs/xfs_rw.c175
-rw-r--r--fs/xfs/xfs_rw.h49
-rw-r--r--fs/xfs/xfs_sb.c836
-rw-r--r--fs/xfs/xfs_sb.h390
-rw-r--r--fs/xfs/xfs_shared.h246
-rw-r--r--fs/xfs/xfs_stats.c (renamed from fs/xfs/linux-2.6/xfs_stats.c)100
-rw-r--r--fs/xfs/xfs_stats.h (renamed from fs/xfs/linux-2.6/xfs_stats.h)26
-rw-r--r--fs/xfs/xfs_super.c (renamed from fs/xfs/linux-2.6/xfs_super.c)900
-rw-r--r--fs/xfs/xfs_super.h (renamed from fs/xfs/linux-2.6/xfs_super.h)11
-rw-r--r--fs/xfs/xfs_symlink.c599
-rw-r--r--fs/xfs/xfs_symlink.h27
-rw-r--r--fs/xfs/xfs_symlink_remote.c201
-rw-r--r--fs/xfs/xfs_sysctl.c (renamed from fs/xfs/linux-2.6/xfs_sysctl.c)35
-rw-r--r--fs/xfs/xfs_sysctl.h (renamed from fs/xfs/linux-2.6/xfs_sysctl.h)1
-rw-r--r--fs/xfs/xfs_trace.c (renamed from fs/xfs/linux-2.6/xfs_trace.c)26
-rw-r--r--fs/xfs/xfs_trace.h (renamed from fs/xfs/linux-2.6/xfs_trace.h)617
-rw-r--r--fs/xfs/xfs_trans.c1177
-rw-r--r--fs/xfs/xfs_trans.h401
-rw-r--r--fs/xfs/xfs_trans_ail.c855
-rw-r--r--fs/xfs/xfs_trans_buf.c517
-rw-r--r--fs/xfs/xfs_trans_dquot.c (renamed from fs/xfs/quota/xfs_trans_dquot.c)202
-rw-r--r--fs/xfs/xfs_trans_extfree.c8
-rw-r--r--fs/xfs/xfs_trans_inode.c111
-rw-r--r--fs/xfs/xfs_trans_priv.h62
-rw-r--r--fs/xfs/xfs_trans_resv.c894
-rw-r--r--fs/xfs/xfs_trans_resv.h117
-rw-r--r--fs/xfs/xfs_trans_space.h12
-rw-r--r--fs/xfs/xfs_types.h84
-rw-r--r--fs/xfs/xfs_utils.c321
-rw-r--r--fs/xfs/xfs_vnode.h (renamed from fs/xfs/linux-2.6/xfs_vnode.h)18
-rw-r--r--fs/xfs/xfs_vnodeops.c2841
-rw-r--r--fs/xfs/xfs_vnodeops.h62
-rw-r--r--fs/xfs/xfs_xattr.c (renamed from fs/xfs/linux-2.6/xfs_xattr.c)14
1608 files changed, 335236 insertions, 176235 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 814ac4e213a..6489e1fc1af 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -1,6 +1,6 @@
config 9P_FS
- tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
- depends on INET && NET_9P && EXPERIMENTAL
+ tristate "Plan 9 Resource Sharing Support (9P2000)"
+ depends on INET && NET_9P
help
If you say Y here, you will get experimental support for
Plan 9 resource sharing via the 9P2000 protocol.
@@ -10,10 +10,8 @@ config 9P_FS
If unsure, say N.
if 9P_FS
-
config 9P_FSCACHE
- bool "Enable 9P client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Enable 9P client caching support"
depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
help
Choose Y here to enable persistent, read-only local
@@ -33,3 +31,16 @@ config 9P_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N
endif
+
+
+config 9P_FS_SECURITY
+ bool "9P Security Labels"
+ depends on 9P_FS
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the 9P filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
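
The new 9P_FS_SECURITY option builds fs/9p/xattr_security.c (listed in the diffstat above, ~80 lines), which is wired into the 9p.o composite object by the Makefile change below. Such a handler is essentially a thin wrapper that re-prefixes names in the "security." namespace and forwards them to the generic 9p xattr helpers. The following is only an illustrative sketch of that shape, assuming the existing v9fs_xattr_get()/v9fs_xattr_set() helpers from fs/9p/xattr.h and the 3.x-era xattr_handler get/set signatures; it is not the literal contents of the new file.

/*
 * Illustrative sketch of a security-namespace xattr handler for 9p.
 * Assumes v9fs_xattr_get()/v9fs_xattr_set() from fs/9p/xattr.h and
 * the 3.x-era xattr_handler callback signatures.
 */
#include <linux/slab.h>
#include <linux/xattr.h>
#include "xattr.h"

static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
                                   void *buffer, size_t size, int type)
{
        char *full;
        int retval;

        if (strcmp(name, "") == 0)
                return -EINVAL;
        /* Rebuild the full "security.<name>" attribute name and forward. */
        full = kasprintf(GFP_KERNEL, "%s%s", XATTR_SECURITY_PREFIX, name);
        if (!full)
                return -ENOMEM;
        retval = v9fs_xattr_get(dentry, full, buffer, size);
        kfree(full);
        return retval;
}

static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
                                   const void *value, size_t size,
                                   int flags, int type)
{
        char *full;
        int retval;

        if (strcmp(name, "") == 0)
                return -EINVAL;
        full = kasprintf(GFP_KERNEL, "%s%s", XATTR_SECURITY_PREFIX, name);
        if (!full)
                return -ENOMEM;
        retval = v9fs_xattr_set(dentry, full, value, size, flags);
        kfree(full);
        return retval;
}

const struct xattr_handler v9fs_xattr_security_handler = {
        .prefix = XATTR_SECURITY_PREFIX,        /* "security." */
        .get    = v9fs_xattr_security_get,
        .set    = v9fs_xattr_security_set,
};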
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ab8c1278063..ff7be98f84f 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -11,7 +11,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
v9fs.o \
fid.o \
xattr.o \
- xattr_user.o
+ xattr_user.o \
+ xattr_trusted.o
9p-$(CONFIG_9P_FSCACHE) += cache.o
9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
+9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 535ab6eccb1..8482f2d1160 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -23,6 +23,7 @@
#include "acl.h"
#include "v9fs.h"
#include "v9fs_vfs.h"
+#include "fid.h"
static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
{
@@ -37,7 +38,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
return ERR_PTR(-ENOMEM);
size = v9fs_fid_xattr_get(fid, name, value, size);
if (size > 0) {
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
goto err_out;
}
@@ -96,14 +97,10 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
return acl;
}
-int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
+struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type)
{
- struct posix_acl *acl;
struct v9fs_session_info *v9ses;
- if (flags & IPERM_FLAG_RCU)
- return -ECHILD;
-
v9ses = v9fs_inode2v9ses(inode);
if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
@@ -111,30 +108,18 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
* On access = client and acl = on mode get the acl
* values from the server
*/
- return 0;
+ return NULL;
}
- acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+ return v9fs_get_cached_acl(inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- int error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- return error;
- }
- return -EAGAIN;
}
-static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
{
int retval;
char *name;
size_t size;
void *buffer;
- struct inode *inode = dentry->d_inode;
-
- set_cached_acl(inode, type, acl);
-
if (!acl)
return 0;
@@ -143,7 +128,7 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
buffer = kmalloc(size, GFP_KERNEL);
if (!buffer)
return -ENOMEM;
- retval = posix_acl_to_xattr(acl, buffer, size);
+ retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
if (retval < 0)
goto err_free_out;
switch (type) {
@@ -156,49 +141,53 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
default:
BUG();
}
- retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+ retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0);
err_free_out:
kfree(buffer);
return retval;
}
-int v9fs_acl_chmod(struct dentry *dentry)
+int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
{
int retval = 0;
- struct posix_acl *acl, *clone;
- struct inode *inode = dentry->d_inode;
+ struct posix_acl *acl;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
if (acl) {
- clone = posix_acl_clone(acl, GFP_KERNEL);
+ retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ if (retval)
+ return retval;
+ set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+ retval = v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);
posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
- retval = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!retval)
- retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone);
- posix_acl_release(clone);
}
return retval;
}
-int v9fs_set_create_acl(struct dentry *dentry,
- struct posix_acl *dpacl, struct posix_acl *pacl)
+int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
+ struct posix_acl *dacl, struct posix_acl *acl)
{
- v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
- v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
- posix_acl_release(dpacl);
- posix_acl_release(pacl);
+ set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+ set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+ v9fs_set_acl(fid, ACL_TYPE_DEFAULT, dacl);
+ v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);
return 0;
}
-int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+void v9fs_put_acl(struct posix_acl *dacl,
+ struct posix_acl *acl)
+{
+ posix_acl_release(dacl);
+ posix_acl_release(acl);
+}
+
+int v9fs_acl_mode(struct inode *dir, umode_t *modep,
struct posix_acl **dpacl, struct posix_acl **pacl)
{
int retval = 0;
- mode_t mode = *modep;
+ umode_t mode = *modep;
struct posix_acl *acl = NULL;
if (!S_ISLNK(mode)) {
@@ -209,29 +198,18 @@ int v9fs_acl_mode(struct inode *dir, mode_t *modep,
mode &= ~current_umask();
}
if (acl) {
- struct posix_acl *clone;
-
if (S_ISDIR(mode))
- *dpacl = acl;
- clone = posix_acl_clone(acl, GFP_NOFS);
- retval = -ENOMEM;
- if (!clone)
- goto cleanup;
-
- retval = posix_acl_create_masq(clone, &mode);
- if (retval < 0) {
- posix_acl_release(clone);
- goto cleanup;
- }
+ *dpacl = posix_acl_dup(acl);
+ retval = __posix_acl_create(&acl, GFP_NOFS, &mode);
+ if (retval < 0)
+ return retval;
if (retval > 0)
- *pacl = clone;
+ *pacl = acl;
+ else
+ posix_acl_release(acl);
}
*modep = mode;
return 0;
-cleanup:
- posix_acl_release(acl);
- return retval;
-
}
static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
@@ -274,7 +252,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
return PTR_ERR(acl);
if (acl == NULL)
return -ENODATA;
- error = posix_acl_to_xattr(acl, buffer, size);
+ error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
posix_acl_release(acl);
return error;
@@ -327,7 +305,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
return -EPERM;
if (value) {
/* update the cached acl value */
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
else if (acl) {
@@ -342,7 +320,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
if (acl) {
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
retval = posix_acl_equiv_mode(acl, &mode);
if (retval < 0)
goto err_out;
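For context, a minimal userspace sketch (assuming libacl, linked with -lacl; the path is a placeholder) of fetching the access ACL that v9fs_iop_get_acl() now serves through the generic get_acl hook:

/* Userspace illustration only, not part of the patch. */
#include <stdio.h>
#include <sys/types.h>
#include <sys/acl.h>

int main(void)
{
	acl_t acl;
	char *text;

	acl = acl_get_file("/mnt/9p/file", ACL_TYPE_ACCESS);
	if (!acl) {
		perror("acl_get_file");
		return 1;
	}
	text = acl_to_text(acl, NULL);	/* human-readable form */
	if (text) {
		printf("%s", text);
		acl_free(text);
	}
	acl_free(acl);
	return 0;
}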
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 7ef3ac9f6d9..e4f7e882272 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -16,29 +16,35 @@
#ifdef CONFIG_9P_FS_POSIX_ACL
extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags);
-extern int v9fs_acl_chmod(struct dentry *);
-extern int v9fs_set_create_acl(struct dentry *,
+extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type);
+extern int v9fs_acl_chmod(struct inode *, struct p9_fid *);
+extern int v9fs_set_create_acl(struct inode *, struct p9_fid *,
struct posix_acl *, struct posix_acl *);
-extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
struct posix_acl **dpacl, struct posix_acl **pacl);
+extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
#else
-#define v9fs_check_acl NULL
+#define v9fs_iop_get_acl NULL
static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
{
return 0;
}
-static inline int v9fs_acl_chmod(struct dentry *dentry)
+static inline int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
{
return 0;
}
-static inline int v9fs_set_create_acl(struct dentry *dentry,
- struct posix_acl *dpacl,
- struct posix_acl *pacl)
+static inline int v9fs_set_create_acl(struct inode *inode,
+ struct p9_fid *fid,
+ struct posix_acl *dacl,
+ struct posix_acl *acl)
{
return 0;
}
-static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+static inline void v9fs_put_acl(struct posix_acl *dacl,
+ struct posix_acl *acl)
+{
+}
+static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep,
struct posix_acl **dpacl,
struct posix_acl **pacl)
{
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 5b335c5086a..a69260f2755 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -62,8 +62,8 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
uint16_t klen = 0;
v9ses = (struct v9fs_session_info *)cookie_netfs_data;
- P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
- buffer, bufmax);
+ p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n",
+ v9ses, buffer, bufmax);
if (v9ses->cachetag)
klen = strlen(v9ses->cachetag);
@@ -72,7 +72,7 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
return 0;
memcpy(buffer, v9ses->cachetag, klen);
- P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+ p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag);
return klen;
}
@@ -90,15 +90,15 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
&v9fs_cache_session_index_def,
- v9ses);
- P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
- v9ses->fscache);
+ v9ses, true);
+ p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
+ v9ses, v9ses->fscache);
}
void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
{
- P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
- v9ses->fscache);
+ p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
+ v9ses, v9ses->fscache);
fscache_relinquish_cookie(v9ses->fscache, 0);
v9ses->fscache = NULL;
}
@@ -108,11 +108,10 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
void *buffer, uint16_t bufmax)
{
const struct v9fs_inode *v9inode = cookie_netfs_data;
- memcpy(buffer, &v9inode->fscache_key->path,
- sizeof(v9inode->fscache_key->path));
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
- v9inode->fscache_key->path);
- return sizeof(v9inode->fscache_key->path);
+ memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path));
+ p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n",
+ &v9inode->vfs_inode, v9inode->qid.path);
+ return sizeof(v9inode->qid.path);
}
static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
@@ -121,19 +120,18 @@ static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
const struct v9fs_inode *v9inode = cookie_netfs_data;
*size = i_size_read(&v9inode->vfs_inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
- *size);
+ p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n",
+ &v9inode->vfs_inode, *size);
}
static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
void *buffer, uint16_t buflen)
{
const struct v9fs_inode *v9inode = cookie_netfs_data;
- memcpy(buffer, &v9inode->fscache_key->version,
- sizeof(v9inode->fscache_key->version));
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
- v9inode->fscache_key->version);
- return sizeof(v9inode->fscache_key->version);
+ memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version));
+ p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n",
+ &v9inode->vfs_inode, v9inode->qid.version);
+ return sizeof(v9inode->qid.version);
}
static enum
@@ -143,11 +141,11 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
{
const struct v9fs_inode *v9inode = cookie_netfs_data;
- if (buflen != sizeof(v9inode->fscache_key->version))
+ if (buflen != sizeof(v9inode->qid.version))
return FSCACHE_CHECKAUX_OBSOLETE;
- if (memcmp(buffer, &v9inode->fscache_key->version,
- sizeof(v9inode->fscache_key->version)))
+ if (memcmp(buffer, &v9inode->qid.version,
+ sizeof(v9inode->qid.version)))
return FSCACHE_CHECKAUX_OBSOLETE;
return FSCACHE_CHECKAUX_OKAY;
@@ -206,10 +204,10 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
- v9inode);
+ v9inode, true);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
- v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
+ inode, v9inode->fscache);
}
void v9fs_cache_inode_put_cookie(struct inode *inode)
@@ -218,8 +216,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode)
if (!v9inode->fscache)
return;
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
- v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
+ inode, v9inode->fscache);
fscache_relinquish_cookie(v9inode->fscache, 0);
v9inode->fscache = NULL;
@@ -231,8 +229,8 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
if (!v9inode->fscache)
return;
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
- v9inode->fscache);
+ p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
+ inode, v9inode->fscache);
fscache_relinquish_cookie(v9inode->fscache, 1);
v9inode->fscache = NULL;
@@ -241,13 +239,12 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_fid *fid;
if (!v9inode->fscache)
return;
spin_lock(&v9inode->fscache_lock);
- fid = filp->private_data;
+
if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
v9fs_cache_inode_flush_cookie(inode);
else
@@ -273,9 +270,9 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
- v9inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
- inode, old, v9inode->fscache);
+ v9inode, true);
+ p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
+ inode, old, v9inode->fscache);
spin_unlock(&v9inode->fscache_lock);
}
@@ -325,7 +322,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
int ret;
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
if (!v9inode->fscache)
return -ENOBUFS;
@@ -337,13 +334,13 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
switch (ret) {
case -ENOBUFS:
case -ENODATA:
- P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+ p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret);
return 1;
case 0:
- P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
return ret;
default:
- P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
return ret;
}
}
@@ -363,7 +360,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
int ret;
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+ p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages);
if (!v9inode->fscache)
return -ENOBUFS;
@@ -375,15 +372,15 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
switch (ret) {
case -ENOBUFS:
case -ENODATA:
- P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+ p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret);
return 1;
case 0:
BUG_ON(!list_empty(pages));
BUG_ON(*nr_pages != 0);
- P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+ p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
return ret;
default:
- P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+ p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
return ret;
}
}
@@ -398,9 +395,9 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
int ret;
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
- P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
+ p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
if (ret != 0)
v9fs_uncache_page(inode, page);
}
@@ -411,7 +408,7 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
{
const struct v9fs_inode *v9inode = V9FS_I(inode);
- P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+ p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
if (PageFsCache(page))
fscache_wait_on_page_write(v9inode->fscache, page);
}
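The cache.c changes key and validate fscache objects on the qid embedded in struct v9fs_inode rather than a separately stored pointer. A standalone sketch of the resulting aux-data check, with simplified stand-in types:

/* Standalone sketch; types and names are simplified stand-ins. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

enum checkaux { CHECKAUX_OKAY, CHECKAUX_OBSOLETE };

struct qid {
	uint64_t path;		/* identifies the file */
	uint32_t version;	/* bumped on every change */
};

static enum checkaux check_aux(const struct qid *qid,
			       const void *buf, uint16_t buflen)
{
	if (buflen != sizeof(qid->version))
		return CHECKAUX_OBSOLETE;
	if (memcmp(buf, &qid->version, sizeof(qid->version)))
		return CHECKAUX_OBSOLETE;
	return CHECKAUX_OKAY;
}

int main(void)
{
	struct qid q = { .path = 42, .version = 7 };
	uint32_t stored = 7;

	printf("%s\n", check_aux(&q, &stored, sizeof(stored)) == CHECKAUX_OKAY ?
	       "cache still valid" : "cache obsolete");
	return 0;
}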
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 049507a5b01..2f967549109 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -93,15 +93,6 @@ static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
BUG_ON(PageFsCache(page));
}
-static inline void v9fs_fscache_set_key(struct inode *inode,
- struct p9_qid *qid)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- spin_lock(&v9inode->fscache_lock);
- v9inode->fscache_key = qid;
- spin_unlock(&v9inode->fscache_lock);
-}
-
static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
struct page *page)
{
@@ -110,6 +101,18 @@ static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
#else /* CONFIG_9P_FSCACHE */
+static inline void v9fs_cache_inode_get_cookie(struct inode *inode)
+{
+}
+
+static inline void v9fs_cache_inode_put_cookie(struct inode *inode)
+{
+}
+
+static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *file)
+{
+}
+
static inline int v9fs_fscache_release_page(struct page *page,
gfp_t gfp) {
return 1;
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 0ee594569dc..d51ec9fafcc 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -41,29 +41,16 @@
*
*/
-int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
{
- struct v9fs_dentry *dent;
-
- P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n",
- fid->fid, dentry->d_name.name);
-
- dent = dentry->d_fsdata;
- if (!dent) {
- dent = kmalloc(sizeof(struct v9fs_dentry), GFP_KERNEL);
- if (!dent)
- return -ENOMEM;
-
- spin_lock_init(&dent->lock);
- INIT_LIST_HEAD(&dent->fidlist);
- dentry->d_fsdata = dent;
- }
-
- spin_lock(&dent->lock);
- list_add(&fid->dlist, &dent->fidlist);
- spin_unlock(&dent->lock);
+ hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata);
+}
- return 0;
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+{
+ spin_lock(&dentry->d_lock);
+ __add_fid(dentry, fid);
+ spin_unlock(&dentry->d_lock);
}
/**
@@ -74,24 +61,25 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
*
*/
-static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
+static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
{
- struct v9fs_dentry *dent;
struct p9_fid *fid, *ret;
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
- dentry->d_name.name, dentry, uid, any);
- dent = (struct v9fs_dentry *) dentry->d_fsdata;
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
+ dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid),
+ any);
ret = NULL;
- if (dent) {
- spin_lock(&dent->lock);
- list_for_each_entry(fid, &dent->fidlist, dlist) {
- if (any || fid->uid == uid) {
+ /* we'll recheck under lock if there's anything to look in */
+ if (dentry->d_fsdata) {
+ struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata;
+ spin_lock(&dentry->d_lock);
+ hlist_for_each_entry(fid, h, dlist) {
+ if (any || uid_eq(fid->uid, uid)) {
ret = fid;
break;
}
}
- spin_unlock(&dent->lock);
+ spin_unlock(&dentry->d_lock);
}
return ret;
@@ -126,7 +114,7 @@ err_out:
}
static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
- uid_t uid, int any)
+ kuid_t uid, int any)
{
struct dentry *ds;
char **wnames, *uname;
@@ -214,8 +202,17 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
}
kfree(wnames);
fid_out:
- if (!IS_ERR(fid))
- v9fs_fid_add(dentry, fid);
+ if (!IS_ERR(fid)) {
+ spin_lock(&dentry->d_lock);
+ if (d_unhashed(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ p9_client_clunk(fid);
+ fid = ERR_PTR(-ENOENT);
+ } else {
+ __add_fid(dentry, fid);
+ spin_unlock(&dentry->d_lock);
+ }
+ }
err_out:
up_read(&v9ses->rename_sem);
return fid;
@@ -233,7 +230,7 @@ err_out:
struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
{
- uid_t uid;
+ kuid_t uid;
int any, access;
struct v9fs_session_info *v9ses;
@@ -253,7 +250,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
break;
default:
- uid = ~0;
+ uid = INVALID_UID;
any = 0;
break;
}
@@ -272,7 +269,7 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
return ret;
}
-static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
+static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid)
{
struct p9_fid *fid, *ret;
@@ -286,12 +283,10 @@ static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
{
- int err, flags;
+ int err;
struct p9_fid *fid;
- struct v9fs_session_info *v9ses;
- v9ses = v9fs_dentry2v9ses(dentry);
- fid = v9fs_fid_clone_with_uid(dentry, 0);
+ fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID);
if (IS_ERR(fid))
goto error_out;
/*
@@ -299,17 +294,8 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
* dirty pages. We always request for the open fid in read-write
* mode so that a partial page write which results in a page
* read can work.
- *
- * we don't have a tsyncfs operation for older version
- * of protocol. So make sure the write back fid is
- * opened in O_SYNC mode.
*/
- if (!v9fs_proto_dotl(v9ses))
- flags = O_RDWR | O_SYNC;
- else
- flags = O_RDWR;
-
- err = p9_client_open(fid, flags);
+ err = p9_client_open(fid, O_RDWR);
if (err < 0) {
p9_client_clunk(fid);
fid = ERR_PTR(err);
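fid.c now keeps the per-dentry fid list head directly in dentry->d_fsdata (under d_lock) instead of a separately kmalloc'd struct v9fs_dentry. A much-simplified userspace sketch of that pattern, using a plain singly-linked list and no locking:

/* Simplified userspace sketch; not the kernel hlist implementation. */
#include <stdio.h>
#include <stdlib.h>

struct fid {
	int id;
	struct fid *next;	/* stands in for the kernel hlist linkage */
};

struct dentry {
	void *private;		/* stands in for dentry->d_fsdata */
};

static void fid_add(struct dentry *d, struct fid *f)
{
	/* Treat the private pointer itself as the list head. */
	f->next = d->private;
	d->private = f;
}

static struct fid *fid_find(struct dentry *d, int id)
{
	for (struct fid *f = d->private; f; f = f->next)
		if (f->id == id)
			return f;
	return NULL;
}

int main(void)
{
	struct dentry d = { .private = NULL };
	struct fid a = { .id = 1 }, b = { .id = 2 };

	fid_add(&d, &a);
	fid_add(&d, &b);
	printf("found fid %d\n", fid_find(&d, 1)->id);
	return 0;
}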
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index bb0b6e7f58f..2b6787fcb62 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -23,28 +23,8 @@
#define FS_9P_FID_H
#include <linux/list.h>
-/**
- * struct v9fs_dentry - 9p private data stored in dentry d_fsdata
- * @lock: protects the fidlist
- * @fidlist: list of FIDs currently associated with this dentry
- *
- * This structure defines the 9p private data associated with
- * a particular dentry. In particular, this private data is used
- * to lookup which 9P FID handle should be used for a particular VFS
- * operation. FID handles are associated with dentries instead of
- * inodes in order to more closely map functionality to the Plan 9
- * expected behavior for FID reclaimation and tracking.
- *
- * See Also: Mapping FIDs to Linux VFS model in
- * Design and Implementation of the Linux 9P File System documentation
- */
-struct v9fs_dentry {
- spinlock_t lock; /* protect fidlist */
- struct list_head fidlist;
-};
-
struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
-int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index c82b017f51f..6894b085f0e 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -23,6 +23,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -54,7 +56,7 @@ enum {
/* Options that take no arguments */
Opt_nodevmap,
/* Cache options */
- Opt_cache_loose, Opt_fscache,
+ Opt_cache_loose, Opt_fscache, Opt_mmap,
/* Access options */
Opt_access, Opt_posixacl,
/* Error token */
@@ -72,12 +74,35 @@ static const match_table_t tokens = {
{Opt_cache, "cache=%s"},
{Opt_cache_loose, "loose"},
{Opt_fscache, "fscache"},
+ {Opt_mmap, "mmap"},
{Opt_cachetag, "cachetag=%s"},
{Opt_access, "access=%s"},
{Opt_posixacl, "posixacl"},
{Opt_err, NULL}
};
+/* Interpret mount options for cache mode */
+static int get_cache_mode(char *s)
+{
+ int version = -EINVAL;
+
+ if (!strcmp(s, "loose")) {
+ version = CACHE_LOOSE;
+ p9_debug(P9_DEBUG_9P, "Cache mode: loose\n");
+ } else if (!strcmp(s, "fscache")) {
+ version = CACHE_FSCACHE;
+ p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
+ } else if (!strcmp(s, "mmap")) {
+ version = CACHE_MMAP;
+ p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n");
+ } else if (!strcmp(s, "none")) {
+ version = CACHE_NONE;
+ p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
+ } else
+ pr_info("Unknown Cache mode %s\n", s);
+ return version;
+}
+
/**
* v9fs_parse_options - parse mount options into session structure
* @v9ses: existing v9fs session information
@@ -97,7 +122,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
/* setup defaults */
v9ses->afid = ~0;
v9ses->debug = 0;
- v9ses->cache = 0;
+ v9ses->cache = CACHE_NONE;
#ifdef CONFIG_9P_FSCACHE
v9ses->cachetag = NULL;
#endif
@@ -113,21 +138,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
options = tmp_options;
while ((p = strsep(&options, ",")) != NULL) {
- int token;
+ int token, r;
if (!*p)
continue;
token = match_token(p, tokens, args);
- if (token < Opt_uname) {
- int r = match_int(&args[0], &option);
+ switch (token) {
+ case Opt_debug:
+ r = match_int(&args[0], &option);
if (r < 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
ret = r;
continue;
}
- }
- switch (token) {
- case Opt_debug:
v9ses->debug = option;
#ifdef CONFIG_NET_9P_DEBUG
p9_debug_level = option;
@@ -135,19 +158,62 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
break;
case Opt_dfltuid:
- v9ses->dfltuid = option;
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
+ v9ses->dfltuid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(v9ses->dfltuid)) {
+ p9_debug(P9_DEBUG_ERROR,
+ "uid field, but not a uid?\n");
+ ret = -EINVAL;
+ continue;
+ }
break;
case Opt_dfltgid:
- v9ses->dfltgid = option;
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
+ v9ses->dfltgid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(v9ses->dfltgid)) {
+ p9_debug(P9_DEBUG_ERROR,
+ "gid field, but not a gid?\n");
+ ret = -EINVAL;
+ continue;
+ }
break;
case Opt_afid:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
v9ses->afid = option;
break;
case Opt_uname:
- match_strlcpy(v9ses->uname, &args[0], PATH_MAX);
+ kfree(v9ses->uname);
+ v9ses->uname = match_strdup(&args[0]);
+ if (!v9ses->uname) {
+ ret = -ENOMEM;
+ goto free_and_return;
+ }
break;
case Opt_remotename:
- match_strlcpy(v9ses->aname, &args[0], PATH_MAX);
+ kfree(v9ses->aname);
+ v9ses->aname = match_strdup(&args[0]);
+ if (!v9ses->aname) {
+ ret = -ENOMEM;
+ goto free_and_return;
+ }
break;
case Opt_nodevmap:
v9ses->nodev = 1;
@@ -158,6 +224,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_fscache:
v9ses->cache = CACHE_FSCACHE;
break;
+ case Opt_mmap:
+ v9ses->cache = CACHE_MMAP;
+ break;
case Opt_cachetag:
#ifdef CONFIG_9P_FSCACHE
v9ses->cachetag = match_strdup(&args[0]);
@@ -167,17 +236,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
s = match_strdup(&args[0]);
if (!s) {
ret = -ENOMEM;
- P9_DPRINTK(P9_DEBUG_ERROR,
- "problem allocating copy of cache arg\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "problem allocating copy of cache arg\n");
+ goto free_and_return;
+ }
+ ret = get_cache_mode(s);
+ if (ret == -EINVAL) {
+ kfree(s);
goto free_and_return;
}
- if (strcmp(s, "loose") == 0)
- v9ses->cache = CACHE_LOOSE;
- else if (strcmp(s, "fscache") == 0)
- v9ses->cache = CACHE_FSCACHE;
- else
- v9ses->cache = CACHE_NONE;
+ v9ses->cache = ret;
kfree(s);
break;
@@ -185,8 +254,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
s = match_strdup(&args[0]);
if (!s) {
ret = -ENOMEM;
- P9_DPRINTK(P9_DEBUG_ERROR,
- "problem allocating copy of access arg\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "problem allocating copy of access arg\n");
goto free_and_return;
}
@@ -198,11 +267,25 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
else if (strcmp(s, "client") == 0) {
v9ses->flags |= V9FS_ACCESS_CLIENT;
} else {
+ uid_t uid;
v9ses->flags |= V9FS_ACCESS_SINGLE;
- v9ses->uid = simple_strtoul(s, &e, 10);
- if (*e != '\0')
- v9ses->uid = ~0;
+ uid = simple_strtoul(s, &e, 10);
+ if (*e != '\0') {
+ ret = -EINVAL;
+ pr_info("Unknown access argument %s\n",
+ s);
+ kfree(s);
+ goto free_and_return;
+ }
+ v9ses->uid = make_kuid(current_user_ns(), uid);
+ if (!uid_valid(v9ses->uid)) {
+ ret = -EINVAL;
+ pr_info("Uknown uid %s\n", s);
+ kfree(s);
+ goto free_and_return;
+ }
}
+
kfree(s);
break;
@@ -210,9 +293,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#ifdef CONFIG_9P_FS_POSIX_ACL
v9ses->flags |= V9FS_POSIX_ACL;
#else
- P9_DPRINTK(P9_DEBUG_ERROR,
- "Not defined CONFIG_9P_FS_POSIX_ACL. "
- "Ignoring posixacl option\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
#endif
break;
@@ -242,21 +324,21 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
struct p9_fid *fid;
int rc;
- v9ses->uname = __getname();
+ v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
if (!v9ses->uname)
return ERR_PTR(-ENOMEM);
- v9ses->aname = __getname();
+ v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
if (!v9ses->aname) {
- __putname(v9ses->uname);
+ kfree(v9ses->uname);
return ERR_PTR(-ENOMEM);
}
init_rwsem(&v9ses->rename_sem);
rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
if (rc) {
- __putname(v9ses->aname);
- __putname(v9ses->uname);
+ kfree(v9ses->aname);
+ kfree(v9ses->uname);
return ERR_PTR(rc);
}
@@ -264,9 +346,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
list_add(&v9ses->slist, &v9fs_sessionlist);
spin_unlock(&v9fs_sessionlist_lock);
- strcpy(v9ses->uname, V9FS_DEFUSER);
- strcpy(v9ses->aname, V9FS_DEFANAME);
- v9ses->uid = ~0;
+ v9ses->uid = INVALID_UID;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
@@ -274,7 +354,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (IS_ERR(v9ses->clnt)) {
retval = PTR_ERR(v9ses->clnt);
v9ses->clnt = NULL;
- P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n");
+ p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
goto error;
}
@@ -311,7 +391,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->flags &= ~V9FS_ACCESS_MASK;
v9ses->flags |= V9FS_ACCESS_ANY;
- v9ses->uid = ~0;
+ v9ses->uid = INVALID_UID;
}
if (!v9fs_proto_dotl(v9ses) ||
!((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
@@ -322,19 +402,19 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->flags &= ~V9FS_ACL_MASK;
}
- fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
+ fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID,
v9ses->aname);
if (IS_ERR(fid)) {
retval = PTR_ERR(fid);
fid = NULL;
- P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n");
+ p9_debug(P9_DEBUG_ERROR, "cannot attach\n");
goto error;
}
if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE)
fid->uid = v9ses->uid;
else
- fid->uid = ~0;
+ fid->uid = INVALID_UID;
#ifdef CONFIG_9P_FSCACHE
/* register the session for caching */
@@ -367,8 +447,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
kfree(v9ses->cachetag);
}
#endif
- __putname(v9ses->uname);
- __putname(v9ses->aname);
+ kfree(v9ses->uname);
+ kfree(v9ses->aname);
bdi_destroy(&v9ses->bdi);
@@ -385,7 +465,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
*/
void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
- P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
+ p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
p9_client_disconnect(v9ses->clnt);
}
@@ -398,7 +478,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
{
- P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+ p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
p9_client_begin_disconnect(v9ses->clnt);
}
@@ -457,7 +537,7 @@ static struct attribute_group v9fs_attr_group = {
*
*/
-static int v9fs_sysfs_init(void)
+static int __init v9fs_sysfs_init(void)
{
v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
if (!v9fs_kobj)
@@ -487,8 +567,8 @@ static void v9fs_inode_init_once(void *foo)
struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
#ifdef CONFIG_9P_FSCACHE
v9inode->fscache = NULL;
- v9inode->fscache_key = NULL;
#endif
+ memset(&v9inode->qid, 0, sizeof(v9inode->qid));
inode_init_once(&v9inode->vfs_inode);
}
@@ -515,6 +595,11 @@ static int v9fs_init_inode_cache(void)
*/
static void v9fs_destroy_inode_cache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(v9fs_inode_cache);
}
@@ -525,10 +610,11 @@ static int v9fs_cache_register(void)
if (ret < 0)
return ret;
#ifdef CONFIG_9P_FSCACHE
- return fscache_register_netfs(&v9fs_cache_netfs);
-#else
- return ret;
+ ret = fscache_register_netfs(&v9fs_cache_netfs);
+ if (ret < 0)
+ v9fs_destroy_inode_cache();
#endif
+ return ret;
}
static void v9fs_cache_unregister(void)
@@ -547,23 +633,23 @@ static void v9fs_cache_unregister(void)
static int __init init_v9fs(void)
{
int err;
- printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
+ pr_info("Installing v9fs 9p2000 file system support\n");
/* TODO: Setup list of registered transport modules */
- err = register_filesystem(&v9fs_fs_type);
- if (err < 0) {
- printk(KERN_ERR "Failed to register filesystem\n");
- return err;
- }
err = v9fs_cache_register();
if (err < 0) {
- printk(KERN_ERR "Failed to register v9fs for caching\n");
- goto out_fs_unreg;
+ pr_err("Failed to register v9fs for caching\n");
+ return err;
}
err = v9fs_sysfs_init();
if (err < 0) {
- printk(KERN_ERR "Failed to register with sysfs\n");
+ pr_err("Failed to register with sysfs\n");
+ goto out_cache;
+ }
+ err = register_filesystem(&v9fs_fs_type);
+ if (err < 0) {
+ pr_err("Failed to register filesystem\n");
goto out_sysfs_cleanup;
}
@@ -572,8 +658,8 @@ static int __init init_v9fs(void)
out_sysfs_cleanup:
v9fs_sysfs_cleanup();
-out_fs_unreg:
- unregister_filesystem(&v9fs_fs_type);
+out_cache:
+ v9fs_cache_unregister();
return err;
}
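The new get_cache_mode() helper centralises the cache= option parsing. A standalone sketch of the same strcmp() chain, with an illustrative error value in place of -EINVAL:

/* Standalone sketch mirroring the parsing style of get_cache_mode(). */
#include <stdio.h>
#include <string.h>

enum cache_modes { CACHE_NONE, CACHE_MMAP, CACHE_LOOSE, CACHE_FSCACHE };

static int get_cache_mode(const char *s)
{
	if (!strcmp(s, "loose"))
		return CACHE_LOOSE;
	if (!strcmp(s, "fscache"))
		return CACHE_FSCACHE;
	if (!strcmp(s, "mmap"))
		return CACHE_MMAP;
	if (!strcmp(s, "none"))
		return CACHE_NONE;
	return -1;		/* unknown mode, rejected by the caller */
}

int main(void)
{
	printf("mmap -> %d, bogus -> %d\n",
	       get_cache_mode("mmap"), get_cache_mode("bogus"));
	return 0;
}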
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 9665c2b840e..099c7712631 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -64,6 +64,7 @@ enum p9_session_flags {
enum p9_cache_modes {
CACHE_NONE,
+ CACHE_MMAP,
CACHE_LOOSE,
CACHE_FSCACHE,
};
@@ -109,14 +110,13 @@ struct v9fs_session_info {
char *uname; /* user name to mount as */
char *aname; /* name of remote hierarchy being mounted */
unsigned int maxdata; /* max data for client interface */
- unsigned int dfltuid; /* default uid/muid for legacy support */
- unsigned int dfltgid; /* default gid for legacy support */
- u32 uid; /* if ACCESS_SINGLE, the uid that has access */
+ kuid_t dfltuid; /* default uid/muid for legacy support */
+ kgid_t dfltgid; /* default gid for legacy support */
+ kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
struct backing_dev_info bdi;
struct rw_semaphore rename_sem;
- struct p9_fid *root_fid; /* Used for file system sync */
};
/* cache_validity flags */
@@ -126,8 +126,8 @@ struct v9fs_inode {
#ifdef CONFIG_9P_FSCACHE
spinlock_t fscache_lock;
struct fscache_cookie *fscache;
- struct p9_qid *fscache_key;
#endif
+ struct p9_qid qid;
unsigned int cache_validity;
struct p9_fid *writeback_fid;
struct mutex v_mutex;
@@ -145,7 +145,7 @@ extern void v9fs_session_close(struct v9fs_session_info *v9ses);
extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nameidata);
+ unsigned int flags);
extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -154,20 +154,20 @@ extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
void *p);
extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
struct p9_fid *fid,
- struct super_block *sb);
+ struct super_block *sb, int new);
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
struct p9_fid *fid,
- struct super_block *sb);
+ struct super_block *sb, int new);
/* other default globals */
#define V9FS_PORT 564
#define V9FS_DEFUSER "nobody"
#define V9FS_DEFANAME ""
-#define V9FS_DEFUID (-2)
-#define V9FS_DEFGID (-2)
+#define V9FS_DEFUID KUIDT_INIT(-2)
+#define V9FS_DEFGID KGIDT_INIT(-2)
static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
{
@@ -202,8 +202,27 @@ v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
struct super_block *sb)
{
if (v9fs_proto_dotl(v9ses))
- return v9fs_inode_from_fid_dotl(v9ses, fid, sb);
+ return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0);
else
- return v9fs_inode_from_fid(v9ses, fid, sb);
+ return v9fs_inode_from_fid(v9ses, fid, sb, 0);
}
+
+/**
+ * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by
+ * issuing an attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+ struct super_block *sb)
+{
+ if (v9fs_proto_dotl(v9ses))
+ return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1);
+ else
+ return v9fs_inode_from_fid(v9ses, fid, sb, 1);
+}
+
#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 4014160903a..b83ebfbf3fd 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -50,13 +50,15 @@ extern const struct dentry_operations v9fs_dentry_operations;
extern const struct dentry_operations v9fs_cached_dentry_operations;
extern const struct file_operations v9fs_cached_file_operations;
extern const struct file_operations v9fs_cached_file_operations_dotl;
+extern const struct file_operations v9fs_mmap_file_operations;
+extern const struct file_operations v9fs_mmap_file_operations_dotl;
extern struct kmem_cache *v9fs_inode_cache;
struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_destroy_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, int mode);
+ struct inode *inode, umode_t mode, dev_t);
void v9fs_evict_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -70,7 +72,8 @@ ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
void v9fs_blank_wstat(struct p9_wstat *wstat);
int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
-int v9fs_file_fsync_dotl(struct file *filp, int datasync);
+int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
+ int datasync);
ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
const char __user *, size_t, loff_t *, int);
int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
@@ -82,4 +85,6 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
return;
}
+
+int v9fs_open_to_dotl_flags(int flags);
#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 2524e4cbb8e..cc1cfae726b 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,6 +33,7 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -56,7 +57,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
struct inode *inode;
inode = page->mapping->host;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
BUG_ON(!PageLocked(page));
@@ -116,14 +117,14 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
struct inode *inode;
inode = mapping->host;
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+ p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
if (ret == 0)
return ret;
ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
- P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
+ p9_debug(P9_DEBUG_VFS, " = %d\n", ret);
return ret;
}
@@ -147,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
* @offset: offset in the page
*/
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* If called with zero offset, we should release
* the private state associated with the page
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
v9fs_fscache_invalidate_page(page);
}
@@ -200,6 +202,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
{
int retval;
+ p9_debug(P9_DEBUG_VFS, "page %p\n", page);
+
retval = v9fs_vfs_writepage_locked(page);
if (retval < 0) {
if (retval == -EAGAIN) {
@@ -255,18 +259,16 @@ static int v9fs_launder_page(struct page *page)
*
*/
static ssize_t
-v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
+v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
/*
* FIXME
* Now that we do caching with cache mode enabled, We need
* to support direct IO
*/
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
- "off/no(%lld/%lu) EINVAL\n",
- iocb->ki_filp->f_path.dentry->d_name.name,
- (long long) pos, nr_segs);
+ p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
+ iocb->ki_filp->f_path.dentry->d_name.name,
+ (long long)pos, iter->nr_segs);
return -EINVAL;
}
@@ -281,6 +283,9 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
struct inode *inode = mapping->host;
+
+ p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
v9inode = V9FS_I(inode);
start:
page = grab_cache_page_write_begin(mapping, index, flags);
@@ -311,6 +316,8 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
loff_t last_pos = pos + copied;
struct inode *inode = page->mapping->host;
+ p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
if (unlikely(copied < len)) {
/*
* zero out the rest of the area
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index b6a3b9f7fe4..b03dd23feda 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -43,31 +43,14 @@
#include "fid.h"
/**
- * v9fs_dentry_delete - called when dentry refcount equals 0
- * @dentry: dentry in question
- *
- * By returning 1 here we should remove cacheing of unused
- * dentry components.
- *
- */
-
-static int v9fs_dentry_delete(const struct dentry *dentry)
-{
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
- dentry);
-
- return 1;
-}
-
-/**
* v9fs_cached_dentry_delete - called when dentry refcount equals 0
* @dentry: dentry in question
*
*/
static int v9fs_cached_dentry_delete(const struct dentry *dentry)
{
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
- dentry->d_name.name, dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+ dentry->d_name.name, dentry);
/* Don't cache negative dentries */
if (!dentry->d_inode)
@@ -83,30 +66,21 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
static void v9fs_dentry_release(struct dentry *dentry)
{
- struct v9fs_dentry *dent;
- struct p9_fid *temp, *current_fid;
-
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
- dentry);
- dent = dentry->d_fsdata;
- if (dent) {
- list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
- dlist) {
- p9_client_clunk(current_fid);
- }
-
- kfree(dent);
- dentry->d_fsdata = NULL;
- }
+ struct hlist_node *p, *n;
+ p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+ dentry->d_name.name, dentry);
+ hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
+ p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
+ dentry->d_fsdata = NULL;
}
-static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
{
struct p9_fid *fid;
struct inode *inode;
struct v9fs_inode *v9inode;
- if (nd->flags & LOOKUP_RCU)
+ if (flags & LOOKUP_RCU)
return -ECHILD;
inode = dentry->d_inode;
@@ -126,7 +100,9 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
retval = v9fs_refresh_inode_dotl(fid, inode);
else
retval = v9fs_refresh_inode(fid, inode);
- if (retval <= 0)
+ if (retval == -ENOENT)
+ return 0;
+ if (retval < 0)
return retval;
}
out_valid:
@@ -135,11 +111,12 @@ out_valid:
const struct dentry_operations v9fs_cached_dentry_operations = {
.d_revalidate = v9fs_lookup_revalidate,
+ .d_weak_revalidate = v9fs_lookup_revalidate,
.d_delete = v9fs_cached_dentry_delete,
.d_release = v9fs_dentry_release,
};
const struct dentry_operations v9fs_dentry_operations = {
- .d_delete = v9fs_dentry_delete,
+ .d_delete = always_delete_dentry,
.d_release = v9fs_dentry_release,
};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 9c2bdda5cd9..0b3bfa303dd 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -42,7 +42,6 @@
/**
* struct p9_rdir - readdir accounting
- * @mutex: mutex protecting readdir
* @head: start offset of current dirread buffer
* @tail: end offset of current dirread buffer
* @buf: dirread buffer
@@ -52,10 +51,9 @@
*/
struct p9_rdir {
- struct mutex mutex;
int head;
int tail;
- uint8_t *buf;
+ uint8_t buf[];
};
/**
@@ -93,46 +91,24 @@ static void p9stat_init(struct p9_wstat *stbuf)
*
*/
-static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
+static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
{
- struct p9_rdir *rdir;
- struct p9_fid *fid;
- int err = 0;
-
- fid = filp->private_data;
- if (!fid->rdir) {
- rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
-
- if (rdir == NULL) {
- err = -ENOMEM;
- goto exit;
- }
- spin_lock(&filp->f_dentry->d_lock);
- if (!fid->rdir) {
- rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir);
- mutex_init(&rdir->mutex);
- rdir->head = rdir->tail = 0;
- fid->rdir = (void *) rdir;
- rdir = NULL;
- }
- spin_unlock(&filp->f_dentry->d_lock);
- kfree(rdir);
- }
-exit:
- return err;
+ struct p9_fid *fid = filp->private_data;
+ if (!fid->rdir)
+ fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
+ return fid->rdir;
}
/**
- * v9fs_dir_readdir - read a directory
- * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * v9fs_dir_readdir - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
*
*/
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
{
- int over;
+ bool over;
struct p9_wstat st;
int err = 0;
struct p9_fid *fid;
@@ -140,100 +116,77 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
int reclen = 0;
struct p9_rdir *rdir;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
- fid = filp->private_data;
+ p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ fid = file->private_data;
buflen = fid->clnt->msize - P9_IOHDRSZ;
- err = v9fs_alloc_rdir_buf(filp, buflen);
- if (err)
- goto exit;
- rdir = (struct p9_rdir *) fid->rdir;
+ rdir = v9fs_alloc_rdir_buf(file, buflen);
+ if (!rdir)
+ return -ENOMEM;
- err = mutex_lock_interruptible(&rdir->mutex);
- if (err)
- return err;
- while (err == 0) {
+ while (1) {
if (rdir->tail == rdir->head) {
- err = v9fs_file_readn(filp, rdir->buf, NULL,
- buflen, filp->f_pos);
+ err = v9fs_file_readn(file, rdir->buf, NULL,
+ buflen, ctx->pos);
if (err <= 0)
- goto unlock_and_exit;
+ return err;
rdir->head = 0;
rdir->tail = err;
}
while (rdir->head < rdir->tail) {
p9stat_init(&st);
- err = p9stat_read(rdir->buf + rdir->head,
- rdir->tail - rdir->head, &st,
- fid->clnt->proto_version);
+ err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
+ rdir->tail - rdir->head, &st);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
- err = -EIO;
+ p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
p9stat_free(&st);
- goto unlock_and_exit;
+ return -EIO;
}
reclen = st.size+2;
- over = filldir(dirent, st.name, strlen(st.name),
- filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
-
+ over = !dir_emit(ctx, st.name, strlen(st.name),
+ v9fs_qid2ino(&st.qid), dt_type(&st));
p9stat_free(&st);
+ if (over)
+ return 0;
- if (over) {
- err = 0;
- goto unlock_and_exit;
- }
rdir->head += reclen;
- filp->f_pos += reclen;
+ ctx->pos += reclen;
}
}
-
-unlock_and_exit:
- mutex_unlock(&rdir->mutex);
-exit:
- return err;
}
/**
- * v9fs_dir_readdir_dotl - read a directory
- * @filp: opened file structure
- * @dirent: buffer to fill dirent structures
- * @filldir: function to populate dirent structures
+ * v9fs_dir_readdir_dotl - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
*
*/
-static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
- filldir_t filldir)
+static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
{
- int over;
int err = 0;
struct p9_fid *fid;
int buflen;
struct p9_rdir *rdir;
struct p9_dirent curdirent;
- u64 oldoffset = 0;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
- fid = filp->private_data;
+ p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ fid = file->private_data;
buflen = fid->clnt->msize - P9_READDIRHDRSZ;
- err = v9fs_alloc_rdir_buf(filp, buflen);
- if (err)
- goto exit;
- rdir = (struct p9_rdir *) fid->rdir;
-
- err = mutex_lock_interruptible(&rdir->mutex);
- if (err)
- return err;
+ rdir = v9fs_alloc_rdir_buf(file, buflen);
+ if (!rdir)
+ return -ENOMEM;
- while (err == 0) {
+ while (1) {
if (rdir->tail == rdir->head) {
err = p9_client_readdir(fid, rdir->buf, buflen,
- filp->f_pos);
+ ctx->pos);
if (err <= 0)
- goto unlock_and_exit;
+ return err;
rdir->head = 0;
rdir->tail = err;
@@ -241,42 +194,24 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
while (rdir->head < rdir->tail) {
- err = p9dirent_read(rdir->buf + rdir->head,
- rdir->tail - rdir->head,
- &curdirent,
- fid->clnt->proto_version);
+ err = p9dirent_read(fid->clnt, rdir->buf + rdir->head,
+ rdir->tail - rdir->head,
+ &curdirent);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
- err = -EIO;
- goto unlock_and_exit;
+ p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
+ return -EIO;
}
- /* d_off in dirent structure tracks the offset into
- * the next dirent in the dir. However, filldir()
- * expects offset into the current dirent. Hence
- * while calling filldir send the offset from the
- * previous dirent structure.
- */
- over = filldir(dirent, curdirent.d_name,
- strlen(curdirent.d_name),
- oldoffset, v9fs_qid2ino(&curdirent.qid),
- curdirent.d_type);
- oldoffset = curdirent.d_off;
-
- if (over) {
- err = 0;
- goto unlock_and_exit;
- }
+ if (!dir_emit(ctx, curdirent.d_name,
+ strlen(curdirent.d_name),
+ v9fs_qid2ino(&curdirent.qid),
+ curdirent.d_type))
+ return 0;
- filp->f_pos = curdirent.d_off;
+ ctx->pos = curdirent.d_off;
rdir->head += err;
}
}
-
-unlock_and_exit:
- mutex_unlock(&rdir->mutex);
-exit:
- return err;
}
@@ -292,9 +227,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
struct p9_fid *fid;
fid = filp->private_data;
- P9_DPRINTK(P9_DEBUG_VFS,
- "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
- inode, filp, fid ? fid->fid : -1);
+ p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
+ inode, filp, fid ? fid->fid : -1);
if (fid)
p9_client_clunk(fid);
return 0;
@@ -303,7 +237,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
const struct file_operations v9fs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = v9fs_dir_readdir,
+ .iterate = v9fs_dir_readdir,
.open = v9fs_file_open,
.release = v9fs_dir_release,
};
@@ -311,7 +245,7 @@ const struct file_operations v9fs_dir_operations = {
const struct file_operations v9fs_dir_operations_dotl = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = v9fs_dir_readdir_dotl,
+ .iterate = v9fs_dir_readdir_dotl,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.fsync = v9fs_file_fsync_dotl,
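Both readdir paths now share the same unlocked buffering loop: refill the per-fid buffer when head catches up with tail, consume whole records, and stop once the actor refuses an entry. A toy standalone sketch of that loop shape (the fill callback and record size are made up):

/* Toy standalone sketch of the refill-then-consume loop. */
#include <stdio.h>
#include <string.h>

#define BUFLEN 16

struct rdir {
	int head;	/* next unconsumed byte */
	int tail;	/* end of valid data */
	char buf[BUFLEN];
};

/* Toy producer: returns bytes filled, 0 at end of directory. */
static int fill(struct rdir *r, int *calls)
{
	if (*calls >= 2)
		return 0;
	memset(r->buf, 'a' + (*calls)++, BUFLEN);
	return BUFLEN;
}

int main(void)
{
	struct rdir r = { 0, 0, {0} };
	int calls = 0, emitted = 0;

	while (1) {
		if (r.tail == r.head) {
			int n = fill(&r, &calls);
			if (n <= 0)
				break;		/* error or end of directory */
			r.head = 0;
			r.tail = n;
		}
		while (r.head < r.tail) {
			/* one "record" is 4 bytes in this toy example */
			emitted++;
			r.head += 4;
		}
	}
	printf("emitted %d records\n", emitted);
	return 0;
}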
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index ffed55817f0..520c11c2dcc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -45,6 +45,7 @@
#include "cache.h"
static const struct vm_operations_struct v9fs_file_vm_ops;
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops;
/**
* v9fs_file_open - open a file (or directory)
@@ -61,11 +62,11 @@ int v9fs_file_open(struct inode *inode, struct file *file)
struct p9_fid *fid;
int omode;
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
+ p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
v9inode = V9FS_I(inode);
v9ses = v9fs_inode2v9ses(inode);
if (v9fs_proto_dotl(v9ses))
- omode = file->f_flags;
+ omode = v9fs_open_to_dotl_flags(file->f_flags);
else
omode = v9fs_uflags2omode(file->f_flags,
v9fs_proto_dotu(v9ses));
@@ -80,10 +81,6 @@ int v9fs_file_open(struct inode *inode, struct file *file)
p9_client_clunk(fid);
return err;
}
- if (file->f_flags & O_TRUNC) {
- i_size_write(inode, 0);
- inode->i_blocks = 0;
- }
if ((file->f_flags & O_APPEND) &&
(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
generic_file_llseek(file, 0, SEEK_END);
@@ -91,7 +88,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
file->private_data = fid;
mutex_lock(&v9inode->v_mutex);
- if (v9ses->cache && !v9inode->writeback_fid &&
+ if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+ !v9inode->writeback_fid &&
((file->f_flags & O_ACCMODE) != O_RDONLY)) {
/*
* clone a fid and add it to writeback_fid
@@ -109,10 +107,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
v9inode->writeback_fid = (void *) fid;
}
mutex_unlock(&v9inode->v_mutex);
-#ifdef CONFIG_9P_FSCACHE
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
v9fs_cache_inode_set_cookie(inode, file);
-#endif
return 0;
out_error:
p9_client_clunk(file->private_data);
@@ -133,9 +129,9 @@ out_error:
static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
{
int res = 0;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
- P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
+ p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -169,14 +165,25 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
/* convert posix lock to p9 tlock args */
memset(&flock, 0, sizeof(flock));
- flock.type = fl->fl_type;
+ /* map the lock type */
+ switch (fl->fl_type) {
+ case F_RDLCK:
+ flock.type = P9_LOCK_TYPE_RDLCK;
+ break;
+ case F_WRLCK:
+ flock.type = P9_LOCK_TYPE_WRLCK;
+ break;
+ case F_UNLCK:
+ flock.type = P9_LOCK_TYPE_UNLCK;
+ break;
+ }
flock.start = fl->fl_start;
if (fl->fl_end == OFFSET_MAX)
flock.length = 0;
else
flock.length = fl->fl_end - fl->fl_start + 1;
flock.proc_id = fl->fl_pid;
- flock.client_id = utsname()->nodename;
+ flock.client_id = fid->clnt->name;
if (IS_SETLKW(cmd))
flock.flags = P9_LOCK_FLAGS_BLOCK;
@@ -193,7 +200,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
break;
- schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
+ if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+ break;
}
/* map 9p status to VFS status */
@@ -245,29 +253,38 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
/* convert posix lock to p9 tgetlock args */
memset(&glock, 0, sizeof(glock));
- glock.type = fl->fl_type;
+ glock.type = P9_LOCK_TYPE_UNLCK;
glock.start = fl->fl_start;
if (fl->fl_end == OFFSET_MAX)
glock.length = 0;
else
glock.length = fl->fl_end - fl->fl_start + 1;
glock.proc_id = fl->fl_pid;
- glock.client_id = utsname()->nodename;
+ glock.client_id = fid->clnt->name;
res = p9_client_getlock_dotl(fid, &glock);
if (res < 0)
return res;
- if (glock.type != F_UNLCK) {
- fl->fl_type = glock.type;
+ /* map 9p lock type to os lock type */
+ switch (glock.type) {
+ case P9_LOCK_TYPE_RDLCK:
+ fl->fl_type = F_RDLCK;
+ break;
+ case P9_LOCK_TYPE_WRLCK:
+ fl->fl_type = F_WRLCK;
+ break;
+ case P9_LOCK_TYPE_UNLCK:
+ fl->fl_type = F_UNLCK;
+ break;
+ }
+ if (glock.type != P9_LOCK_TYPE_UNLCK) {
fl->fl_start = glock.start;
if (glock.length == 0)
fl->fl_end = OFFSET_MAX;
else
fl->fl_end = glock.start + glock.length - 1;
fl->fl_pid = glock.proc_id;
- } else
- fl->fl_type = F_UNLCK;
-
+ }
return res;
}
@@ -281,11 +298,11 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int ret = -ENOLCK;
- P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
- cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+ filp, cmd, fl, filp->f_path.dentry->d_name.name);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -317,11 +334,11 @@ out_err:
static int v9fs_file_flock_dotl(struct file *filp, int cmd,
struct file_lock *fl)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int ret = -ENOLCK;
- P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
- cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+ filp, cmd, fl, filp->f_path.dentry->d_name.name);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -335,9 +352,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
/* Convert flock to posix lock */
- fl->fl_owner = (fl_owner_t)filp;
- fl->fl_start = 0;
- fl->fl_end = OFFSET_MAX;
fl->fl_flags |= FL_POSIX;
fl->fl_flags ^= FL_FLOCK;
@@ -364,8 +378,8 @@ v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
{
int n, total, size;
- P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
- (long long unsigned) offset, count);
+ p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n",
+ fid->fid, (long long unsigned)offset, count);
n = 0;
total = 0;
size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -423,7 +437,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
struct p9_fid *fid;
size_t size;
- P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
+ p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
fid = filp->private_data;
size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -446,14 +460,12 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
int n;
loff_t i_size;
size_t total = 0;
- struct p9_client *clnt;
loff_t origin = *offset;
unsigned long pg_start, pg_end;
- P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
- (int)count, (int)*offset);
+ p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n",
+ data, (int)count, (int)*offset);
- clnt = fid->clnt;
do {
n = p9_client_write(fid, NULL, data+total, origin+total, count);
if (n <= 0)
@@ -508,7 +520,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
if (!count)
goto out;
- retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode,
+ retval = v9fs_file_write_internal(file_inode(filp),
filp->private_data,
data, count, &origin, 1);
/* update offset on successful write */
@@ -519,41 +531,59 @@ out:
}
-static int v9fs_file_fsync(struct file *filp, int datasync)
+static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync)
{
struct p9_fid *fid;
+ struct inode *inode = filp->f_mapping->host;
struct p9_wstat wstat;
int retval;
- P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
+ retval = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (retval)
+ return retval;
+
+ mutex_lock(&inode->i_mutex);
+ p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
v9fs_blank_wstat(&wstat);
retval = p9_client_wstat(fid, &wstat);
+ mutex_unlock(&inode->i_mutex);
+
return retval;
}
-int v9fs_file_fsync_dotl(struct file *filp, int datasync)
+int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
+ int datasync)
{
struct p9_fid *fid;
+ struct inode *inode = filp->f_mapping->host;
int retval;
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
- filp, datasync);
+ retval = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (retval)
+ return retval;
+
+ mutex_lock(&inode->i_mutex);
+ p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
retval = p9_client_fsync(fid, datasync);
+ mutex_unlock(&inode->i_mutex);
+
return retval;
}
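Both fsync variants above now follow the range-based ->fsync prototype: dirty pages in [start, end] are pushed with filemap_write_and_wait_range() before the protocol-level sync (a blank wstat for legacy 9P, TFSYNC for 9P2000.L), and i_mutex is held across the wire call. Userspace still gets whole-file semantics, because fsync(2) and fdatasync(2) reach the filesystem through vfs_fsync_range(file, 0, LLONG_MAX, datasync). A small userspace illustration (the helper name and path handling are only a sketch):

#include <fcntl.h>
#include <unistd.h>

/* fsync() and fdatasync() both arrive in ->fsync as start = 0, end = LLONG_MAX;
 * only the datasync argument differs. */
int flush_file(const char *path, int data_only)
{
	int fd = open(path, O_WRONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = data_only ? fdatasync(fd) : fsync(fd);
	close(fd);
	return ret;
}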
static int
-v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
int retval;
- retval = generic_file_mmap(file, vma);
+
+ retval = generic_file_mmap(filp, vma);
if (!retval)
vma->vm_ops = &v9fs_file_vm_ops;
@@ -561,16 +591,56 @@ v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
}
static int
+v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ int retval;
+ struct inode *inode;
+ struct v9fs_inode *v9inode;
+ struct p9_fid *fid;
+
+ inode = file_inode(filp);
+ v9inode = V9FS_I(inode);
+ mutex_lock(&v9inode->v_mutex);
+ if (!v9inode->writeback_fid &&
+ (vma->vm_flags & VM_WRITE)) {
+ /*
+ * clone a fid and add it to writeback_fid
+ * we do it during mmap instead of
+ * page dirty time via write_begin/page_mkwrite
+ * because we want write after unlink usecase
+ * to work.
+ */
+ fid = v9fs_writeback_fid(filp->f_path.dentry);
+ if (IS_ERR(fid)) {
+ retval = PTR_ERR(fid);
+ mutex_unlock(&v9inode->v_mutex);
+ return retval;
+ }
+ v9inode->writeback_fid = (void *) fid;
+ }
+ mutex_unlock(&v9inode->v_mutex);
+
+ retval = generic_file_mmap(filp, vma);
+ if (!retval)
+ vma->vm_ops = &v9fs_mmap_file_vm_ops;
+
+ return retval;
+}
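v9fs_mmap_file_mmap() is the mmap hook for the new CACHE_MMAP (cache=mmap) mode: when the mapping is writable (VM_WRITE) it clones a writeback fid up front, so dirty pages can still be flushed to the server after the last path-based fid is gone. The case it protects is write-after-unlink; a userspace sketch, with a hypothetical 9p mount point:

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/9p/scratch", O_RDWR | O_CREAT, 0644); /* hypothetical path */
	char *p;

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	unlink("/mnt/9p/scratch");	/* the name goes away, the mapping stays usable */
	p[0] = 'x';			/* this dirty page still needs a fid to reach the server */
	return msync(p, 4096, MS_SYNC) != 0;
}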
+
+static int
v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct v9fs_inode *v9inode;
struct page *page = vmf->page;
struct file *filp = vma->vm_file;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
- P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
- page, (unsigned long)filp->private_data);
+ p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
+ page, (unsigned long)filp->private_data);
+
+ /* Update file times before taking page lock */
+ file_update_time(filp);
v9inode = V9FS_I(inode);
/* make sure the cache has finished storing the page */
@@ -579,6 +649,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
lock_page(page);
if (page->mapping != inode->i_mapping)
goto out_unlock;
+ wait_for_stable_page(page);
return VM_FAULT_LOCKED;
out_unlock:
@@ -610,7 +681,7 @@ v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
/**
* v9fs_cached_file_read - read from a file
* @filp: file pointer to read
- * @udata: user data buffer to read data into
+ * @data: user data buffer to read data into
* @count: size of buffer
* @offset: offset at which to read data
*
@@ -621,7 +692,23 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
{
if (filp->f_flags & O_DIRECT)
return v9fs_direct_read(filp, data, count, offset);
- return do_sync_read(filp, data, count, offset);
+ return new_sync_read(filp, data, count, offset);
+}
+
+/**
+ * v9fs_mmap_file_read - read from a file
+ * @filp: file pointer to read
+ * @data: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count,
+ loff_t *offset)
+{
+ /* TODO: Check if there are dirty pages */
+ return v9fs_file_read(filp, data, count, offset);
}
static ssize_t
@@ -673,7 +760,7 @@ err_out:
buff_write:
mutex_unlock(&inode->i_mutex);
- return do_sync_write(filp, data, count, offsetp);
+ return new_sync_write(filp, data, count, offsetp);
}
/**
@@ -691,12 +778,68 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
if (filp->f_flags & O_DIRECT)
return v9fs_direct_write(filp, data, count, offset);
- return do_sync_write(filp, data, count, offset);
+ return new_sync_write(filp, data, count, offset);
+}
+
+
+/**
+ * v9fs_mmap_file_write - write to a file
+ * @filp: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_write(struct file *filp, const char __user *data,
+ size_t count, loff_t *offset)
+{
+ /*
+ * TODO: invalidate mmaps on filp's inode between
+ * offset and offset+count
+ */
+ return v9fs_file_write(filp, data, count, offset);
}
+static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
+{
+ struct inode *inode;
+
+ struct writeback_control wbc = {
+ .nr_to_write = LONG_MAX,
+ .sync_mode = WB_SYNC_ALL,
+ .range_start = vma->vm_pgoff * PAGE_SIZE,
+ /* absolute end, byte at end included */
+ .range_end = vma->vm_pgoff * PAGE_SIZE +
+ (vma->vm_end - vma->vm_start - 1),
+ };
+
+
+ p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
+
+ inode = file_inode(vma->vm_file);
+
+ if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ wbc.nr_to_write = 0;
+
+ might_sleep();
+ sync_inode(inode, &wbc);
+}
+
+
static const struct vm_operations_struct v9fs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = v9fs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
+};
+
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
+ .close = v9fs_mmap_vm_close,
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = v9fs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
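v9fs_mmap_vm_close() flushes exactly the byte range the dying VMA covered, derived from vm_pgoff and the VMA length: a two-page mapping at file offset 8192, for example, syncs bytes 8192 through 16383 inclusive. Written out as a helper (illustrative only, not part of the diff):

#include <linux/mm.h>

/* sketch: the inclusive byte range handed to sync_inode() for a given VMA */
static void v9fs_vma_byte_range(const struct vm_area_struct *vma,
				loff_t *start, loff_t *end)
{
	*start = (loff_t)vma->vm_pgoff * PAGE_SIZE;
	*end = *start + (vma->vm_end - vma->vm_start - 1);
}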
@@ -704,8 +847,8 @@ const struct file_operations v9fs_cached_file_operations = {
.llseek = generic_file_llseek,
.read = v9fs_cached_file_read,
.write = v9fs_cached_file_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock,
@@ -717,8 +860,8 @@ const struct file_operations v9fs_cached_file_operations_dotl = {
.llseek = generic_file_llseek,
.read = v9fs_cached_file_read,
.write = v9fs_cached_file_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock_dotl,
@@ -749,3 +892,26 @@ const struct file_operations v9fs_file_operations_dotl = {
.mmap = generic_file_readonly_mmap,
.fsync = v9fs_file_fsync_dotl,
};
+
+const struct file_operations v9fs_mmap_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = v9fs_mmap_file_read,
+ .write = v9fs_mmap_file_write,
+ .open = v9fs_file_open,
+ .release = v9fs_dir_release,
+ .lock = v9fs_file_lock,
+ .mmap = v9fs_mmap_file_mmap,
+ .fsync = v9fs_file_fsync,
+};
+
+const struct file_operations v9fs_mmap_file_operations_dotl = {
+ .llseek = generic_file_llseek,
+ .read = v9fs_mmap_file_read,
+ .write = v9fs_mmap_file_write,
+ .open = v9fs_file_open,
+ .release = v9fs_dir_release,
+ .lock = v9fs_file_lock_dotl,
+ .flock = v9fs_file_flock_dotl,
+ .mmap = v9fs_mmap_file_mmap,
+ .fsync = v9fs_file_fsync_dotl,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7f6c6770319..7fa4f7a7653 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -23,6 +23,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -59,15 +61,13 @@ static const struct inode_operations v9fs_symlink_inode_operations;
*
*/
-static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
+static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode)
{
int res;
res = mode & 0777;
if (S_ISDIR(mode))
res |= P9_DMDIR;
if (v9fs_proto_dotu(v9ses)) {
- if (S_ISLNK(mode))
- res |= P9_DMSYMLINK;
if (v9ses->nodev == 0) {
if (S_ISSOCK(mode))
res |= P9_DMSOCKET;
@@ -85,25 +85,51 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
res |= P9_DMSETGID;
if ((mode & S_ISVTX) == S_ISVTX)
res |= P9_DMSETVTX;
- if ((mode & P9_DMLINK))
- res |= P9_DMLINK;
}
+ return res;
+}
+
+/**
+ * p9mode2perm- convert plan9 mode bits to unix permission bits
+ * @v9ses: v9fs session information
+ * @stat: p9_wstat from which the mode needs to be derived
+ *
+ */
+static int p9mode2perm(struct v9fs_session_info *v9ses,
+ struct p9_wstat *stat)
+{
+ int res;
+ int mode = stat->mode;
+
+ res = mode & S_IALLUGO;
+ if (v9fs_proto_dotu(v9ses)) {
+ if ((mode & P9_DMSETUID) == P9_DMSETUID)
+ res |= S_ISUID;
+ if ((mode & P9_DMSETGID) == P9_DMSETGID)
+ res |= S_ISGID;
+
+ if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
+ res |= S_ISVTX;
+ }
return res;
}
/**
* p9mode2unixmode- convert plan9 mode bits to unix mode bits
* @v9ses: v9fs session information
- * @mode: mode to convert
+ * @stat: p9_wstat from which the mode needs to be derived
+ * @rdev: major number, minor number in case of device files.
*
*/
-
-static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
+static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
+ struct p9_wstat *stat, dev_t *rdev)
{
int res;
+ u32 mode = stat->mode;
- res = mode & 0777;
+ *rdev = 0;
+ res = p9mode2perm(v9ses, stat);
if ((mode & P9_DMDIR) == P9_DMDIR)
res |= S_IFDIR;
@@ -116,21 +142,26 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
&& (v9ses->nodev == 0))
res |= S_IFIFO;
else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
- && (v9ses->nodev == 0))
- res |= S_IFBLK;
- else
- res |= S_IFREG;
+ && (v9ses->nodev == 0)) {
+ char type = 0, ext[32];
+ int major = -1, minor = -1;
- if (v9fs_proto_dotu(v9ses)) {
- if ((mode & P9_DMSETUID) == P9_DMSETUID)
- res |= S_ISUID;
-
- if ((mode & P9_DMSETGID) == P9_DMSETGID)
- res |= S_ISGID;
-
- if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
- res |= S_ISVTX;
- }
+ strlcpy(ext, stat->extension, sizeof(ext));
+ sscanf(ext, "%c %i %i", &type, &major, &minor);
+ switch (type) {
+ case 'c':
+ res |= S_IFCHR;
+ break;
+ case 'b':
+ res |= S_IFBLK;
+ break;
+ default:
+ p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n",
+ type, stat->extension);
+ };
+ *rdev = MKDEV(major, minor);
+ } else
+ res |= S_IFREG;
return res;
}
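The p9mode2unixmode() rework decodes character/block device numbers from the 9p2000.u extension string, which carries them as "<type> <major> <minor>"; the mknod path builds the same string when creating device nodes. A small standalone illustration of that round trip:

#include <stdio.h>

int main(void)
{
	char ext[32];
	char type = 0;
	int major = -1, minor = -1;

	snprintf(ext, sizeof(ext), "b %u %u", 8, 1);	/* block device 8:1 */
	sscanf(ext, "%c %i %i", &type, &major, &minor);
	printf("%c %d:%d\n", type, major, minor);	/* prints: b 8:1 */
	return 0;
}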
@@ -161,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended)
break;
}
- if (uflags & O_TRUNC)
- ret |= P9_OTRUNC;
-
if (extended) {
if (uflags & O_EXCL)
ret |= P9_OEXCL;
@@ -197,9 +225,9 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
wstat->uid = NULL;
wstat->gid = NULL;
wstat->muid = NULL;
- wstat->n_uid = ~0;
- wstat->n_gid = ~0;
- wstat->n_muid = ~0;
+ wstat->n_uid = INVALID_UID;
+ wstat->n_gid = INVALID_GID;
+ wstat->n_muid = INVALID_UID;
wstat->extension = NULL;
}
@@ -216,7 +244,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
return NULL;
#ifdef CONFIG_9P_FSCACHE
v9inode->fscache = NULL;
- v9inode->fscache_key = NULL;
spin_lock_init(&v9inode->fscache_lock);
#endif
v9inode->writeback_fid = NULL;
@@ -233,7 +260,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
static void v9fs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
}
@@ -243,13 +269,13 @@ void v9fs_destroy_inode(struct inode *inode)
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, int mode)
+ struct inode *inode, umode_t mode, dev_t rdev)
{
int err = 0;
inode_init_owner(inode, NULL, mode);
inode->i_blocks = 0;
- inode->i_rdev = 0;
+ inode->i_rdev = rdev;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_mapping->a_ops = &v9fs_addr_operations;
@@ -260,13 +286,11 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
case S_IFSOCK:
if (v9fs_proto_dotl(v9ses)) {
inode->i_op = &v9fs_file_inode_operations_dotl;
- inode->i_fop = &v9fs_file_operations_dotl;
} else if (v9fs_proto_dotu(v9ses)) {
inode->i_op = &v9fs_file_inode_operations;
- inode->i_fop = &v9fs_file_operations;
} else {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "special files without extended mode\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "special files without extended mode\n");
err = -EINVAL;
goto error;
}
@@ -275,15 +299,22 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
case S_IFREG:
if (v9fs_proto_dotl(v9ses)) {
inode->i_op = &v9fs_file_inode_operations_dotl;
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE ||
+ v9ses->cache == CACHE_FSCACHE)
inode->i_fop =
&v9fs_cached_file_operations_dotl;
+ else if (v9ses->cache == CACHE_MMAP)
+ inode->i_fop = &v9fs_mmap_file_operations_dotl;
else
inode->i_fop = &v9fs_file_operations_dotl;
} else {
inode->i_op = &v9fs_file_inode_operations;
- if (v9ses->cache)
- inode->i_fop = &v9fs_cached_file_operations;
+ if (v9ses->cache == CACHE_LOOSE ||
+ v9ses->cache == CACHE_FSCACHE)
+ inode->i_fop =
+ &v9fs_cached_file_operations;
+ else if (v9ses->cache == CACHE_MMAP)
+ inode->i_fop = &v9fs_mmap_file_operations;
else
inode->i_fop = &v9fs_file_operations;
}
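With CACHE_MMAP added, the regular-file file_operations are now chosen per cache mode rather than by a simple cached/uncached test. The same selection, condensed into a switch for readability (sketch only, 9P2000.L branch shown):

switch (v9ses->cache) {
case CACHE_LOOSE:
case CACHE_FSCACHE:
	inode->i_fop = &v9fs_cached_file_operations_dotl;	/* page-cache backed I/O */
	break;
case CACHE_MMAP:
	inode->i_fop = &v9fs_mmap_file_operations_dotl;		/* synchronous I/O plus shared mmap */
	break;
default:
	inode->i_fop = &v9fs_file_operations_dotl;		/* plain synchronous I/O */
	break;
}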
@@ -291,8 +322,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
break;
case S_IFLNK:
if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
- "legacy protocol.\n");
+ p9_debug(P9_DEBUG_ERROR,
+ "extended modes used with legacy protocol\n");
err = -EINVAL;
goto error;
}
@@ -319,8 +350,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
break;
default:
- P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
- mode, mode & S_IFMT);
+ p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n",
+ mode, mode & S_IFMT);
err = -EINVAL;
goto error;
}
@@ -336,20 +367,21 @@ error:
*
*/
-struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
{
int err;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
+ p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
inode = new_inode(sb);
if (!inode) {
- P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
+ pr_warn("%s (%d): Problem allocating inode\n",
+ __func__, task_pid_nr(current));
return ERR_PTR(-ENOMEM);
}
- err = v9fs_init_inode(v9ses, inode, mode);
+ err = v9fs_init_inode(v9ses, inode, mode, rdev);
if (err) {
iput(inode);
return ERR_PTR(err);
@@ -419,13 +451,11 @@ void v9fs_evict_inode(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- truncate_inode_pages(inode->i_mapping, 0);
- end_writeback(inode);
+ truncate_inode_pages_final(inode->i_mapping);
+ clear_inode(inode);
filemap_fdatawrite(inode->i_mapping);
-#ifdef CONFIG_9P_FSCACHE
v9fs_cache_inode_put_cookie(inode);
-#endif
/* clunk the fid stashed in writeback_fid */
if (v9inode->writeback_fid) {
p9_client_clunk(v9inode->writeback_fid);
@@ -433,17 +463,63 @@ void v9fs_evict_inode(struct inode *inode)
}
}
+static int v9fs_test_inode(struct inode *inode, void *data)
+{
+ int umode;
+ dev_t rdev;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_wstat *st = (struct p9_wstat *)data;
+ struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+
+ umode = p9mode2unixmode(v9ses, st, &rdev);
+ /* don't match inode of different type */
+ if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+ return 0;
+
+ /* compare qid details */
+ if (memcmp(&v9inode->qid.version,
+ &st->qid.version, sizeof(v9inode->qid.version)))
+ return 0;
+
+ if (v9inode->qid.type != st->qid.type)
+ return 0;
+ return 1;
+}
+
+static int v9fs_test_new_inode(struct inode *inode, void *data)
+{
+ return 0;
+}
+
+static int v9fs_set_inode(struct inode *inode, void *data)
+{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_wstat *st = (struct p9_wstat *)data;
+
+ memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
+ return 0;
+}
+
static struct inode *v9fs_qid_iget(struct super_block *sb,
struct p9_qid *qid,
- struct p9_wstat *st)
+ struct p9_wstat *st,
+ int new)
{
- int retval, umode;
+ dev_t rdev;
+ int retval;
+ umode_t umode;
unsigned long i_ino;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
+ int (*test)(struct inode *, void *);
+
+ if (new)
+ test = v9fs_test_new_inode;
+ else
+ test = v9fs_test_inode;
i_ino = v9fs_qid2ino(qid);
- inode = iget_locked(sb, i_ino);
+ inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st);
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
@@ -453,16 +529,14 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
* FIXME!! we may need support for stale inodes
* later.
*/
- umode = p9mode2unixmode(v9ses, st->mode);
- retval = v9fs_init_inode(v9ses, inode, umode);
+ inode->i_ino = i_ino;
+ umode = p9mode2unixmode(v9ses, st, &rdev);
+ retval = v9fs_init_inode(v9ses, inode, umode, rdev);
if (retval)
goto error;
v9fs_stat2inode(st, inode, sb);
-#ifdef CONFIG_9P_FSCACHE
- v9fs_fscache_set_key(inode, &st->qid);
v9fs_cache_inode_get_cookie(inode);
-#endif
unlock_new_inode(inode);
return inode;
error:
@@ -474,7 +548,7 @@ error:
struct inode *
v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb)
+ struct super_block *sb, int new)
{
struct p9_wstat *st;
struct inode *inode = NULL;
@@ -483,47 +557,73 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
if (IS_ERR(st))
return ERR_CAST(st);
- inode = v9fs_qid_iget(sb, &st->qid, st);
+ inode = v9fs_qid_iget(sb, &st->qid, st, new);
p9stat_free(st);
kfree(st);
return inode;
}
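Switching from iget_locked() to iget5_locked() lets the client refuse to reuse an in-core inode whose file type or qid no longer matches what the server reports, which matters when a name was unlinked and recreated behind the client's back. Passing v9fs_test_new_inode (which never matches) forces a brand-new inode on the create path. The idiom, in brief (a sketch of the call made above):

/* reuse an inode only if test() accepts it; otherwise allocate a fresh one */
inode = iget5_locked(sb, v9fs_qid2ino(qid),
		     new ? v9fs_test_new_inode : v9fs_test_inode,
		     v9fs_set_inode, st);
if (!inode)
	return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
	return inode;		/* matched an existing in-core inode */
/* I_NEW: initialise from st, then unlock_new_inode(inode) */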
/**
+ * v9fs_at_to_dotl_flags- convert Linux specific AT flags to
+ * plan 9 AT flag.
+ * @flags: flags to convert
+ */
+static int v9fs_at_to_dotl_flags(int flags)
+{
+ int rflags = 0;
+ if (flags & AT_REMOVEDIR)
+ rflags |= P9_DOTL_AT_REMOVEDIR;
+ return rflags;
+}
+
+/**
* v9fs_remove - helper function to remove files and directories
* @dir: directory inode that is being deleted
- * @file: dentry that is being deleted
- * @rmdir: removing a directory
+ * @dentry: dentry that is being deleted
+ * @flags: removing a directory
*
*/
-static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
+static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
{
- int retval;
- struct p9_fid *v9fid;
- struct inode *file_inode;
-
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
- rmdir);
+ struct inode *inode;
+ int retval = -EOPNOTSUPP;
+ struct p9_fid *v9fid, *dfid;
+ struct v9fs_session_info *v9ses;
- file_inode = file->d_inode;
- v9fid = v9fs_fid_clone(file);
- if (IS_ERR(v9fid))
- return PTR_ERR(v9fid);
+ p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
+ dir, dentry, flags);
- retval = p9_client_remove(v9fid);
+ v9ses = v9fs_inode2v9ses(dir);
+ inode = dentry->d_inode;
+ dfid = v9fs_fid_lookup(dentry->d_parent);
+ if (IS_ERR(dfid)) {
+ retval = PTR_ERR(dfid);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
+ return retval;
+ }
+ if (v9fs_proto_dotl(v9ses))
+ retval = p9_client_unlinkat(dfid, dentry->d_name.name,
+ v9fs_at_to_dotl_flags(flags));
+ if (retval == -EOPNOTSUPP) {
+ /* Try the one based on path */
+ v9fid = v9fs_fid_clone(dentry);
+ if (IS_ERR(v9fid))
+ return PTR_ERR(v9fid);
+ retval = p9_client_remove(v9fid);
+ }
if (!retval) {
/*
* directories on unlink should have zero
* link count
*/
- if (rmdir) {
- clear_nlink(file_inode);
+ if (flags & AT_REMOVEDIR) {
+ clear_nlink(inode);
drop_nlink(dir);
} else
- drop_nlink(file_inode);
+ drop_nlink(inode);
- v9fs_invalidate_inode_attr(file_inode);
+ v9fs_invalidate_inode_attr(inode);
v9fs_invalidate_inode_attr(dir);
}
return retval;
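v9fs_remove() now prefers the 9P2000.L TUNLINKAT operation and falls back to the old fid-clone plus TREMOVE when the server answers -EOPNOTSUPP; the same try-new-then-fall-back pattern shows up again below for renameat versus rename. A condensed sketch of the flow (error handling elided):

retval = -EOPNOTSUPP;
if (v9fs_proto_dotl(v9ses))
	retval = p9_client_unlinkat(dfid, dentry->d_name.name,
				    v9fs_at_to_dotl_flags(flags));
if (retval == -EOPNOTSUPP) {
	struct p9_fid *v9fid = v9fs_fid_clone(dentry);	/* legacy: remove by fid */
	if (!IS_ERR(v9fid))
		retval = p9_client_remove(v9fid);
}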
@@ -548,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
struct p9_fid *dfid, *ofid, *fid;
struct inode *inode;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
err = 0;
ofid = NULL;
@@ -557,7 +657,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
return ERR_PTR(err);
}
@@ -565,39 +665,40 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
ofid = p9_client_walk(dfid, 0, NULL, 1);
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
return ERR_PTR(err);
}
err = p9_client_fcreate(ofid, name, perm, mode, extension);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
goto error;
}
- /* now walk from the parent so we can get unopened fid */
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- fid = NULL;
- goto error;
- }
-
- /* instantiate inode and assign the unopened fid to the dentry */
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
- goto error;
+ if (!(perm & P9_DMLINK)) {
+ /* now walk from the parent so we can get unopened fid */
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ p9_debug(P9_DEBUG_VFS,
+ "p9_client_walk failed %d\n", err);
+ fid = NULL;
+ goto error;
+ }
+ /*
+ * instantiate inode and assign the unopened fid to the dentry
+ */
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ p9_debug(P9_DEBUG_VFS,
+ "inode creation failed %d\n", err);
+ goto error;
+ }
+ v9fs_fid_add(dentry, fid);
+ d_instantiate(dentry, inode);
}
- d_instantiate(dentry, inode);
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
-
return ofid;
-
error:
if (ofid)
p9_client_clunk(ofid);
@@ -609,88 +710,34 @@ error:
}
/**
- * v9fs_vfs_create - VFS hook to create files
+ * v9fs_vfs_create - VFS hook to create a regular file
+ *
+ * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called
+ * for mknod(2).
+ *
* @dir: directory inode that is being created
* @dentry: dentry that is being deleted
* @mode: create permissions
- * @nd: path information
*
*/
static int
-v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
+v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
- int err;
- u32 perm;
- int flags;
- struct file *filp;
- struct v9fs_inode *v9inode;
- struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *inode_fid;
-
- err = 0;
- fid = NULL;
- v9ses = v9fs_inode2v9ses(dir);
- perm = unixmode2p9mode(v9ses, mode);
- if (nd && nd->flags & LOOKUP_OPEN)
- flags = nd->intent.open.flags - 1;
- else
- flags = O_RDWR;
+ struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+ u32 perm = unixmode2p9mode(v9ses, mode);
+ struct p9_fid *fid;
- fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
- v9fs_uflags2omode(flags,
- v9fs_proto_dotu(v9ses)));
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- fid = NULL;
- goto error;
- }
+ /* P9_OEXCL? */
+ fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_ORDWR);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
- /* if we are opening a file, assign the open fid to the file */
- if (nd && nd->flags & LOOKUP_OPEN) {
- v9inode = V9FS_I(dentry->d_inode);
- mutex_lock(&v9inode->v_mutex);
- if (v9ses->cache && !v9inode->writeback_fid &&
- ((flags & O_ACCMODE) != O_RDONLY)) {
- /*
- * clone a fid and add it to writeback_fid
- * we do it during open time instead of
- * page dirty time via write_begin/page_mkwrite
- * because we want write after unlink usecase
- * to work.
- */
- inode_fid = v9fs_writeback_fid(dentry);
- if (IS_ERR(inode_fid)) {
- err = PTR_ERR(inode_fid);
- mutex_unlock(&v9inode->v_mutex);
- goto error;
- }
- v9inode->writeback_fid = (void *) inode_fid;
- }
- mutex_unlock(&v9inode->v_mutex);
- filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
- if (IS_ERR(filp)) {
- err = PTR_ERR(filp);
- goto error;
- }
-
- filp->private_data = fid;
-#ifdef CONFIG_9P_FSCACHE
- if (v9ses->cache)
- v9fs_cache_inode_set_cookie(dentry->d_inode, filp);
-#endif
- } else
- p9_client_clunk(fid);
+ p9_client_clunk(fid);
return 0;
-
-error:
- if (fid)
- p9_client_clunk(fid);
-
- return err;
}
/**
@@ -701,14 +748,14 @@ error:
*
*/
-static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int err;
u32 perm;
struct p9_fid *fid;
struct v9fs_session_info *v9ses;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
@@ -731,27 +778,25 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
* v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
* @dir: inode that is being walked from
* @dentry: dentry that is being walked to?
- * @nameidata: path data
+ * @flags: lookup flags (unused)
*
*/
struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nameidata)
+ unsigned int flags)
{
- struct super_block *sb;
+ struct dentry *res;
struct v9fs_session_info *v9ses;
struct p9_fid *dfid, *fid;
struct inode *inode;
char *name;
- int result = 0;
- P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
- dir, dentry->d_name.name, dentry, nameidata);
+ p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n",
+ dir, dentry->d_name.name, dentry, flags);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- sb = dir->i_sb;
v9ses = v9fs_inode2v9ses(dir);
/* We can walk d_parent because we hold the dir->i_mutex */
dfid = v9fs_fid_lookup(dentry->d_parent);
@@ -761,36 +806,119 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
name = (char *) dentry->d_name.name;
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
- result = PTR_ERR(fid);
- if (result == -ENOENT) {
- inode = NULL;
- goto inst_out;
+ if (fid == ERR_PTR(-ENOENT)) {
+ d_add(dentry, NULL);
+ return NULL;
}
+ return ERR_CAST(fid);
+ }
+ /*
+ * Make sure we don't use a wrong inode due to parallel
+ * unlink. For cached mode create calls request for new
+ * inode. But with cache disabled, lookup should do this.
+ */
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+ inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+ else
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+ if (IS_ERR(inode)) {
+ p9_client_clunk(fid);
+ return ERR_CAST(inode);
+ }
+ /*
+ * If we had a rename on the server and a parallel lookup
+ * for the new name, then make sure we instantiate with
+ * the new name: e.g. a lookup of a/b while, on the server, somebody
+ * moved b under k and the client did a parallel lookup of
+ * k/b.
+ */
+ res = d_materialise_unique(dentry, inode);
+ if (!res)
+ v9fs_fid_add(dentry, fid);
+ else if (!IS_ERR(res))
+ v9fs_fid_add(res, fid);
+ else
+ p9_client_clunk(fid);
+ return res;
+}
- return ERR_PTR(result);
+static int
+v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags, umode_t mode,
+ int *opened)
+{
+ int err;
+ u32 perm;
+ struct v9fs_inode *v9inode;
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid, *inode_fid;
+ struct dentry *res = NULL;
+
+ if (d_unhashed(dentry)) {
+ res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ if (res)
+ dentry = res;
}
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- result = PTR_ERR(inode);
- inode = NULL;
+ /* Only creates */
+ if (!(flags & O_CREAT) || dentry->d_inode)
+ return finish_no_open(file, res);
+
+ err = 0;
+
+ v9ses = v9fs_inode2v9ses(dir);
+ perm = unixmode2p9mode(v9ses, mode);
+ fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
+ v9fs_uflags2omode(flags,
+ v9fs_proto_dotu(v9ses)));
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ fid = NULL;
goto error;
}
- result = v9fs_fid_add(dentry, fid);
- if (result < 0)
- goto error_iput;
+ v9fs_invalidate_inode_attr(dir);
+ v9inode = V9FS_I(dentry->d_inode);
+ mutex_lock(&v9inode->v_mutex);
+ if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+ !v9inode->writeback_fid &&
+ ((flags & O_ACCMODE) != O_RDONLY)) {
+ /*
+ * clone a fid and add it to writeback_fid
+ * we do it during open time instead of
+ * page dirty time via write_begin/page_mkwrite
+ * because we want write after unlink usecase
+ * to work.
+ */
+ inode_fid = v9fs_writeback_fid(dentry);
+ if (IS_ERR(inode_fid)) {
+ err = PTR_ERR(inode_fid);
+ mutex_unlock(&v9inode->v_mutex);
+ goto error;
+ }
+ v9inode->writeback_fid = (void *) inode_fid;
+ }
+ mutex_unlock(&v9inode->v_mutex);
+ err = finish_open(file, dentry, generic_file_open, opened);
+ if (err)
+ goto error;
-inst_out:
- d_add(dentry, inode);
- return NULL;
+ file->private_data = fid;
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+ v9fs_cache_inode_set_cookie(dentry->d_inode, file);
-error_iput:
- iput(inode);
-error:
- p9_client_clunk(fid);
+ *opened |= FILE_CREATED;
+out:
+ dput(res);
+ return err;
- return ERR_PTR(result);
+error:
+ if (fid)
+ p9_client_clunk(fid);
+ goto out;
}
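v9fs_vfs_atomic_open() takes over the open-intent handling that v9fs_vfs_create() used to do through nameidata: the VFS now hands the last step of open(2) to the filesystem while the dentry may still be unhashed. Reduced to its two exits, the contract looks like this (sketch):

if (!(flags & O_CREAT) || dentry->d_inode)
	return finish_no_open(file, res);	/* decline; the VFS does a normal lookup + open */

/* otherwise create on the server, then complete the open in place */
err = finish_open(file, dentry, generic_file_open, opened);
if (!err)
	*opened |= FILE_CREATED;	/* tell the VFS a new file was created */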
/**
@@ -814,7 +942,7 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
{
- return v9fs_remove(i, d, 1);
+ return v9fs_remove(i, d, AT_REMOVEDIR);
}
/**
@@ -839,7 +967,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct p9_fid *newdirfid;
struct p9_wstat wstat;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
retval = 0;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
@@ -862,9 +990,12 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
down_write(&v9ses->rename_sem);
if (v9fs_proto_dotl(v9ses)) {
- retval = p9_client_rename(oldfid, newdirfid,
- (char *) new_dentry->d_name.name);
- if (retval != -ENOSYS)
+ retval = p9_client_renameat(olddirfid, old_dentry->d_name.name,
+ newdirfid, new_dentry->d_name.name);
+ if (retval == -EOPNOTSUPP)
+ retval = p9_client_rename(oldfid, newdirfid,
+ new_dentry->d_name.name);
+ if (retval != -EOPNOTSUPP)
goto clunk_newdir;
}
if (old_dentry->d_parent != new_dentry->d_parent) {
@@ -872,8 +1003,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* 9P .u can only handle file rename in the same directory
*/
- P9_DPRINTK(P9_DEBUG_ERROR,
- "old dir and new dir are different\n");
+ p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n");
retval = -EXDEV;
goto clunk_newdir;
}
@@ -889,11 +1019,6 @@ clunk_newdir:
clear_nlink(new_inode);
else
drop_nlink(new_inode);
- /*
- * Work around vfs rename rehash bug with
- * FS_RENAME_DOES_D_MOVE
- */
- v9fs_invalidate_inode_attr(new_inode);
}
if (S_ISDIR(old_inode->i_mode)) {
if (!new_inode)
@@ -929,13 +1054,11 @@ static int
v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- int err;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
struct p9_wstat *st;
- P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
- err = -EPERM;
+ p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
generic_fillattr(dentry->d_inode, stat);
@@ -950,7 +1073,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
return PTR_ERR(st);
v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
- generic_fillattr(dentry->d_inode, stat);
+ generic_fillattr(dentry->d_inode, stat);
p9stat_free(st);
kfree(st);
@@ -971,7 +1094,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
struct p9_fid *fid;
struct p9_wstat wstat;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
retval = inode_change_ok(dentry->d_inode, iattr);
if (retval)
return retval;
@@ -1034,13 +1157,14 @@ void
v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct super_block *sb)
{
+ umode_t mode;
char ext[32];
char tag_name[14];
unsigned int i_nlink;
struct v9fs_session_info *v9ses = sb->s_fs_info;
struct v9fs_inode *v9inode = V9FS_I(inode);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_atime.tv_sec = stat->atime;
inode->i_mtime.tv_sec = stat->mtime;
@@ -1062,38 +1186,16 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
* this even with .u extension. So check
* for non NULL stat->extension
*/
- strncpy(ext, stat->extension, sizeof(ext));
+ strlcpy(ext, stat->extension, sizeof(ext));
/* HARDLINKCOUNT %u */
sscanf(ext, "%13s %u", tag_name, &i_nlink);
if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
- inode->i_nlink = i_nlink;
+ set_nlink(inode, i_nlink);
}
}
- inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
- if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
- char type = 0;
- int major = -1;
- int minor = -1;
-
- strncpy(ext, stat->extension, sizeof(ext));
- sscanf(ext, "%c %u %u", &type, &major, &minor);
- switch (type) {
- case 'c':
- inode->i_mode &= ~S_IFBLK;
- inode->i_mode |= S_IFCHR;
- break;
- case 'b':
- break;
- default:
- P9_DPRINTK(P9_DEBUG_ERROR,
- "Unknown special type %c %s\n", type,
- stat->extension);
- };
- inode->i_rdev = MKDEV(major, minor);
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
- } else
- inode->i_rdev = 0;
-
+ mode = p9mode2perm(v9ses, stat);
+ mode |= inode->i_mode & ~S_IALLUGO;
+ inode->i_mode = mode;
i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
@@ -1137,7 +1239,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
struct p9_fid *fid;
struct p9_wstat *st;
- P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
retval = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
fid = v9fs_fid_lookup(dentry);
@@ -1157,12 +1259,12 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
}
/* copy extension buffer into buffer */
- strncpy(buffer, st->extension, buflen);
+ retval = min(strlen(st->extension)+1, (size_t)buflen);
+ memcpy(buffer, st->extension, retval);
- P9_DPRINTK(P9_DEBUG_VFS,
- "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
+ p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n",
+ dentry->d_name.name, st->extension, buflen, buffer);
- retval = strnlen(buffer, buflen);
done:
p9stat_free(st);
kfree(st);
@@ -1181,7 +1283,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
int len = 0;
char *link = __getname();
- P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
if (!link)
link = ERR_PTR(-ENOMEM);
@@ -1212,8 +1314,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
{
char *s = nd_get_link(nd);
- P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name,
- IS_ERR(s) ? "<error>" : s);
+ p9_debug(P9_DEBUG_VFS, " %s %s\n",
+ dentry->d_name.name, IS_ERR(s) ? "<error>" : s);
if (!IS_ERR(s))
__putname(s);
}
@@ -1222,25 +1324,23 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
* v9fs_vfs_mkspecial - create a special file
* @dir: inode to create special file in
* @dentry: dentry to create
- * @mode: mode to create special file
+ * @perm: mode to create special file
* @extension: 9p2000.u format extension string representing special file
*
*/
static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
- int mode, const char *extension)
+ u32 perm, const char *extension)
{
- u32 perm;
struct p9_fid *fid;
struct v9fs_session_info *v9ses;
v9ses = v9fs_inode2v9ses(dir);
if (!v9fs_proto_dotu(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
+ p9_debug(P9_DEBUG_ERROR, "not extended\n");
return -EPERM;
}
- perm = unixmode2p9mode(v9ses, mode);
fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm,
P9_OREAD);
if (IS_ERR(fid))
@@ -1264,10 +1364,10 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
static int
v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
- P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino,
- dentry->d_name.name, symname);
+ p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+ dir->i_ino, dentry->d_name.name, symname);
- return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname);
+ return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
}
/**
@@ -1286,9 +1386,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
char *name;
struct p9_fid *oldfid;
- P9_DPRINTK(P9_DEBUG_VFS,
- " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
- old_dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+ dir->i_ino, dentry->d_name.name, old_dentry->d_name.name);
oldfid = v9fs_fid_clone(old_dentry);
if (IS_ERR(oldfid))
@@ -1322,14 +1421,16 @@ clunk_fid:
*/
static int
-v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
+v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
{
+ struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
int retval;
char *name;
+ u32 perm;
- P9_DPRINTK(P9_DEBUG_VFS,
- " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
- dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+ p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
+ dir->i_ino, dentry->d_name.name, mode,
+ MAJOR(rdev), MINOR(rdev));
if (!new_valid_dev(rdev))
return -EINVAL;
@@ -1351,7 +1452,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
return -EINVAL;
}
- retval = v9fs_vfs_mkspecial(dir, dentry, mode, name);
+ perm = unixmode2p9mode(v9ses, mode);
+ retval = v9fs_vfs_mkspecial(dir, dentry, perm, name);
__putname(name);
return retval;
@@ -1359,6 +1461,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
{
+ int umode;
+ dev_t rdev;
loff_t i_size;
struct p9_wstat *st;
struct v9fs_session_info *v9ses;
@@ -1367,6 +1471,12 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
st = p9_client_stat(fid);
if (IS_ERR(st))
return PTR_ERR(st);
+ /*
+ * Don't update inode if the file type is different
+ */
+ umode = p9mode2unixmode(v9ses, st, &rdev);
+ if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+ goto out;
spin_lock(&inode->i_lock);
/*
@@ -1375,9 +1485,10 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
*/
i_size = inode->i_size;
v9fs_stat2inode(st, inode, inode->i_sb);
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
inode->i_size = i_size;
spin_unlock(&inode->i_lock);
+out:
p9stat_free(st);
kfree(st);
return 0;
@@ -1386,6 +1497,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
static const struct inode_operations v9fs_dir_inode_operations_dotu = {
.create = v9fs_vfs_create,
.lookup = v9fs_vfs_lookup,
+ .atomic_open = v9fs_vfs_atomic_open,
.symlink = v9fs_vfs_symlink,
.link = v9fs_vfs_link,
.unlink = v9fs_vfs_unlink,
@@ -1400,6 +1512,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
static const struct inode_operations v9fs_dir_inode_operations = {
.create = v9fs_vfs_create,
.lookup = v9fs_vfs_lookup,
+ .atomic_open = v9fs_vfs_atomic_open,
.unlink = v9fs_vfs_unlink,
.mkdir = v9fs_vfs_mkdir,
.rmdir = v9fs_vfs_rmdir,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index ffbb113d5f3..1fa85aae24d 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -48,7 +48,7 @@
#include "acl.h"
static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dev_t rdev);
/**
@@ -57,7 +57,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
* group of the new file system object.
*/
-static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
{
BUG_ON(dir_inode == NULL);
@@ -68,36 +68,63 @@ static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
return current_fsgid();
}
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
+static int v9fs_test_inode_dotl(struct inode *inode, void *data)
+{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
+
+ /* don't match inode of different type */
+ if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+ return 0;
+
+ if (inode->i_generation != st->st_gen)
+ return 0;
+
+ /* compare qid details */
+ if (memcmp(&v9inode->qid.version,
+ &st->qid.version, sizeof(v9inode->qid.version)))
+ return 0;
+
+ if (v9inode->qid.type != st->qid.type)
+ return 0;
+ return 1;
+}
+
+/* Always get a new inode */
+static int v9fs_test_new_inode_dotl(struct inode *inode, void *data)
+{
+ return 0;
+}
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
+static int v9fs_set_inode_dotl(struct inode *inode, void *data)
{
- struct dentry *dentry;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
- spin_lock(&inode->i_lock);
- /* Directory should have only one entry. */
- BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
- dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
- spin_unlock(&inode->i_lock);
- return dentry;
+ memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
+ inode->i_generation = st->st_gen;
+ return 0;
}
static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
struct p9_qid *qid,
struct p9_fid *fid,
- struct p9_stat_dotl *st)
+ struct p9_stat_dotl *st,
+ int new)
{
int retval;
unsigned long i_ino;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
+ int (*test)(struct inode *, void *);
+
+ if (new)
+ test = v9fs_test_new_inode_dotl;
+ else
+ test = v9fs_test_inode_dotl;
i_ino = v9fs_qid2ino(qid);
- inode = iget_locked(sb, i_ino);
+ inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st);
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
@@ -107,15 +134,14 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
* FIXME!! we may need support for stale inodes
* later.
*/
- retval = v9fs_init_inode(v9ses, inode, st->st_mode);
+ inode->i_ino = i_ino;
+ retval = v9fs_init_inode(v9ses, inode,
+ st->st_mode, new_decode_dev(st->st_rdev));
if (retval)
goto error;
v9fs_stat2inode_dotl(st, inode);
-#ifdef CONFIG_9P_FSCACHE
- v9fs_fscache_set_key(inode, &st->qid);
v9fs_cache_inode_get_cookie(inode);
-#endif
retval = v9fs_get_acl(inode, fid);
if (retval)
goto error;
@@ -131,39 +157,95 @@ error:
struct inode *
v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb)
+ struct super_block *sb, int new)
{
struct p9_stat_dotl *st;
struct inode *inode = NULL;
- st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+ st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
if (IS_ERR(st))
return ERR_CAST(st);
- inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st);
+ inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new);
kfree(st);
return inode;
}
+struct dotl_openflag_map {
+ int open_flag;
+ int dotl_flag;
+};
+
+static int v9fs_mapped_dotl_flags(int flags)
+{
+ int i;
+ int rflags = 0;
+ struct dotl_openflag_map dotl_oflag_map[] = {
+ { O_CREAT, P9_DOTL_CREATE },
+ { O_EXCL, P9_DOTL_EXCL },
+ { O_NOCTTY, P9_DOTL_NOCTTY },
+ { O_APPEND, P9_DOTL_APPEND },
+ { O_NONBLOCK, P9_DOTL_NONBLOCK },
+ { O_DSYNC, P9_DOTL_DSYNC },
+ { FASYNC, P9_DOTL_FASYNC },
+ { O_DIRECT, P9_DOTL_DIRECT },
+ { O_LARGEFILE, P9_DOTL_LARGEFILE },
+ { O_DIRECTORY, P9_DOTL_DIRECTORY },
+ { O_NOFOLLOW, P9_DOTL_NOFOLLOW },
+ { O_NOATIME, P9_DOTL_NOATIME },
+ { O_CLOEXEC, P9_DOTL_CLOEXEC },
+ { O_SYNC, P9_DOTL_SYNC},
+ };
+ for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) {
+ if (flags & dotl_oflag_map[i].open_flag)
+ rflags |= dotl_oflag_map[i].dotl_flag;
+ }
+ return rflags;
+}
+
+/**
+ * v9fs_open_to_dotl_flags- convert Linux specific open flags to
+ * plan 9 open flag.
+ * @flags: flags to convert
+ */
+int v9fs_open_to_dotl_flags(int flags)
+{
+ int rflags = 0;
+
+ /*
+ * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY
+ * and P9_DOTL_NOACCESS
+ */
+ rflags |= flags & O_ACCMODE;
+ rflags |= v9fs_mapped_dotl_flags(flags);
+
+ return rflags;
+}
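v9fs_open_to_dotl_flags() passes the access-mode bits straight through (per the comment above, they share values with the 9P2000.L open modes) and table-maps everything else, so Linux-specific O_* values never leak onto the wire. For example (sketch, using the constants from the map above):

int oflags = O_WRONLY | O_APPEND | O_CLOEXEC;
int wire = v9fs_open_to_dotl_flags(oflags);
/* wire == O_WRONLY | P9_DOTL_APPEND | P9_DOTL_CLOEXEC:
 * the access mode survives via O_ACCMODE, the rest come from dotl_oflag_map */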
+
/**
* v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
* @dir: directory inode that is being created
* @dentry: dentry that is being deleted
- * @mode: create permissions
- * @nd: path information
+ * @omode: create permissions
*
*/
static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
- struct nameidata *nd)
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
+ bool excl)
+{
+ return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+}
+
+static int
+v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags, umode_t omode,
+ int *opened)
{
int err = 0;
- gid_t gid;
- int flags;
- mode_t mode;
+ kgid_t gid;
+ umode_t mode;
char *name = NULL;
- struct file *filp;
struct p9_qid qid;
struct inode *inode;
struct p9_fid *fid = NULL;
@@ -171,36 +253,40 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
struct p9_fid *dfid, *ofid, *inode_fid;
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
+ struct dentry *res = NULL;
- v9ses = v9fs_inode2v9ses(dir);
- if (nd && nd->flags & LOOKUP_OPEN)
- flags = nd->intent.open.flags - 1;
- else {
- /*
- * create call without LOOKUP_OPEN is due
- * to mknod of regular files. So use mknod
- * operation.
- */
- return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+ if (d_unhashed(dentry)) {
+ res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ if (res)
+ dentry = res;
}
+ /* Only creates */
+ if (!(flags & O_CREAT) || dentry->d_inode)
+ return finish_no_open(file, res);
+
+ v9ses = v9fs_inode2v9ses(dir);
+
name = (char *) dentry->d_name.name;
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
- "mode:0x%x\n", name, flags, omode);
+ p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
+ name, flags, omode);
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
- return err;
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ goto out;
}
/* clone a fid to use for creation */
ofid = p9_client_walk(dfid, 0, NULL, 1);
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- return err;
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ goto out;
}
gid = v9fs_get_fsgid_for_create(dir);
@@ -209,15 +295,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "Failed to get acl values in creat %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
+ err);
goto error;
}
- err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+ err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
+ mode, gid, &qid);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "p9_client_open_dotl failed in creat %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
+ err);
goto error;
}
v9fs_invalidate_inode_attr(dir);
@@ -226,27 +312,26 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
fid = NULL;
goto error;
}
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
goto error;
}
- d_instantiate(dentry, inode);
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
-
/* Now set the ACL based on the default value */
- v9fs_set_create_acl(dentry, dacl, pacl);
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
+
+ v9fs_fid_add(dentry, fid);
+ d_instantiate(dentry, inode);
v9inode = V9FS_I(inode);
mutex_lock(&v9inode->v_mutex);
- if (v9ses->cache && !v9inode->writeback_fid &&
+ if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+ !v9inode->writeback_fid &&
((flags & O_ACCMODE) != O_RDONLY)) {
/*
* clone a fid and add it to writeback_fid
@@ -259,55 +344,56 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
if (IS_ERR(inode_fid)) {
err = PTR_ERR(inode_fid);
mutex_unlock(&v9inode->v_mutex);
- goto error;
+ goto err_clunk_old_fid;
}
v9inode->writeback_fid = (void *) inode_fid;
}
mutex_unlock(&v9inode->v_mutex);
/* Since we are opening a file, assign the open fid to the file */
- filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
- if (IS_ERR(filp)) {
- p9_client_clunk(ofid);
- return PTR_ERR(filp);
- }
- filp->private_data = ofid;
-#ifdef CONFIG_9P_FSCACHE
- if (v9ses->cache)
- v9fs_cache_inode_set_cookie(inode, filp);
-#endif
- return 0;
+ err = finish_open(file, dentry, generic_file_open, opened);
+ if (err)
+ goto err_clunk_old_fid;
+ file->private_data = ofid;
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+ v9fs_cache_inode_set_cookie(inode, file);
+ *opened |= FILE_CREATED;
+out:
+ v9fs_put_acl(dacl, pacl);
+ dput(res);
+ return err;
error:
- if (ofid)
- p9_client_clunk(ofid);
if (fid)
p9_client_clunk(fid);
- return err;
+err_clunk_old_fid:
+ if (ofid)
+ p9_client_clunk(ofid);
+ goto out;
}
/**
* v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
* @dir: inode that is being unlinked
* @dentry: dentry that is being unlinked
- * @mode: mode for new directory
+ * @omode: mode for new directory
*
*/
static int v9fs_vfs_mkdir_dotl(struct inode *dir,
- struct dentry *dentry, int omode)
+ struct dentry *dentry, umode_t omode)
{
int err;
struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
- gid_t gid;
+ kgid_t gid;
char *name;
- mode_t mode;
+ umode_t mode;
struct inode *inode;
struct p9_qid qid;
struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
@@ -315,11 +401,11 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
if (dir->i_mode & S_ISGID)
omode |= S_ISGID;
- dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
dfid = NULL;
goto error;
}
@@ -329,8 +415,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "Failed to get acl values in mkdir %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n",
+ err);
goto error;
}
name = (char *) dentry->d_name.name;
@@ -338,49 +424,49 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
if (err < 0)
goto error;
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
+ fid = NULL;
+ goto error;
+ }
+
/* instantiate inode and assign the unopened fid to the dentry */
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
- fid = NULL;
- goto error;
- }
-
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
goto error;
}
+ v9fs_fid_add(dentry, fid);
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
d_instantiate(dentry, inode);
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
fid = NULL;
+ err = 0;
} else {
/*
* Not in cached mode. No need to populate
* inode with stat. We need to get an inode
* so that we can set the acl with dentry
*/
- inode = v9fs_get_inode(dir->i_sb, mode);
+ inode = v9fs_get_inode(dir->i_sb, mode, 0);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto error;
}
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
d_instantiate(dentry, inode);
}
- /* Now set the ACL based on the default value */
- v9fs_set_create_acl(dentry, dacl, pacl);
inc_nlink(dir);
v9fs_invalidate_inode_attr(dir);
error:
if (fid)
p9_client_clunk(fid);
+ v9fs_put_acl(dacl, pacl);
return err;
}
@@ -388,13 +474,11 @@ static int
v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- int err;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
struct p9_stat_dotl *st;
- P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
- err = -EPERM;
+ p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
generic_fillattr(dentry->d_inode, stat);
@@ -421,6 +505,46 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
return 0;
}
+/*
+ * Attribute flags.
+ */
+#define P9_ATTR_MODE (1 << 0)
+#define P9_ATTR_UID (1 << 1)
+#define P9_ATTR_GID (1 << 2)
+#define P9_ATTR_SIZE (1 << 3)
+#define P9_ATTR_ATIME (1 << 4)
+#define P9_ATTR_MTIME (1 << 5)
+#define P9_ATTR_CTIME (1 << 6)
+#define P9_ATTR_ATIME_SET (1 << 7)
+#define P9_ATTR_MTIME_SET (1 << 8)
+
+struct dotl_iattr_map {
+ int iattr_valid;
+ int p9_iattr_valid;
+};
+
+static int v9fs_mapped_iattr_valid(int iattr_valid)
+{
+ int i;
+ int p9_iattr_valid = 0;
+ struct dotl_iattr_map dotl_iattr_map[] = {
+ { ATTR_MODE, P9_ATTR_MODE },
+ { ATTR_UID, P9_ATTR_UID },
+ { ATTR_GID, P9_ATTR_GID },
+ { ATTR_SIZE, P9_ATTR_SIZE },
+ { ATTR_ATIME, P9_ATTR_ATIME },
+ { ATTR_MTIME, P9_ATTR_MTIME },
+ { ATTR_CTIME, P9_ATTR_CTIME },
+ { ATTR_ATIME_SET, P9_ATTR_ATIME_SET },
+ { ATTR_MTIME_SET, P9_ATTR_MTIME_SET },
+ };
+ for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) {
+ if (iattr_valid & dotl_iattr_map[i].iattr_valid)
+ p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid;
+ }
+ return p9_iattr_valid;
+}
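v9fs_mapped_iattr_valid() exists for the same reason as the open-flag map above: the kernel's ATTR_* bits are internal and not a stable wire format, so only the bits the protocol defines are translated into P9_ATTR_* before p9_client_setattr() sends them. A chmod(2), for instance, typically reaches setattr with ATTR_MODE and ATTR_CTIME set (sketch):

p9attr.valid = v9fs_mapped_iattr_valid(ATTR_MODE | ATTR_CTIME);
/* == P9_ATTR_MODE | P9_ATTR_CTIME */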
+
/**
* v9fs_vfs_setattr_dotl - set file metadata
* @dentry: file whose metadata to set
@@ -431,17 +555,17 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
{
int retval;
- struct v9fs_session_info *v9ses;
struct p9_fid *fid;
struct p9_iattr_dotl p9attr;
+ struct inode *inode = dentry->d_inode;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ p9_debug(P9_DEBUG_VFS, "\n");
- retval = inode_change_ok(dentry->d_inode, iattr);
+ retval = inode_change_ok(inode, iattr);
if (retval)
return retval;
- p9attr.valid = iattr->ia_valid;
+ p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
p9attr.mode = iattr->ia_mode;
p9attr.uid = iattr->ia_uid;
p9attr.gid = iattr->ia_gid;
@@ -451,30 +575,28 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
- retval = -EPERM;
- v9ses = v9fs_dentry2v9ses(dentry);
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
/* Write all dirty data */
- if (S_ISREG(dentry->d_inode->i_mode))
- filemap_write_and_wait(dentry->d_inode->i_mapping);
+ if (S_ISREG(inode->i_mode))
+ filemap_write_and_wait(inode->i_mapping);
retval = p9_client_setattr(fid, &p9attr);
if (retval < 0)
return retval;
if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(dentry->d_inode))
- truncate_setsize(dentry->d_inode, iattr->ia_size);
+ iattr->ia_size != i_size_read(inode))
+ truncate_setsize(inode, iattr->ia_size);
- v9fs_invalidate_inode_attr(dentry->d_inode);
- setattr_copy(dentry->d_inode, iattr);
- mark_inode_dirty(dentry->d_inode);
+ v9fs_invalidate_inode_attr(inode);
+ setattr_copy(inode, iattr);
+ mark_inode_dirty(inode);
if (iattr->ia_valid & ATTR_MODE) {
/* We also want to update ACL when we update mode bits */
- retval = v9fs_acl_chmod(dentry);
+ retval = v9fs_acl_chmod(inode, fid);
if (retval < 0)
return retval;
}
@@ -485,13 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
* v9fs_stat2inode_dotl - populate an inode structure with stat info
* @stat: stat structure
* @inode: inode to populate
- * @sb: superblock of filesystem
*
*/
void
v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
{
+ umode_t mode;
struct v9fs_inode *v9inode = V9FS_I(inode);
if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
@@ -503,12 +625,11 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
inode->i_uid = stat->st_uid;
inode->i_gid = stat->st_gid;
- inode->i_nlink = stat->st_nlink;
- inode->i_mode = stat->st_mode;
- inode->i_rdev = new_decode_dev(stat->st_rdev);
+ set_nlink(inode, stat->st_nlink);
- if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ mode = stat->st_mode & S_IALLUGO;
+ mode |= inode->i_mode & ~S_IALLUGO;
+ inode->i_mode = mode;
i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
@@ -530,7 +651,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
if (stat->st_result_mask & P9_STATS_GID)
inode->i_gid = stat->st_gid;
if (stat->st_result_mask & P9_STATS_NLINK)
- inode->i_nlink = stat->st_nlink;
+ set_nlink(inode, stat->st_nlink);
if (stat->st_result_mask & P9_STATS_MODE) {
inode->i_mode = stat->st_mode;
if ((S_ISBLK(inode->i_mode)) ||
@@ -546,7 +667,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
inode->i_blocks = stat->st_blocks;
}
if (stat->st_result_mask & P9_STATS_GEN)
- inode->i_generation = stat->st_gen;
+ inode->i_generation = stat->st_gen;
/* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
* because the inode structure does not have fields for them.
@@ -559,7 +680,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
const char *symname)
{
int err;
- gid_t gid;
+ kgid_t gid;
char *name;
struct p9_qid qid;
struct inode *inode;
@@ -568,14 +689,13 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
struct v9fs_session_info *v9ses;
name = (char *) dentry->d_name.name;
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
- dir->i_ino, name, symname);
+ p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
v9ses = v9fs_inode2v9ses(dir);
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
return err;
}
@@ -585,38 +705,37 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
goto error;
}
v9fs_invalidate_inode_attr(dir);
- if (v9ses->cache) {
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
/* Now walk from the parent so we can get an unopened fid. */
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
fid = NULL;
goto error;
}
/* instantiate inode and assign the unopened fid to dentry */
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
goto error;
}
+ v9fs_fid_add(dentry, fid);
d_instantiate(dentry, inode);
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
fid = NULL;
+ err = 0;
} else {
/* Not in cached mode. No need to populate inode with stat */
- inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+ inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto error;
@@ -644,17 +763,15 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
int err;
- char *name;
struct dentry *dir_dentry;
struct p9_fid *dfid, *oldfid;
struct v9fs_session_info *v9ses;
- P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
- dir->i_ino, old_dentry->d_name.name,
- dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
+ dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid))
return PTR_ERR(dfid);
@@ -663,12 +780,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
if (IS_ERR(oldfid))
return PTR_ERR(oldfid);
- name = (char *) dentry->d_name.name;
-
err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
if (err < 0) {
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
return err;
}
@@ -692,18 +807,18 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
* v9fs_vfs_mknod_dotl - create a special file
* @dir: inode destination for new link
* @dentry: dentry for file
- * @mode: mode for creation
+ * @omode: mode for creation
* @rdev: device associated with special file
*
*/
static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dev_t rdev)
{
int err;
- gid_t gid;
+ kgid_t gid;
char *name;
- mode_t mode;
+ umode_t mode;
struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
struct inode *inode;
@@ -711,19 +826,19 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
- P9_DPRINTK(P9_DEBUG_VFS,
- " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
- dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
+ p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
+ dir->i_ino, dentry->d_name.name, omode,
+ MAJOR(rdev), MINOR(rdev));
if (!new_valid_dev(rdev))
return -EINVAL;
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
- P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
dfid = NULL;
goto error;
}
@@ -733,8 +848,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "Failed to get acl values in mknod %d\n", err);
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n",
+ err);
goto error;
}
name = (char *) dentry->d_name.name;
@@ -744,46 +859,46 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
goto error;
v9fs_invalidate_inode_attr(dir);
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
+ fid = NULL;
+ goto error;
+ }
+
/* instantiate inode and assign the unopened fid to the dentry */
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
- err);
- fid = NULL;
- goto error;
- }
-
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
- err);
+ p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
goto error;
}
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
+ v9fs_fid_add(dentry, fid);
d_instantiate(dentry, inode);
- err = v9fs_fid_add(dentry, fid);
- if (err < 0)
- goto error;
fid = NULL;
+ err = 0;
} else {
/*
* Not in cached mode. No need to populate inode with stat.
* socket syscall returns a fd, so we need instantiate
*/
- inode = v9fs_get_inode(dir->i_sb, mode);
+ inode = v9fs_get_inode(dir->i_sb, mode, rdev);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto error;
}
+ v9fs_set_create_acl(inode, fid, dacl, pacl);
d_instantiate(dentry, inode);
}
- /* Now set the ACL based on the default value */
- v9fs_set_create_acl(dentry, dacl, pacl);
error:
if (fid)
p9_client_clunk(fid);
+ v9fs_put_acl(dacl, pacl);
return err;
}
@@ -802,7 +917,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
char *link = __getname();
char *target;
- P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
if (!link) {
link = ERR_PTR(-ENOMEM);
@@ -811,7 +926,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid)) {
__putname(link);
- link = ERR_PTR(PTR_ERR(fid));
+ link = ERR_CAST(fid);
goto ndset;
}
retval = p9_client_readlink(fid, &target);
@@ -837,6 +952,11 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
if (IS_ERR(st))
return PTR_ERR(st);
+ /*
+ * Don't update inode if the file type is different
+ */
+ if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+ goto out;
spin_lock(&inode->i_lock);
/*
@@ -845,15 +965,17 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
*/
i_size = inode->i_size;
v9fs_stat2inode_dotl(st, inode);
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
inode->i_size = i_size;
spin_unlock(&inode->i_lock);
+out:
kfree(st);
return 0;
}
const struct inode_operations v9fs_dir_inode_operations_dotl = {
.create = v9fs_vfs_create_dotl,
+ .atomic_open = v9fs_vfs_atomic_open_dotl,
.lookup = v9fs_vfs_lookup,
.link = v9fs_vfs_link_dotl,
.symlink = v9fs_vfs_symlink_dotl,
@@ -868,7 +990,7 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = {
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
.listxattr = v9fs_listxattr,
- .check_acl = v9fs_check_acl,
+ .get_acl = v9fs_iop_get_acl,
};
const struct inode_operations v9fs_file_inode_operations_dotl = {
@@ -878,7 +1000,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
.listxattr = v9fs_listxattr,
- .check_acl = v9fs_check_acl,
+ .get_acl = v9fs_iop_get_acl,
};
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
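[editor note] The setattr hunk above stops copying iattr->ia_valid straight into the 9P request and instead translates it through a small lookup table into protocol-defined P9_ATTR_* bits. A minimal standalone sketch of that table-driven translation follows; the ATTR_* values here are illustrative placeholders, not the kernel's <linux/fs.h> definitions, and map_valid() is a hypothetical name.

#include <stdio.h>

/* illustrative stand-ins for the VFS ATTR_* flags (assumption, not kernel values) */
#define ATTR_MODE (1 << 0)
#define ATTR_UID  (1 << 1)
#define ATTR_SIZE (1 << 3)

/* 9P2000.L attribute bits, mirroring the defines added in the hunk above */
#define P9_ATTR_MODE (1 << 0)
#define P9_ATTR_UID  (1 << 1)
#define P9_ATTR_SIZE (1 << 3)

static const struct { int iattr_valid; int p9_iattr_valid; } map[] = {
	{ ATTR_MODE, P9_ATTR_MODE },
	{ ATTR_UID,  P9_ATTR_UID  },
	{ ATTR_SIZE, P9_ATTR_SIZE },
};

static int map_valid(int iattr_valid)
{
	int out = 0;
	size_t i;

	/* translate each set VFS bit into the matching protocol bit */
	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		if (iattr_valid & map[i].iattr_valid)
			out |= map[i].p9_iattr_valid;
	return out;
}

int main(void)
{
	/* e.g. a chmod+truncate maps MODE|SIZE onto the corresponding 9P bits */
	printf("0x%x\n", map_valid(ATTR_MODE | ATTR_SIZE));
	return 0;
}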
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f3eed3383e4..0afd0382822 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -89,7 +89,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
- sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
+ sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
if (!v9ses->cache)
sb->s_flags |= MS_SYNCHRONOUS;
@@ -117,11 +117,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
struct inode *inode = NULL;
struct dentry *root = NULL;
struct v9fs_session_info *v9ses = NULL;
- int mode = S_IRWXUGO | S_ISVTX;
+ umode_t mode = S_IRWXUGO | S_ISVTX;
struct p9_fid *fid;
int retval = 0;
- P9_DPRINTK(P9_DEBUG_VFS, " \n");
+ p9_debug(P9_DEBUG_VFS, "\n");
v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
if (!v9ses)
@@ -137,26 +137,26 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto close_session;
}
- sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
+ sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses);
if (IS_ERR(sb)) {
retval = PTR_ERR(sb);
goto clunk_fid;
}
v9fs_fill_super(sb, v9ses, flags, data);
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
sb->s_d_op = &v9fs_cached_dentry_operations;
else
sb->s_d_op = &v9fs_dentry_operations;
- inode = v9fs_get_inode(sb, S_IFDIR | mode);
+ inode = v9fs_get_inode(sb, S_IFDIR | mode, 0);
if (IS_ERR(inode)) {
retval = PTR_ERR(inode);
goto release_sb;
}
- root = d_alloc_root(inode);
+
+ root = d_make_root(inode);
if (!root) {
- iput(inode);
retval = -ENOMEM;
goto release_sb;
}
@@ -185,23 +185,12 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
p9stat_free(st);
kfree(st);
}
- v9fs_fid_add(root, fid);
retval = v9fs_get_acl(inode, fid);
if (retval)
goto release_sb;
- /*
- * Add the root fid to session info. This is used
- * for file system sync. We want a cloned fid here
- * so that we can do a sync_filesystem after a
- * shrink_dcache_for_umount
- */
- v9ses->root_fid = v9fs_fid_clone(root);
- if (IS_ERR(v9ses->root_fid)) {
- retval = PTR_ERR(v9ses->root_fid);
- goto release_sb;
- }
+ v9fs_fid_add(root, fid);
- P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
+ p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
return dget(sb->s_root);
clunk_fid:
@@ -210,11 +199,15 @@ close_session:
v9fs_session_close(v9ses);
kfree(v9ses);
return ERR_PTR(retval);
+
release_sb:
/*
- * we will do the session_close and root dentry
- * release in the below call.
+ * we will do the session_close and root dentry release
+ * in the below call. But we need to clunk fid, because we haven't
+ * attached the fid to dentry so it won't get clunked
+ * automatically.
*/
+ p9_client_clunk(fid);
deactivate_locked_super(sb);
return ERR_PTR(retval);
}
@@ -229,15 +222,15 @@ static void v9fs_kill_super(struct super_block *s)
{
struct v9fs_session_info *v9ses = s->s_fs_info;
- P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
+ p9_debug(P9_DEBUG_VFS, " %p\n", s);
kill_anon_super(s);
- p9_client_clunk(v9ses->root_fid);
+
v9fs_session_cancel(v9ses);
v9fs_session_close(v9ses);
kfree(v9ses);
s->s_fs_info = NULL;
- P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
+ p9_debug(P9_DEBUG_VFS, "exiting kill_super\n");
}
static void
@@ -266,7 +259,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
if (v9fs_proto_dotl(v9ses)) {
res = p9_client_statfs(fid, &rs);
if (res == 0) {
- buf->f_type = V9FS_MAGIC;
+ buf->f_type = rs.type;
buf->f_bsize = rs.bsize;
buf->f_blocks = rs.blocks;
buf->f_bfree = rs.bfree;
@@ -285,19 +278,11 @@ done:
return res;
}
-static int v9fs_sync_fs(struct super_block *sb, int wait)
-{
- struct v9fs_session_info *v9ses = sb->s_fs_info;
-
- P9_DPRINTK(P9_DEBUG_VFS, "v9fs_sync_fs: super_block %p\n", sb);
- return p9_client_sync_fs(v9ses->root_fid);
-}
-
static int v9fs_drop_inode(struct inode *inode)
{
struct v9fs_session_info *v9ses;
v9ses = v9fs_inode2v9ses(inode);
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
return generic_drop_inode(inode);
/*
* in case of non cached mode always drop the
@@ -307,6 +292,53 @@ static int v9fs_drop_inode(struct inode *inode)
return 1;
}
+static int v9fs_write_inode(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ int ret;
+ struct p9_wstat wstat;
+ struct v9fs_inode *v9inode;
+ /*
+ * send an fsync request to server irrespective of
+ * wbc->sync_mode.
+ */
+ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+ v9inode = V9FS_I(inode);
+ if (!v9inode->writeback_fid)
+ return 0;
+ v9fs_blank_wstat(&wstat);
+
+ ret = p9_client_wstat(v9inode->writeback_fid, &wstat);
+ if (ret < 0) {
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ return ret;
+ }
+ return 0;
+}
+
+static int v9fs_write_inode_dotl(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ int ret;
+ struct v9fs_inode *v9inode;
+ /*
+ * send an fsync request to server irrespective of
+ * wbc->sync_mode.
+ */
+ v9inode = V9FS_I(inode);
+ p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n",
+ __func__, inode, v9inode->writeback_fid);
+ if (!v9inode->writeback_fid)
+ return 0;
+
+ ret = p9_client_fsync(v9inode->writeback_fid, 0);
+ if (ret < 0) {
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ return ret;
+ }
+ return 0;
+}
+
static const struct super_operations v9fs_super_ops = {
.alloc_inode = v9fs_alloc_inode,
.destroy_inode = v9fs_destroy_inode,
@@ -314,17 +346,18 @@ static const struct super_operations v9fs_super_ops = {
.evict_inode = v9fs_evict_inode,
.show_options = generic_show_options,
.umount_begin = v9fs_umount_begin,
+ .write_inode = v9fs_write_inode,
};
static const struct super_operations v9fs_super_ops_dotl = {
.alloc_inode = v9fs_alloc_inode,
.destroy_inode = v9fs_destroy_inode,
- .sync_fs = v9fs_sync_fs,
.statfs = v9fs_statfs,
.drop_inode = v9fs_drop_inode,
.evict_inode = v9fs_evict_inode,
.show_options = generic_show_options,
.umount_begin = v9fs_umount_begin,
+ .write_inode = v9fs_write_inode_dotl,
};
struct file_system_type v9fs_fs_type = {
@@ -332,5 +365,6 @@ struct file_system_type v9fs_fs_type = {
.mount = v9fs_mount,
.kill_sb = v9fs_kill_super,
.owner = THIS_MODULE,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
};
+MODULE_ALIAS_FS("9p");
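[editor note] Several hunks in this file replace boolean-style tests like "if (v9ses->cache)" with explicit comparisons against CACHE_LOOSE and CACHE_FSCACHE. A short standalone sketch of why that matters once the cache field holds more than two states; the enum values below are assumptions for illustration, not the kernel's definitions.

#include <stdio.h>

/* hypothetical cache modes: a non-zero mode need not imply loose page caching */
enum cache_mode { CACHE_NONE = 0, CACHE_MMAP = 1, CACHE_LOOSE = 2, CACHE_FSCACHE = 3 };

static int uses_loose_cache(enum cache_mode cache)
{
	/* the old "if (cache)" test would wrongly be true for CACHE_MMAP */
	return cache == CACHE_LOOSE || cache == CACHE_FSCACHE;
}

int main(void)
{
	printf("mmap: %d, loose: %d\n",
	       uses_loose_cache(CACHE_MMAP), uses_loose_cache(CACHE_LOOSE));
	return 0;
}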
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index d288773871b..f95e01e058e 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -32,8 +32,8 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
if (IS_ERR(attr_fid)) {
retval = PTR_ERR(attr_fid);
- P9_DPRINTK(P9_DEBUG_VFS,
- "p9_client_attrwalk failed %zd\n", retval);
+ p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n",
+ retval);
attr_fid = NULL;
goto error;
}
@@ -87,8 +87,8 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
{
struct p9_fid *fid;
- P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
- __func__, name, buffer_size);
+ p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
+ name, buffer_size);
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -111,27 +111,34 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
int v9fs_xattr_set(struct dentry *dentry, const char *name,
const void *value, size_t value_len, int flags)
{
+ struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
+ return v9fs_fid_xattr_set(fid, name, value, value_len, flags);
+}
+
+int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
+ const void *value, size_t value_len, int flags)
+{
u64 offset = 0;
int retval, msize, write_count;
- struct p9_fid *fid = NULL;
- P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n",
- __func__, name, value_len, flags);
+ p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
+ name, value_len, flags);
+
+ /* Clone it */
+ fid = p9_client_walk(fid, 0, NULL, 1);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
- fid = v9fs_fid_clone(dentry);
- if (IS_ERR(fid)) {
- retval = PTR_ERR(fid);
- fid = NULL;
- goto error;
- }
/*
* On success fid points to xattr
*/
retval = p9_client_xattrcreate(fid, name, value_len, flags);
if (retval < 0) {
- P9_DPRINTK(P9_DEBUG_VFS,
- "p9_client_xattrcreate failed %d\n", retval);
- goto error;
+ p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
+ retval);
+ goto err;
}
msize = fid->clnt->msize;
while (value_len) {
@@ -144,16 +151,14 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
if (write_count < 0) {
/* error in xattr write */
retval = write_count;
- goto error;
+ goto err;
}
offset += write_count;
value_len -= write_count;
}
- /* Total read xattr bytes */
- retval = offset;
-error:
- if (fid)
- retval = p9_client_clunk(fid);
+ retval = 0;
+err:
+ p9_client_clunk(fid);
return retval;
}
@@ -164,9 +169,13 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
+ &v9fs_xattr_trusted_handler,
#ifdef CONFIG_9P_FS_POSIX_ACL
&v9fs_xattr_acl_access_handler,
&v9fs_xattr_acl_default_handler,
#endif
+#ifdef CONFIG_9P_FS_SECURITY
+ &v9fs_xattr_security_handler,
+#endif
NULL
};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index eaa837c53bd..d3e2ea3840b 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -20,6 +20,8 @@
extern const struct xattr_handler *v9fs_xattr_handlers[];
extern struct xattr_handler v9fs_xattr_user_handler;
+extern struct xattr_handler v9fs_xattr_trusted_handler;
+extern struct xattr_handler v9fs_xattr_security_handler;
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
@@ -27,6 +29,8 @@ extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
void *, size_t);
extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
void *, size_t);
+extern int v9fs_fid_xattr_set(struct p9_fid *, const char *,
+ const void *, size_t, int);
extern int v9fs_xattr_set(struct dentry *, const char *,
const void *, size_t, int);
extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
new file mode 100644
index 00000000000..cb247a142a6
--- /dev/null
+++ b/fs/9p/xattr_security.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(full_name+prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+ kfree(full_name);
+ return retval;
+}
+
+static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(full_name + prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+ kfree(full_name);
+ return retval;
+}
+
+struct xattr_handler v9fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = v9fs_xattr_security_get,
+ .set = v9fs_xattr_security_set,
+};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
new file mode 100644
index 00000000000..e30d33b8a3f
--- /dev/null
+++ b/fs/9p/xattr_trusted.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(full_name+prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+ kfree(full_name);
+ return retval;
+}
+
+static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(full_name + prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+ kfree(full_name);
+ return retval;
+}
+
+struct xattr_handler v9fs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = v9fs_xattr_trusted_get,
+ .set = v9fs_xattr_trusted_set,
+};
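[editor note] Both new handlers above rebuild the full attribute name by re-prepending the namespace prefix that the VFS stripped before dispatching to the handler. A tiny userspace sketch of that concatenation; the prefix string and full_xattr_name() are illustrative stand-ins (the kernel uses XATTR_TRUSTED_PREFIX from <linux/xattr.h> and kmalloc).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TRUSTED_PREFIX "trusted."

/* rebuild "trusted.<name>" the same way the handler does with memcpy */
static char *full_xattr_name(const char *name)
{
	size_t plen = strlen(TRUSTED_PREFIX), nlen = strlen(name);
	char *full = malloc(plen + nlen + 1);

	if (!full)
		return NULL;
	memcpy(full, TRUSTED_PREFIX, plen);
	memcpy(full + plen, name, nlen);
	full[plen + nlen] = '\0';
	return full;
}

int main(void)
{
	char *full = full_xattr_name("example");

	printf("%s\n", full);	/* prints: trusted.example */
	free(full);
	return 0;
}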
diff --git a/fs/Kconfig b/fs/Kconfig
index f3aa9b08b22..312393f3294 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,10 @@
menu "File systems"
+# Use unaligned word dcache accesses
+config DCACHE_WORD_ACCESS
+ bool
+
if BLOCK
source "fs/ext2/Kconfig"
@@ -24,8 +28,8 @@ config FS_MBCACHE
tristate
default y if EXT2_FS=y && EXT2_FS_XATTR
default y if EXT3_FS=y && EXT3_FS_XATTR
- default y if EXT4_FS=y && EXT4_FS_XATTR
- default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+ default y if EXT4_FS=y
+ default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
source "fs/reiserfs/Kconfig"
source "fs/jfs/Kconfig"
@@ -47,7 +51,7 @@ config FS_POSIX_ACL
def_bool n
config EXPORTFS
- bool
+ tristate
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
@@ -64,20 +68,6 @@ source "fs/quota/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
-config CUSE
- tristate "Character device in Userspace support"
- depends on FUSE_FS
- help
- This FUSE extension allows character devices to be
- implemented in userspace.
-
- If you want to develop or use userspace character device
- based on CUSE, answer Y or M.
-
-config GENERIC_ACL
- bool
- select FS_POSIX_ACL
-
menu "Caches"
source "fs/fscache/Kconfig"
@@ -106,10 +96,11 @@ endif # BLOCK
menu "Pseudo filesystems"
source "fs/proc/Kconfig"
+source "fs/kernfs/Kconfig"
source "fs/sysfs/Kconfig"
config TMPFS
- bool "Virtual memory file system support (former shm fs)"
+ bool "Tmpfs virtual memory file system support (former shm fs)"
depends on SHMEM
help
Tmpfs is a file system which keeps all files in virtual memory.
@@ -124,15 +115,39 @@ config TMPFS
config TMPFS_POSIX_ACL
bool "Tmpfs POSIX Access Control Lists"
depends on TMPFS
- select GENERIC_ACL
+ select TMPFS_XATTR
+ select FS_POSIX_ACL
help
- POSIX Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
+ POSIX Access Control Lists (ACLs) support additional access rights
+ for users and groups beyond the standard owner/group/world scheme,
+ and this option selects support for ACLs specifically for tmpfs
+ filesystems.
+
+ If you've selected TMPFS, it's possible that you'll also need
+ this option as there are a number of Linux distros that require
+ POSIX ACL support under /dev for certain features to work properly.
+ For example, some distros need this feature for ALSA-related /dev
+ files for sound to work properly. In short, if you're not sure,
+ say Y.
To learn more about Access Control Lists, visit the POSIX ACLs for
Linux website <http://acl.bestbits.at/>.
- If you don't know what Access Control Lists are, say N.
+config TMPFS_XATTR
+ bool "Tmpfs extended attributes"
+ depends on TMPFS
+ default n
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ Currently this enables support for the trusted.* and
+ security.* namespaces.
+
+ You need this for POSIX ACL support on tmpfs.
+
+ If unsure, say N.
config HUGETLBFS
bool "HugeTLB file system support"
@@ -186,14 +201,19 @@ source "fs/minix/Kconfig"
source "fs/omfs/Kconfig"
source "fs/hpfs/Kconfig"
source "fs/qnx4/Kconfig"
+source "fs/qnx6/Kconfig"
source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
+source "fs/f2fs/Kconfig"
+source "fs/efivarfs/Kconfig"
endif # MISC_FILESYSTEMS
+source "fs/exofs/Kconfig.ore"
+
menuconfig NETWORK_FILESYSTEMS
bool "Network File Systems"
default y
@@ -242,14 +262,6 @@ source "fs/9p/Kconfig"
endif # NETWORK_FILESYSTEMS
-if BLOCK
-menu "Partition Types"
-
-source "fs/partitions/Kconfig"
-
-endmenu
-endif
-
source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 79e2ca7973b..370b24cee4d 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,10 +27,13 @@ config COMPAT_BINFMT_ELF
bool
depends on COMPAT && BINFMT_ELF
+config ARCH_BINFMT_ELF_RANDOMIZE_PIE
+ bool
+
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y
- depends on (FRV || BLACKFIN || (SUPERH32 && !MMU))
+ depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
help
ELF FDPIC binaries are based on ELF, but allow the individual load
segments of a binary to be located in memory independently of each
@@ -62,6 +65,20 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
This config option changes the default setting of coredump_filter
seen at boot time. If unsure, say Y.
+config BINFMT_SCRIPT
+ tristate "Kernel support for scripts starting with #!"
+ default y
+ help
+ Say Y here if you want to execute interpreted scripts starting with
+ #! followed by the path to an interpreter.
+
+ You can build this support as a module; however, until that module
+ gets loaded, you cannot run scripts. Thus, if you want to load this
+ module from an initramfs, the portion of the initramfs before loading
+ this module must consist of compiled binaries only.
+
+ Most systems will not boot if you say M or N here. If unsure, say Y.
+
config BINFMT_FLAT
bool "Kernel support for flat binaries"
depends on !MMU && (!FRV || BROKEN)
@@ -161,3 +178,11 @@ config BINFMT_MISC
You may say M here for module support and later load the module when
you have use for it; the module is called binfmt_misc. If you
don't know what to answer at this point, say Y.
+
+config COREDUMP
+ bool "Enable core dump support" if EXPERT
+ default y
+ help
+ This option enables support for performing core dumps. You almost
+ certainly want to say Y here. Not necessary on systems that never
+ need debugging or only ever run flawless code.
diff --git a/fs/Makefile b/fs/Makefile
index fb68c2b8cf8..4030cbfbc9a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,19 +7,20 @@
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
- ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
+ ioctl.o readdir.o select.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
- pnode.o drop_caches.o splice.o sync.o utimes.o \
+ pnode.o splice.o sync.o utimes.o \
stack.o fs_struct.o statfs.o
ifeq ($(CONFIG_BLOCK),y)
-obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+obj-y += buffer.o block_dev.o direct-io.o mpage.o
else
obj-y += no-block.o
endif
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
+obj-$(CONFIG_PROC_FS) += proc_namespace.o
+
obj-y += notify/
obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_ANON_INODES) += anon_inodes.o
@@ -29,14 +30,10 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
-obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
-
-# binfmt_script is always there
-obj-y += binfmt_script.o
-
+obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
@@ -44,16 +41,17 @@ obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
obj-$(CONFIG_FS_MBCACHE) += mbcache.o
-obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
+obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
-obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
+obj-$(CONFIG_COREDUMP) += coredump.o
+obj-$(CONFIG_SYSCTL) += drop_caches.o
obj-$(CONFIG_FHANDLE) += fhandle.o
obj-y += quota/
obj-$(CONFIG_PROC_FS) += proc/
-obj-y += partitions/
+obj-$(CONFIG_KERNFS) += kernfs/
obj-$(CONFIG_SYSFS) += sysfs/
obj-$(CONFIG_CONFIGFS_FS) += configfs/
obj-y += devpts/
@@ -102,6 +100,7 @@ obj-$(CONFIG_UBIFS_FS) += ubifs/
obj-$(CONFIG_AFFS_FS) += affs/
obj-$(CONFIG_ROMFS_FS) += romfs/
obj-$(CONFIG_QNX4FS_FS) += qnx4/
+obj-$(CONFIG_QNX6FS_FS) += qnx6/
obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
@@ -121,6 +120,8 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
-obj-$(CONFIG_EXOFS_FS) += exofs/
+obj-$(CONFIG_F2FS_FS) += f2fs/
+obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
+obj-$(CONFIG_EFIVAR_FS) += efivarfs/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a7460..c5a7787dd5e 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,6 @@
config ADFS_FS
- tristate "ADFS file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "ADFS file system support"
+ depends on BLOCK
help
The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 718ac1f440c..c770337c4b4 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -43,11 +43,14 @@ struct adfs_dir_ops;
* ADFS file system superblock data in memory
*/
struct adfs_sb_info {
- struct adfs_discmap *s_map; /* bh list containing map */
- struct adfs_dir_ops *s_dir; /* directory operations */
-
- uid_t s_uid; /* owner uid */
- gid_t s_gid; /* owner gid */
+ union { struct {
+ struct adfs_discmap *s_map; /* bh list containing map */
+ struct adfs_dir_ops *s_dir; /* directory operations */
+ };
+ struct rcu_head rcu; /* used only at shutdown time */
+ };
+ kuid_t s_uid; /* owner uid */
+ kgid_t s_gid; /* owner gid */
umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
umode_t s_other_mask; /* ADFS other perm -> unix perm */
int s_ftsuffix; /* ,xyz hex filetype suffix option */
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 3d83075aaa2..0d138c0de29 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -17,47 +17,43 @@
static DEFINE_RWLOCK(adfs_dir_lock);
static int
-adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+adfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct object_info obj;
struct adfs_dir dir;
int ret = 0;
- if (filp->f_pos >> 32)
- goto out;
+ if (ctx->pos >> 32)
+ return 0;
ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
if (ret)
- goto out;
+ return ret;
- switch ((unsigned long)filp->f_pos) {
- case 0:
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
goto free_out;
- filp->f_pos += 1;
-
- case 1:
- if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0)
+ ctx->pos = 1;
+ }
+ if (ctx->pos == 1) {
+ if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
goto free_out;
- filp->f_pos += 1;
-
- default:
- break;
+ ctx->pos = 2;
}
read_lock(&adfs_dir_lock);
- ret = ops->setpos(&dir, filp->f_pos - 2);
+ ret = ops->setpos(&dir, ctx->pos - 2);
if (ret)
goto unlock_out;
while (ops->getnext(&dir, &obj) == 0) {
- if (filldir(dirent, obj.name, obj.name_len,
- filp->f_pos, obj.file_id, DT_UNKNOWN) < 0)
- goto unlock_out;
- filp->f_pos += 1;
+ if (!dir_emit(ctx, obj.name, obj.name_len,
+ obj.file_id, DT_UNKNOWN))
+ break;
+ ctx->pos++;
}
unlock_out:
@@ -65,8 +61,6 @@ unlock_out:
free_out:
ops->free(&dir);
-
-out:
return ret;
}
@@ -192,13 +186,12 @@ out:
const struct file_operations adfs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = adfs_readdir,
+ .iterate = adfs_readdir,
.fsync = generic_file_fsync,
};
static int
-adfs_hash(const struct dentry *parent, const struct inode *inode,
- struct qstr *qstr)
+adfs_hash(const struct dentry *parent, struct qstr *qstr)
{
const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
const unsigned char *name;
@@ -234,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode,
* requirements of the underlying filesystem.
*/
static int
-adfs_compare(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+adfs_compare(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
int i;
@@ -266,7 +258,7 @@ const struct dentry_operations adfs_dentry_operations = {
};
static struct dentry *
-adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct inode *inode = NULL;
struct object_info obj;
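[editor note] The adfs_readdir hunk above is part of the tree-wide switch from the filldir callback to the dir_context/dir_emit interface: the position lives in ctx->pos and emission stops as soon as dir_emit returns false. A reduced standalone model of that control flow follows; the struct, dir_emit(), and iterate() here are simplified stand-ins, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

struct dir_context { long pos; int emitted; };

/* stand-in for dir_emit(): pretend the user buffer fills after 3 entries */
static bool dir_emit(struct dir_context *ctx, const char *name)
{
	if (ctx->emitted >= 3)
		return false;
	printf("pos %ld: %s\n", ctx->pos, name);
	ctx->emitted++;
	return true;
}

static void iterate(struct dir_context *ctx, const char *const *names, int n)
{
	for (; ctx->pos < n; ctx->pos++)
		if (!dir_emit(ctx, names[ctx->pos]))
			break;	/* stop here; ctx->pos lets the next call resume */
}

int main(void)
{
	const char *names[] = { ".", "..", "a", "b", "c" };
	struct dir_context ctx = { 0, 0 };

	iterate(&ctx, names, 5);	/* emits the first three entries */
	ctx.emitted = 0;
	iterate(&ctx, names, 5);	/* resumes at pos 3 and emits the rest */
	return 0;
}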
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index a36da5382b4..07c9edce5aa 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -23,12 +23,12 @@
const struct file_operations adfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 92444e94f84..b9acadafa4a 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, adfs_get_block);
}
+static void adfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, inode->i_size);
+}
+
static int adfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ adfs_write_failed(mapping, pos + len);
return ret;
}
@@ -72,7 +77,6 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
static const struct address_space_operations adfs_aops = {
.readpage = adfs_readpage,
.writepage = adfs_writepage,
- .sync_page = block_sync_page,
.write_begin = adfs_write_begin,
.write_end = generic_write_end,
.bmap = _adfs_bmap
@@ -248,7 +252,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
inode->i_gid = ADFS_SB(sb)->s_gid;
inode->i_ino = obj->file_id;
inode->i_size = obj->size;
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >>
sb->s_blocksize_bits;
@@ -305,8 +309,8 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
* we can't change the UID or GID of any file -
* we have a global UID/GID in the superblock
*/
- if ((ia_valid & ATTR_UID && attr->ia_uid != ADFS_SB(sb)->s_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != ADFS_SB(sb)->s_gid))
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, ADFS_SB(sb)->s_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, ADFS_SB(sb)->s_gid)))
error = -EPERM;
if (error)
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index d1a5932bb0f..6935f05202a 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -51,7 +51,7 @@ static DEFINE_RWLOCK(adfs_map_lock);
/*
* This is fun. We need to load up to 19 bits from the map at an
- * arbitary bit alignment. (We're limited to 19 bits by F+ version 2).
+ * arbitrary bit alignment. (We're limited to 19 bits by F+ version 2).
*/
#define GET_FRAG_ID(_map,_start,_idmask) \
({ \
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index c8bf36a1996..9852bdf34d7 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/statfs.h>
+#include <linux/user_namespace.h>
#include "adfs.h"
#include "dir_f.h"
#include "dir_fplus.h"
@@ -122,18 +123,17 @@ static void adfs_put_super(struct super_block *sb)
for (i = 0; i < asb->s_map_size; i++)
brelse(asb->s_map[i].dm_bh);
kfree(asb->s_map);
- kfree(asb);
- sb->s_fs_info = NULL;
+ kfree_rcu(asb, rcu);
}
-static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int adfs_show_options(struct seq_file *seq, struct dentry *root)
{
- struct adfs_sb_info *asb = ADFS_SB(mnt->mnt_sb);
+ struct adfs_sb_info *asb = ADFS_SB(root->d_sb);
- if (asb->s_uid != 0)
- seq_printf(seq, ",uid=%u", asb->s_uid);
- if (asb->s_gid != 0)
- seq_printf(seq, ",gid=%u", asb->s_gid);
+ if (!uid_eq(asb->s_uid, GLOBAL_ROOT_UID))
+ seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, asb->s_uid));
+ if (!gid_eq(asb->s_gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, asb->s_gid));
if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK)
seq_printf(seq, ",ownmask=%o", asb->s_owner_mask);
if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK)
@@ -175,12 +175,16 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_uid:
if (match_int(args, &option))
return -EINVAL;
- asb->s_uid = option;
+ asb->s_uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(asb->s_uid))
+ return -EINVAL;
break;
case Opt_gid:
if (match_int(args, &option))
return -EINVAL;
- asb->s_gid = option;
+ asb->s_gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(asb->s_gid))
+ return -EINVAL;
break;
case Opt_ownmask:
if (match_octal(args, &option))
@@ -208,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options)
static int adfs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_NODIRATIME;
return parse_options(sb, data);
}
@@ -246,7 +251,6 @@ static struct inode *adfs_alloc_inode(struct super_block *sb)
static void adfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
}
@@ -262,7 +266,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
sizeof(struct adfs_inode_info),
@@ -276,6 +280,11 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(adfs_inode_cachep);
}
@@ -370,8 +379,8 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = asb;
/* set default options */
- asb->s_uid = 0;
- asb->s_gid = 0;
+ asb->s_uid = GLOBAL_ROOT_UID;
+ asb->s_gid = GLOBAL_ROOT_GID;
asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
asb->s_ftsuffix = 0;
@@ -483,10 +492,9 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_d_op = &adfs_dentry_operations;
root = adfs_iget(sb, &root_obj);
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
int i;
- iput(root);
for (i = 0; i < asb->s_map_size; i++)
brelse(asb->s_map[i].dm_bh);
kfree(asb->s_map);
@@ -516,6 +524,7 @@ static struct file_system_type adfs_fs_type = {
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("adfs");
static int __init init_adfs_fs(void)
{
diff --git a/fs/affs/Changes b/fs/affs/Changes
index a29409c1ffe..b41c2c9792f 100644
--- a/fs/affs/Changes
+++ b/fs/affs/Changes
@@ -91,7 +91,7 @@ more 2.4 fixes: [Roman Zippel]
Version 3.11
------------
-- Converted to use 2.3.x page cache [Dave Jones <dave@powertweak.com>]
+- Converted to use 2.3.x page cache [Dave Jones]
- Corruption in truncate() bugfix [Ken Tyler <kent@werple.net.au>]
Version 3.10
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index cfad9afb476..a04d9e848d0 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -1,6 +1,6 @@
config AFFS_FS
- tristate "Amiga FFS file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "Amiga FFS file system support"
+ depends on BLOCK
help
The Fast File System (FFS) is the common file system used on hard
disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 0e95f73a702..9bca8815972 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -1,16 +1,15 @@
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/amigaffs.h>
#include <linux/mutex.h>
-
-/* AmigaOS allows file names with up to 30 characters length.
- * Names longer than that will be silently truncated. If you
- * want to disallow this, comment out the following #define.
- * Creating filesystem objects with longer names will then
- * result in an error (ENAMETOOLONG).
- */
-/*#define AFFS_NO_TRUNCATE */
+#include <linux/workqueue.h>
/* Ugly macros make the code more pretty. */
@@ -18,14 +17,6 @@
#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
-#ifdef __LITTLE_ENDIAN
-#define BO_EXBITS 0x18UL
-#elif defined(__BIG_ENDIAN)
-#define BO_EXBITS 0x00UL
-#else
-#error Endianness must be known for affs to work.
-#endif
-
#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data)
#define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail)))
#define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data)
@@ -35,7 +26,6 @@
#define AFFS_CACHE_SIZE PAGE_SIZE
-#define AFFS_MAX_PREALLOC 32
#define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2)
#define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
#define AFFS_AC_MASK (AFFS_AC_SIZE-1)
@@ -95,8 +85,8 @@ struct affs_sb_info {
u32 s_root_block; /* FFS root block number. */
int s_hashsize; /* Size of hash table. */
unsigned long s_flags; /* See below. */
- uid_t s_uid; /* uid to override */
- gid_t s_gid; /* gid to override */
+ kuid_t s_uid; /* uid to override */
+ kgid_t s_gid; /* gid to override */
umode_t s_mode; /* mode to override */
struct buffer_head *s_root_bh; /* Cached root block. */
struct mutex s_bmlock; /* Protects bitmap access. */
@@ -108,6 +98,10 @@ struct affs_sb_info {
char *s_prefix; /* Prefix for volumes and assigns. */
char s_volume[32]; /* Volume prefix for absolute symlinks. */
spinlock_t symlink_lock; /* protects the previous two */
+ struct super_block *sb; /* the VFS superblock object */
+ int work_queued; /* non-zero delayed work is queued */
+ struct delayed_work sb_work; /* superblock flush delayed work */
+ spinlock_t work_lock; /* protects sb_work and work_queued */
};
#define SF_INTL 0x0001 /* International filesystem. */
@@ -121,6 +115,7 @@ struct affs_sb_info {
#define SF_OFS 0x0200 /* Old filesystem */
#define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */
#define SF_VERBOSE 0x0800 /* Talk about fs when mounting */
+#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */
/* short cut to get to the affs specific sb data */
static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
@@ -128,6 +123,8 @@ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
return sb->s_fs_info;
}
+void affs_mark_sb_dirty(struct super_block *sb);
+
/* amigaffs.c */
extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh);
@@ -136,11 +133,15 @@ extern int affs_remove_header(struct dentry *dentry);
extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh);
extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
-extern mode_t prot_to_mode(u32 prot);
+extern umode_t prot_to_mode(u32 prot);
extern void mode_to_prot(struct inode *inode);
-extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...);
-extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...);
-extern int affs_check_name(const unsigned char *name, int len);
+extern void affs_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
+extern void affs_warning(struct super_block *sb, const char *function,
+ const char *fmt, ...);
+extern bool affs_nofilenametruncate(const struct dentry *dentry);
+extern int affs_check_name(const unsigned char *name, int len,
+ bool notruncate);
extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry);
/* bitmap. c */
@@ -154,10 +155,10 @@ extern void affs_free_bitmap(struct super_block *sb);
/* namei.c */
extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len);
-extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *);
+extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int);
extern int affs_unlink(struct inode *dir, struct dentry *dentry);
-extern int affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *);
-extern int affs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool);
+extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
extern int affs_rmdir(struct inode *dir, struct dentry *dentry);
extern int affs_link(struct dentry *olddentry, struct inode *dir,
struct dentry *dentry);
@@ -182,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
void affs_free_prealloc(struct inode *inode);
extern void affs_truncate(struct inode *);
-int affs_file_fsync(struct file *, int);
+int affs_file_fsync(struct file *, loff_t, loff_t, int);
/* dir.c */
@@ -211,7 +212,7 @@ affs_set_blocksize(struct super_block *sb, int size)
static inline struct buffer_head *
affs_bread(struct super_block *sb, int block)
{
- pr_debug("affs_bread: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
return sb_bread(sb, block);
return NULL;
@@ -219,7 +220,7 @@ affs_bread(struct super_block *sb, int block)
static inline struct buffer_head *
affs_getblk(struct super_block *sb, int block)
{
- pr_debug("affs_getblk: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
return sb_getblk(sb, block);
return NULL;
@@ -228,7 +229,7 @@ static inline struct buffer_head *
affs_getzeroblk(struct super_block *sb, int block)
{
struct buffer_head *bh;
- pr_debug("affs_getzeroblk: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
bh = sb_getblk(sb, block);
lock_buffer(bh);
@@ -243,7 +244,7 @@ static inline struct buffer_head *
affs_getemptyblk(struct super_block *sb, int block)
{
struct buffer_head *bh;
- pr_debug("affs_getemptyblk: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
bh = sb_getblk(sb, block);
wait_on_buffer(bh);
@@ -256,7 +257,7 @@ static inline void
affs_brelse(struct buffer_head *bh)
{
if (bh)
- pr_debug("affs_brelse: %lld\n", (long long) bh->b_blocknr);
+ pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr);
brelse(bh);
}
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 3a4557e8325..406b29836b1 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -34,7 +34,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
ino = bh->b_blocknr;
offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]);
- pr_debug("AFFS: insert_hash(dir=%u, ino=%d)\n", (u32)dir->i_ino, ino);
+ pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino);
dir_bh = affs_bread(sb, dir->i_ino);
if (!dir_bh)
@@ -84,7 +84,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
sb = dir->i_sb;
rem_ino = rem_bh->b_blocknr;
offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]);
- pr_debug("AFFS: remove_hash(dir=%d, ino=%d, hashval=%d)\n", (u32)dir->i_ino, rem_ino, offset);
+ pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n",
+ __func__, (u32)dir->i_ino, rem_ino, offset);
bh = affs_bread(sb, dir->i_ino);
if (!bh)
@@ -122,22 +123,15 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
}
static void
-affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
+affs_fix_dcache(struct inode *inode, u32 entry_ino)
{
- struct inode *inode = dentry->d_inode;
- void *data = dentry->d_fsdata;
- struct list_head *head, *next;
-
+ struct dentry *dentry;
spin_lock(&inode->i_lock);
- head = &inode->i_dentry;
- next = head->next;
- while (next != head) {
- dentry = list_entry(next, struct dentry, d_alias);
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
if (entry_ino == (u32)(long)dentry->d_fsdata) {
- dentry->d_fsdata = data;
+ dentry->d_fsdata = (void *)inode->i_ino;
break;
}
- next = next->next;
}
spin_unlock(&inode->i_lock);
}
@@ -154,7 +148,7 @@ affs_remove_link(struct dentry *dentry)
u32 link_ino, ino;
int retval;
- pr_debug("AFFS: remove_link(key=%ld)\n", inode->i_ino);
+ pr_debug("%s(key=%ld)\n", __func__, inode->i_ino);
retval = -EIO;
bh = affs_bread(sb, inode->i_ino);
if (!bh)
@@ -177,7 +171,11 @@ affs_remove_link(struct dentry *dentry)
}
affs_lock_dir(dir);
- affs_fix_dcache(dentry, link_ino);
+ /*
+ * if there's a dentry for that block, make it
+ * refer to inode itself.
+ */
+ affs_fix_dcache(inode, link_ino);
retval = affs_remove_hash(dir, link_bh);
if (retval) {
affs_unlock_dir(dir);
@@ -215,7 +213,7 @@ affs_remove_link(struct dentry *dentry)
break;
default:
if (!AFFS_TAIL(sb, bh)->link_chain)
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
affs_free_block(sb, link_ino);
goto done;
@@ -282,7 +280,7 @@ affs_remove_header(struct dentry *dentry)
if (!inode)
goto done;
- pr_debug("AFFS: remove_header(key=%ld)\n", inode->i_ino);
+ pr_debug("%s(key=%ld)\n", __func__, inode->i_ino);
retval = -EIO;
bh = affs_bread(sb, (u32)(long)dentry->d_fsdata);
if (!bh)
@@ -316,7 +314,7 @@ affs_remove_header(struct dentry *dentry)
if (inode->i_nlink > 1)
retval = affs_remove_link(dentry);
else
- inode->i_nlink = 0;
+ clear_nlink(inode);
affs_unlock_link(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
@@ -390,10 +388,10 @@ secs_to_datestamp(time_t secs, struct affs_date *ds)
ds->ticks = cpu_to_be32(secs * 50);
}
-mode_t
+umode_t
prot_to_mode(u32 prot)
{
- int mode = 0;
+ umode_t mode = 0;
if (!(prot & FIBF_NOWRITE))
mode |= S_IWUSR;
@@ -421,7 +419,7 @@ void
mode_to_prot(struct inode *inode)
{
u32 prot = AFFS_I(inode)->i_protect;
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
if (!(mode & S_IXUSR))
prot |= FIBF_NOEXECUTE;
@@ -454,10 +452,10 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...)
vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
va_end(args);
- printk(KERN_CRIT "AFFS error (device %s): %s(): %s\n", sb->s_id,
+ pr_crit("error (device %s): %s(): %s\n", sb->s_id,
function,ErrorBuffer);
if (!(sb->s_flags & MS_RDONLY))
- printk(KERN_WARNING "AFFS: Remounting filesystem read-only\n");
+ pr_warn("Remounting filesystem read-only\n");
sb->s_flags |= MS_RDONLY;
}
@@ -470,24 +468,31 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
va_end(args);
- printk(KERN_WARNING "AFFS warning (device %s): %s(): %s\n", sb->s_id,
+ pr_warn("(device %s): %s(): %s\n", sb->s_id,
function,ErrorBuffer);
}
+bool
+affs_nofilenametruncate(const struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE;
+
+}
+
/* Check if the name is valid for a affs object. */
int
-affs_check_name(const unsigned char *name, int len)
+affs_check_name(const unsigned char *name, int len, bool notruncate)
{
int i;
- if (len > 30)
-#ifdef AFFS_NO_TRUNCATE
- return -ENAMETOOLONG;
-#else
- len = 30;
-#endif
-
+ if (len > 30) {
+ if (notruncate)
+ return -ENAMETOOLONG;
+ else
+ len = 30;
+ }
for (i = 0; i < len; i++) {
if (name[i] < ' ' || name[i] == ':'
|| (name[i] > 0x7e && name[i] < 0xa0))
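The amigaffs.c hunks (and several below) stop writing inode->i_nlink directly and go through the VFS link-count helpers instead. A minimal sketch of the intended usage, not taken from the patch; set_nlink() and clear_nlink() are the real helpers, the surrounding function is illustrative:

/* Illustrative sketch only: i_nlink is updated through the VFS helpers
 * so link-count changes stay visible to the VFS bookkeeping. */
static void nlink_example(struct inode *inode, bool hard_linked, bool failed)
{
	if (failed) {
		clear_nlink(inode);			/* was: inode->i_nlink = 0; */
		return;
	}
	set_nlink(inode, hard_linked ? 2 : 1);		/* was: inode->i_nlink = 1 or 2; */
}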
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 3e262711ae0..c8de51185c2 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -10,30 +10,6 @@
#include <linux/slab.h>
#include "affs.h"
-/* This is, of course, shamelessly stolen from fs/minix */
-
-static const int nibblemap[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 };
-
-static u32
-affs_count_free_bits(u32 blocksize, const void *data)
-{
- const u32 *map;
- u32 free;
- u32 tmp;
-
- map = data;
- free = 0;
- for (blocksize /= 4; blocksize > 0; blocksize--) {
- tmp = *map++;
- while (tmp) {
- free += nibblemap[tmp & 0xf];
- tmp >>= 4;
- }
- }
-
- return free;
-}
-
u32
affs_count_free_blocks(struct super_block *sb)
{
@@ -41,7 +17,7 @@ affs_count_free_blocks(struct super_block *sb)
u32 free;
int i;
- pr_debug("AFFS: count_free_blocks()\n");
+ pr_debug("%s()\n", __func__);
if (sb->s_flags & MS_RDONLY)
return 0;
@@ -67,7 +43,7 @@ affs_free_block(struct super_block *sb, u32 block)
u32 blk, bmap, bit, mask, tmp;
__be32 *data;
- pr_debug("AFFS: free_block(%u)\n", block);
+ pr_debug("%s(%u)\n", __func__, block);
if (block > sbi->s_partition_size)
goto err_range;
@@ -103,7 +79,7 @@ affs_free_block(struct super_block *sb, u32 block)
*(__be32 *)bh->b_data = cpu_to_be32(tmp - mask);
mark_buffer_dirty(bh);
- sb->s_dirt = 1;
+ affs_mark_sb_dirty(sb);
bm->bm_free++;
mutex_unlock(&sbi->s_bmlock);
@@ -149,7 +125,7 @@ affs_alloc_block(struct inode *inode, u32 goal)
sb = inode->i_sb;
sbi = AFFS_SB(sb);
- pr_debug("AFFS: balloc(inode=%lu,goal=%u): ", inode->i_ino, goal);
+ pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal);
if (AFFS_I(inode)->i_pa_cnt) {
pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1);
@@ -248,7 +224,7 @@ find_bit:
*(__be32 *)bh->b_data = cpu_to_be32(tmp + mask);
mark_buffer_dirty(bh);
- sb->s_dirt = 1;
+ affs_mark_sb_dirty(sb);
mutex_unlock(&sbi->s_bmlock);
@@ -278,8 +254,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
return 0;
if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) {
- printk(KERN_NOTICE "AFFS: Bitmap invalid - mounting %s read only\n",
- sb->s_id);
+ pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id);
*flags |= MS_RDONLY;
return 0;
}
@@ -292,7 +267,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
size = sbi->s_bmap_count * sizeof(*bm);
bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL);
if (!sbi->s_bitmap) {
- printk(KERN_ERR "AFFS: Bitmap allocation failed\n");
+ pr_err("Bitmap allocation failed\n");
return -ENOMEM;
}
@@ -306,18 +281,18 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
bm->bm_key = be32_to_cpu(bmap_blk[blk]);
bh = affs_bread(sb, bm->bm_key);
if (!bh) {
- printk(KERN_ERR "AFFS: Cannot read bitmap\n");
+ pr_err("Cannot read bitmap\n");
res = -EIO;
goto out;
}
if (affs_checksum_block(sb, bh)) {
- printk(KERN_WARNING "AFFS: Bitmap %u invalid - mounting %s read only.\n",
- bm->bm_key, sb->s_id);
+ pr_warn("Bitmap %u invalid - mounting %s read only.\n",
+ bm->bm_key, sb->s_id);
*flags |= MS_RDONLY;
goto out;
}
- pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key);
- bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4);
+ pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key);
+ bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4);
/* Don't try read the extension if this is the last block,
* but we also need the right bm pointer below
@@ -328,7 +303,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
affs_brelse(bmap_bh);
bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk]));
if (!bmap_bh) {
- printk(KERN_ERR "AFFS: Cannot read bitmap extension\n");
+ pr_err("Cannot read bitmap extension\n");
res = -EIO;
goto out;
}
@@ -367,7 +342,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
/* recalculate bitmap count for last block */
bm--;
- bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4);
+ bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4);
out:
affs_brelse(bh);
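The removed nibble-map counter is replaced by memweight(), the generic helper that counts set bits over a byte buffer. A minimal sketch of the equivalence as used here, where the first four bytes of a bitmap block hold the checksum and the rest is bitmap data:

#include <linux/string.h>	/* memweight() */

/* Illustrative only: count free blocks recorded in one AFFS bitmap block. */
static u32 count_free_sketch(const void *bh_data, u32 blocksize)
{
	/* skip the 4-byte checksum, count the set bits in the remainder */
	return memweight(bh_data + 4, blocksize - 4);
}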
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 8ca8f3a5559..59f07bec92a 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -15,12 +15,12 @@
#include "affs.h"
-static int affs_readdir(struct file *, void *, filldir_t);
+static int affs_readdir(struct file *, struct dir_context *);
const struct file_operations affs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = affs_readdir,
+ .iterate = affs_readdir,
.fsync = affs_file_fsync,
};
@@ -40,63 +40,48 @@ const struct inode_operations affs_dir_inode_operations = {
};
static int
-affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+affs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- struct buffer_head *dir_bh;
- struct buffer_head *fh_bh;
+ struct buffer_head *dir_bh = NULL;
+ struct buffer_head *fh_bh = NULL;
unsigned char *name;
int namelen;
u32 i;
int hash_pos;
int chain_pos;
- u32 f_pos;
u32 ino;
- int stored;
- int res;
+ int error = 0;
- pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos);
+ pr_debug("%s(ino=%lu,f_pos=%lx)\n",
+ __func__, inode->i_ino, (unsigned long)ctx->pos);
- stored = 0;
- res = -EIO;
- dir_bh = NULL;
- fh_bh = NULL;
- f_pos = filp->f_pos;
-
- if (f_pos == 0) {
- filp->private_data = (void *)0;
- if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0)
+ if (ctx->pos < 2) {
+ file->private_data = (void *)0;
+ if (!dir_emit_dots(file, ctx))
return 0;
- filp->f_pos = f_pos = 1;
- stored++;
- }
- if (f_pos == 1) {
- if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0)
- return stored;
- filp->f_pos = f_pos = 2;
- stored++;
}
affs_lock_dir(inode);
- chain_pos = (f_pos - 2) & 0xffff;
- hash_pos = (f_pos - 2) >> 16;
+ chain_pos = (ctx->pos - 2) & 0xffff;
+ hash_pos = (ctx->pos - 2) >> 16;
if (chain_pos == 0xffff) {
affs_warning(sb, "readdir", "More than 65535 entries in chain");
chain_pos = 0;
hash_pos++;
- filp->f_pos = ((hash_pos << 16) | chain_pos) + 2;
+ ctx->pos = ((hash_pos << 16) | chain_pos) + 2;
}
dir_bh = affs_bread(sb, inode->i_ino);
if (!dir_bh)
- goto readdir_out;
+ goto out_unlock_dir;
/* If the directory hasn't changed since the last call to readdir(),
* we can jump directly to where we left off.
*/
- ino = (u32)(long)filp->private_data;
- if (ino && filp->f_version == inode->i_version) {
- pr_debug("AFFS: readdir() left off=%d\n", ino);
+ ino = (u32)(long)file->private_data;
+ if (ino && file->f_version == inode->i_version) {
+ pr_debug("readdir() left off=%d\n", ino);
goto inside;
}
@@ -105,7 +90,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
affs_error(sb, "readdir","Cannot read block %d", i);
- goto readdir_out;
+ error = -EIO;
+ goto out_brelse_dir;
}
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
@@ -119,38 +105,39 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
if (!ino)
continue;
- f_pos = (hash_pos << 16) + 2;
+ ctx->pos = (hash_pos << 16) + 2;
inside:
do {
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
- affs_error(sb, "readdir","Cannot read block %d", ino);
- goto readdir_done;
+ affs_error(sb, "readdir",
+ "Cannot read block %d", ino);
+ break;
}
namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
name = AFFS_TAIL(sb, fh_bh)->name + 1;
- pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
- namelen, name, ino, hash_pos, f_pos);
- if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0)
- goto readdir_done;
- stored++;
- f_pos++;
+ pr_debug("readdir(): dir_emit(\"%.*s\", "
+ "ino=%u), hash=%d, f_pos=%x\n",
+ namelen, name, ino, hash_pos, (u32)ctx->pos);
+
+ if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
+ goto done;
+ ctx->pos++;
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
fh_bh = NULL;
} while (ino);
}
-readdir_done:
- filp->f_pos = f_pos;
- filp->f_version = inode->i_version;
- filp->private_data = (void *)(long)ino;
- res = stored;
+done:
+ file->f_version = inode->i_version;
+ file->private_data = (void *)(long)ino;
+ affs_brelse(fh_bh);
-readdir_out:
+out_brelse_dir:
affs_brelse(dir_bh);
- affs_brelse(fh_bh);
+
+out_unlock_dir:
affs_unlock_dir(inode);
- pr_debug("AFFS: readdir()=%d\n", stored);
- return res;
+ return error;
}
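The readdir rework follows the generic ->iterate() conversion: the position lives in ctx->pos instead of file->f_pos, dir_emit_dots() produces "." and "..", and dir_emit() replaces the filldir callback and its cookie. A stripped-down sketch of that shape, with the AFFS-specific hash-chain walk elided:

/* Illustrative sketch of the ->iterate() pattern, not the AFFS code itself. */
static int iterate_sketch(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);

	if (!dir_emit_dots(file, ctx))		/* emits "." and ".." as needed */
		return 0;

	/* ... walk filesystem-specific entries starting at ctx->pos ... */
	if (!dir_emit(ctx, "example", 7, inode->i_ino, DT_UNKNOWN))
		return 0;			/* caller's buffer is full, stop early */
	ctx->pos++;				/* advance only after a successful emit */

	return 0;
}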
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0a90dcd46de..a7fe57d2cd9 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -27,10 +27,10 @@ static int affs_file_release(struct inode *inode, struct file *filp);
const struct file_operations affs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = affs_file_open,
.release = affs_file_release,
@@ -39,14 +39,13 @@ const struct file_operations affs_file_operations = {
};
const struct inode_operations affs_file_inode_operations = {
- .truncate = affs_truncate,
.setattr = affs_notify_change,
};
static int
affs_file_open(struct inode *inode, struct file *filp)
{
- pr_debug("AFFS: open(%lu,%d)\n",
+ pr_debug("open(%lu,%d)\n",
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
atomic_inc(&AFFS_I(inode)->i_opencnt);
return 0;
@@ -55,7 +54,7 @@ affs_file_open(struct inode *inode, struct file *filp)
static int
affs_file_release(struct inode *inode, struct file *filp)
{
- pr_debug("AFFS: release(%lu, %d)\n",
+ pr_debug("release(%lu, %d)\n",
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
@@ -325,7 +324,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
struct buffer_head *ext_bh;
u32 ext;
- pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block);
+ pr_debug("%s(%u, %lu)\n",
+ __func__, (u32)inode->i_ino, (unsigned long)block);
BUG_ON(block > (sector_t)0x7fffffffUL);
@@ -402,6 +402,16 @@ static int affs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, affs_get_block);
}
+static void affs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, inode->i_size);
+ affs_truncate(inode);
+ }
+}
+
static int affs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -412,11 +422,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ affs_write_failed(mapping, pos + len);
return ret;
}
@@ -429,7 +436,6 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations affs_aops = {
.readpage = affs_readpage,
.writepage = affs_writepage,
- .sync_page = block_sync_page,
.write_begin = affs_write_begin,
.write_end = generic_write_end,
.bmap = _affs_bmap
@@ -493,34 +499,36 @@ affs_getemptyblk_ino(struct inode *inode, int block)
}
static int
-affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to)
+affs_do_readpage_ofs(struct page *page, unsigned to)
{
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh;
char *data;
+ unsigned pos = 0;
u32 bidx, boff, bsize;
u32 tmp;
- pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to);
- BUG_ON(from > to || to > PAGE_CACHE_SIZE);
+ pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
+ page->index, to);
+ BUG_ON(to > PAGE_CACHE_SIZE);
kmap(page);
data = page_address(page);
bsize = AFFS_SB(sb)->s_data_blksize;
- tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+ tmp = page->index << PAGE_CACHE_SHIFT;
bidx = tmp / bsize;
boff = tmp % bsize;
- while (from < to) {
+ while (pos < to) {
bh = affs_bread_ino(inode, bidx, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
- tmp = min(bsize - boff, to - from);
- BUG_ON(from + tmp > to || tmp > bsize);
- memcpy(data + from, AFFS_DATA(bh) + boff, tmp);
+ tmp = min(bsize - boff, to - pos);
+ BUG_ON(pos + tmp > to || tmp > bsize);
+ memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
affs_brelse(bh);
bidx++;
- from += tmp;
+ pos += tmp;
boff = 0;
}
flush_dcache_page(page);
@@ -537,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
u32 size, bsize;
u32 tmp;
- pr_debug("AFFS: extent_file(%u, %d)\n", (u32)inode->i_ino, newsize);
+ pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
bsize = AFFS_SB(sb)->s_data_blksize;
bh = NULL;
size = AFFS_I(inode)->mmu_private;
@@ -603,14 +611,14 @@ affs_readpage_ofs(struct file *file, struct page *page)
u32 to;
int err;
- pr_debug("AFFS: read_page(%u, %ld)\n", (u32)inode->i_ino, page->index);
+ pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
to = PAGE_CACHE_SIZE;
if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
to = inode->i_size & ~PAGE_CACHE_MASK;
memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to);
}
- err = affs_do_readpage_ofs(file, page, 0, to);
+ err = affs_do_readpage_ofs(page, to);
if (!err)
SetPageUptodate(page);
unlock_page(page);
@@ -626,7 +634,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
pgoff_t index;
int err = 0;
- pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
+ pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
+ (unsigned long long)pos, (unsigned long long)pos + len);
if (pos > AFFS_I(inode)->mmu_private) {
/* XXX: this probably leaves a too-big i_size in case of
* failure. Should really be updating i_size at write_end time
@@ -646,7 +655,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(file, page, 0, PAGE_CACHE_SIZE);
+ err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE);
if (err) {
unlock_page(page);
page_cache_release(page);
@@ -675,7 +684,9 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
* due to write_begin.
*/
- pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
+ pr_debug("%s(%u, %llu, %llu)\n",
+ __func__, (u32)inode->i_ino, (unsigned long long)pos,
+ (unsigned long long)pos + len);
bsize = AFFS_SB(sb)->s_data_blksize;
data = page_address(page);
@@ -786,7 +797,6 @@ out:
const struct address_space_operations affs_aops_ofs = {
.readpage = affs_readpage_ofs,
//.writepage = affs_writepage_ofs,
- //.sync_page = affs_sync_page_ofs,
.write_begin = affs_write_begin_ofs,
.write_end = affs_write_end_ofs
};
@@ -798,7 +808,7 @@ affs_free_prealloc(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- pr_debug("AFFS: free_prealloc(ino=%lu)\n", inode->i_ino);
+ pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino);
while (AFFS_I(inode)->i_pa_cnt) {
AFFS_I(inode)->i_pa_cnt--;
@@ -818,7 +828,7 @@ affs_truncate(struct inode *inode)
struct buffer_head *ext_bh;
int i;
- pr_debug("AFFS: truncate(inode=%d, oldsize=%u, newsize=%u)\n",
+ pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
(u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
last_blk = 0;
@@ -832,7 +842,7 @@ affs_truncate(struct inode *inode)
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
- u32 size = inode->i_size;
+ loff_t size = inode->i_size;
int res;
res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
@@ -925,14 +935,20 @@ affs_truncate(struct inode *inode)
affs_free_prealloc(inode);
}
-int affs_file_fsync(struct file *filp, int datasync)
+int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *inode = filp->f_mapping->host;
int ret, err;
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
+
+ mutex_lock(&inode->i_mutex);
ret = write_inode_now(inode, 0);
err = sync_blockdev(inode->i_sb->s_bdev);
if (!ret)
ret = err;
+ mutex_unlock(&inode->i_mutex);
return ret;
}
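The fsync change tracks the ->fsync() signature that takes a byte range: flush and wait on the range first, then serialize the inode and metadata write-out under the inode lock. A condensed sketch of that common pattern (i_mutex is the lock of this era; later kernels use i_rwsem):

/* Illustrative sketch of the ranged ->fsync() pattern. */
static int fsync_sketch(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err, ret;

	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (err)
		return err;

	mutex_lock(&inode->i_mutex);
	ret = write_inode_now(inode, 0);	/* push inode metadata */
	err = sync_blockdev(inode->i_sb->s_bdev);
	if (!ret)
		ret = err;
	mutex_unlock(&inode->i_mutex);
	return ret;
}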
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 5d828903ac6..bec2d1a0c91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -34,7 +34,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
if (!(inode->i_state & I_NEW))
return inode;
- pr_debug("AFFS: affs_iget(%lu)\n", inode->i_ino);
+ pr_debug("affs_iget(%lu)\n", inode->i_ino);
block = inode->i_ino;
bh = affs_bread(sb, block);
@@ -54,7 +54,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
prot = be32_to_cpu(tail->protect);
inode->i_size = 0;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_mode = 0;
AFFS_I(inode)->i_extcnt = 1;
AFFS_I(inode)->i_ext_last = ~1;
@@ -80,17 +80,17 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
if (id == 0 || sbi->s_flags & SF_SETUID)
inode->i_uid = sbi->s_uid;
else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
- inode->i_uid = 0;
+ i_uid_write(inode, 0);
else
- inode->i_uid = id;
+ i_uid_write(inode, id);
id = be16_to_cpu(tail->gid);
if (id == 0 || sbi->s_flags & SF_SETGID)
inode->i_gid = sbi->s_gid;
else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
- inode->i_gid = 0;
+ i_gid_write(inode, 0);
else
- inode->i_gid = id;
+ i_gid_write(inode, id);
switch (be32_to_cpu(tail->stype)) {
case ST_ROOT:
@@ -137,7 +137,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
sbi->s_hashsize + 1;
}
if (tail->link_chain)
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
inode->i_op = &affs_file_inode_operations;
inode->i_fop = &affs_file_operations;
@@ -175,7 +175,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
uid_t uid;
gid_t gid;
- pr_debug("AFFS: write_inode(%lu)\n",inode->i_ino);
+ pr_debug("write_inode(%lu)\n", inode->i_ino);
if (!inode->i_nlink)
// possibly free block
@@ -193,13 +193,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
tail->size = cpu_to_be32(inode->i_size);
secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change);
if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
- uid = inode->i_uid;
- gid = inode->i_gid;
+ uid = i_uid_read(inode);
+ gid = i_gid_read(inode);
if (AFFS_SB(sb)->s_flags & SF_MUFS) {
- if (inode->i_uid == 0 || inode->i_uid == 0xFFFF)
- uid = inode->i_uid ^ ~0;
- if (inode->i_gid == 0 || inode->i_gid == 0xFFFF)
- gid = inode->i_gid ^ ~0;
+ if (uid == 0 || uid == 0xFFFF)
+ uid = uid ^ ~0;
+ if (gid == 0 || gid == 0xFFFF)
+ gid = gid ^ ~0;
}
if (!(AFFS_SB(sb)->s_flags & SF_SETUID))
tail->uid = cpu_to_be16(uid);
@@ -220,7 +220,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
struct inode *inode = dentry->d_inode;
int error;
- pr_debug("AFFS: notify_change(%lu,0x%x)\n",inode->i_ino,attr->ia_valid);
+ pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid);
error = inode_change_ok(inode,attr);
if (error)
@@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+
+ truncate_setsize(inode, attr->ia_size);
+ affs_truncate(inode);
}
setattr_copy(inode, attr);
@@ -255,8 +258,9 @@ void
affs_evict_inode(struct inode *inode)
{
unsigned long cache_page;
- pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
- truncate_inode_pages(&inode->i_data, 0);
+ pr_debug("evict_inode(ino=%lu, nlink=%u)\n",
+ inode->i_ino, inode->i_nlink);
+ truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
@@ -264,11 +268,11 @@ affs_evict_inode(struct inode *inode)
}
invalidate_inode_buffers(inode);
- end_writeback(inode);
+ clear_inode(inode);
affs_free_prealloc(inode);
cache_page = (unsigned long)AFFS_I(inode)->i_lc;
if (cache_page) {
- pr_debug("AFFS: freeing ext cache\n");
+ pr_debug("freeing ext cache\n");
AFFS_I(inode)->i_lc = NULL;
AFFS_I(inode)->i_ac = NULL;
free_page(cache_page);
@@ -304,7 +308,7 @@ affs_new_inode(struct inode *dir)
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_ino = block;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
atomic_set(&AFFS_I(inode)->i_opencnt, 0);
AFFS_I(inode)->i_blkcnt = 0;
@@ -347,7 +351,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
u32 block = 0;
int retval;
- pr_debug("AFFS: add_entry(dir=%u, inode=%u, \"%*s\", type=%d)\n", (u32)dir->i_ino,
+ pr_debug("%s(dir=%u, inode=%u, \"%*s\", type=%d)\n",
+ __func__, (u32)dir->i_ino,
(u32)inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name, type);
retval = -EIO;
@@ -387,7 +392,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block);
affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
mark_buffer_dirty_inode(inode_bh, inode);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
ihold(inode);
}
affs_fix_checksum(sb, bh);
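The uid/gid hunks are part of the kuid_t/kgid_t conversion: raw 16-bit on-disk ids are moved in and out of the inode with i_uid_write()/i_uid_read() (and the gid equivalents), and ids given as mount options are mapped with make_kuid()/make_kgid() plus a validity check. A condensed sketch of both directions, with the MUFS special cases left out:

/* Illustrative only: raw on-disk id <-> kuid_t conversions. */
static void load_ids_sketch(struct inode *inode, u16 disk_uid, u16 disk_gid)
{
	i_uid_write(inode, disk_uid);	/* wraps the raw id into a kuid_t */
	i_gid_write(inode, disk_gid);
}

static u16 store_uid_sketch(struct inode *inode)
{
	return (u16)i_uid_read(inode);	/* back to the 16-bit on-disk field */
}

static int parse_uid_option_sketch(int option, kuid_t *uid)
{
	*uid = make_kuid(current_user_ns(), option);
	if (!uid_valid(*uid))
		return -EINVAL;		/* id not representable in this namespace */
	return 0;
}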
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd..035bd31556f 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,12 @@
typedef int (*toupper_t)(int);
static int affs_toupper(int ch);
-static int affs_hash_dentry(const struct dentry *,
- const struct inode *, struct qstr *);
-static int affs_compare_dentry(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int affs_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
static int affs_intl_toupper(int ch);
-static int affs_intl_hash_dentry(const struct dentry *,
- const struct inode *, struct qstr *);
-static int affs_intl_compare_dentry(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int affs_intl_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
const struct dentry_operations affs_dentry_operations = {
@@ -66,13 +60,13 @@ affs_get_toupper(struct super_block *sb)
* Note: the dentry argument is the parent dentry.
*/
static inline int
-__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
{
const u8 *name = qstr->name;
unsigned long hash;
int i;
- i = affs_check_name(qstr->name, qstr->len);
+ i = affs_check_name(qstr->name, qstr->len, notruncate);
if (i)
return i;
@@ -86,20 +80,24 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
}
static int
-affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_toupper);
+ return __affs_hash_dentry(qstr, affs_toupper,
+ affs_nofilenametruncate(dentry));
+
}
+
static int
-affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
- struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_intl_toupper);
+ return __affs_hash_dentry(qstr, affs_intl_toupper,
+ affs_nofilenametruncate(dentry));
+
}
static inline int __affs_compare_dentry(unsigned int len,
- const char *str, const struct qstr *name, toupper_t toupper)
+ const char *str, const struct qstr *name, toupper_t toupper,
+ bool notruncate)
{
const u8 *aname = str;
const u8 *bname = name->name;
@@ -109,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len,
* must be valid. 'name' must be validated first.
*/
- if (affs_check_name(name->name, name->len))
+ if (affs_check_name(name->name, name->len, notruncate))
return 1;
/*
@@ -131,18 +129,21 @@ static inline int __affs_compare_dentry(unsigned int len,
}
static int
-affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- return __affs_compare_dentry(len, str, name, affs_toupper);
+
+ return __affs_compare_dentry(len, str, name, affs_toupper,
+ affs_nofilenametruncate(parent));
}
+
static int
-affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- return __affs_compare_dentry(len, str, name, affs_intl_toupper);
+ return __affs_compare_dentry(len, str, name, affs_intl_toupper,
+ affs_nofilenametruncate(parent));
+
}
/*
@@ -189,7 +190,8 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
toupper_t toupper = affs_get_toupper(sb);
u32 key;
- pr_debug("AFFS: find_entry(\"%.*s\")\n", (int)dentry->d_name.len, dentry->d_name.name);
+ pr_debug("%s(\"%.*s\")\n",
+ __func__, (int)dentry->d_name.len, dentry->d_name.name);
bh = affs_bread(sb, dir->i_ino);
if (!bh)
@@ -211,13 +213,14 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
}
struct dentry *
-affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct super_block *sb = dir->i_sb;
struct buffer_head *bh;
struct inode *inode = NULL;
- pr_debug("AFFS: lookup(\"%.*s\")\n",(int)dentry->d_name.len,dentry->d_name.name);
+ pr_debug("%s(\"%.*s\")\n",
+ __func__, (int)dentry->d_name.len, dentry->d_name.name);
affs_lock_dir(dir);
bh = affs_find_entry(dir, dentry);
@@ -247,21 +250,22 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
int
affs_unlink(struct inode *dir, struct dentry *dentry)
{
- pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino,
- dentry->d_inode->i_ino,
- (int)dentry->d_name.len, dentry->d_name.name);
+ pr_debug("%s(dir=%d, %lu \"%.*s\")\n",
+ __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
+ (int)dentry->d_name.len, dentry->d_name.name);
return affs_remove_header(dentry);
}
int
-affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
int error;
- pr_debug("AFFS: create(%lu,\"%.*s\",0%o)\n",dir->i_ino,(int)dentry->d_name.len,
+ pr_debug("%s(%lu,\"%.*s\",0%ho)\n",
+ __func__, dir->i_ino, (int)dentry->d_name.len,
dentry->d_name.name,mode);
inode = affs_new_inode(dir);
@@ -277,7 +281,7 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata
inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
error = affs_add_entry(dir, inode, dentry, ST_FILE);
if (error) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
iput(inode);
return error;
}
@@ -285,13 +289,14 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata
}
int
-affs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int error;
- pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%o)\n",dir->i_ino,
- (int)dentry->d_name.len,dentry->d_name.name,mode);
+ pr_debug("%s(%lu,\"%.*s\",0%ho)\n",
+ __func__, dir->i_ino, (int)dentry->d_name.len,
+ dentry->d_name.name, mode);
inode = affs_new_inode(dir);
if (!inode)
@@ -305,7 +310,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
error = affs_add_entry(dir, inode, dentry, ST_USERDIR);
if (error) {
- inode->i_nlink = 0;
+ clear_nlink(inode);
mark_inode_dirty(inode);
iput(inode);
return error;
@@ -316,8 +321,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
int
affs_rmdir(struct inode *dir, struct dentry *dentry)
{
- pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino,
- dentry->d_inode->i_ino,
+ pr_debug("%s(dir=%u, %lu \"%.*s\")\n",
+ __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
(int)dentry->d_name.len, dentry->d_name.name);
return affs_remove_header(dentry);
@@ -333,8 +338,9 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
int i, maxlen, error;
char c, lc;
- pr_debug("AFFS: symlink(%lu,\"%.*s\" -> \"%s\")\n",dir->i_ino,
- (int)dentry->d_name.len,dentry->d_name.name,symname);
+ pr_debug("%s(%lu,\"%.*s\" -> \"%s\")\n",
+ __func__, dir->i_ino, (int)dentry->d_name.len,
+ dentry->d_name.name, symname);
maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1;
inode = affs_new_inode(dir);
@@ -392,7 +398,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
return 0;
err:
- inode->i_nlink = 0;
+ clear_nlink(inode);
mark_inode_dirty(inode);
iput(inode);
return error;
@@ -403,7 +409,8 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- pr_debug("AFFS: link(%u, %u, \"%.*s\")\n", (u32)inode->i_ino, (u32)dir->i_ino,
+ pr_debug("%s(%u, %u, \"%.*s\")\n",
+ __func__, (u32)inode->i_ino, (u32)dir->i_ino,
(int)dentry->d_name.len,dentry->d_name.name);
return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
@@ -417,11 +424,15 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh = NULL;
int retval;
- pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
- (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
- (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
+ pr_debug("%s(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
+ __func__, (u32)old_dir->i_ino, (int)old_dentry->d_name.len,
+ old_dentry->d_name.name, (u32)new_dir->i_ino,
+ (int)new_dentry->d_name.len, new_dentry->d_name.name);
+
+ retval = affs_check_name(new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ affs_nofilenametruncate(old_dentry));
- retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len);
if (retval)
return retval;
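The namei.c changes track the VFS signature change that dropped the inode arguments from ->d_hash() and ->d_compare(), and they thread the new per-mount nofilenametruncate flag into name validation. A condensed sketch of how the flag reaches affs_check_name() from a hash callback (SF_NO_TRUNCATE and affs_check_name() come from the patch; the function names here are illustrative):

/* Illustrative: how the mount flag decides truncate vs. -ENAMETOOLONG. */
static bool no_truncate_sketch(const struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE;
}

static int hash_sketch(const struct dentry *dentry, struct qstr *qstr)
{
	int err = affs_check_name(qstr->name, qstr->len,
				  no_truncate_sketch(dentry));
	if (err)
		return err;
	/* ... hash the (possibly truncated) name into qstr->hash ... */
	return 0;
}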
diff --git a/fs/affs/super.c b/fs/affs/super.c
index b31507d0f9b..51f1a95bff7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
#include <linux/magic.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/writeback.h>
#include "affs.h"
extern struct timezone sys_tz;
@@ -25,15 +26,17 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int affs_remount (struct super_block *sb, int *flags, char *data);
static void
-affs_commit_super(struct super_block *sb, int wait, int clean)
+affs_commit_super(struct super_block *sb, int wait)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
struct buffer_head *bh = sbi->s_root_bh;
struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
- tail->bm_flag = cpu_to_be32(clean);
+ lock_buffer(bh);
secs_to_datestamp(get_seconds(), &tail->disk_change);
affs_fix_checksum(sb, bh);
+ unlock_buffer(bh);
+
mark_buffer_dirty(bh);
if (wait)
sync_dirty_buffer(bh);
@@ -43,38 +46,48 @@ static void
affs_put_super(struct super_block *sb)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
- pr_debug("AFFS: put_super()\n");
+ pr_debug("%s()\n", __func__);
- if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
- affs_commit_super(sb, 1, 1);
+ cancel_delayed_work_sync(&sbi->sb_work);
+}
- kfree(sbi->s_prefix);
- affs_free_bitmap(sb);
- affs_brelse(sbi->s_root_bh);
- kfree(sbi);
- sb->s_fs_info = NULL;
+static int
+affs_sync_fs(struct super_block *sb, int wait)
+{
+ affs_commit_super(sb, wait);
+ return 0;
}
-static void
-affs_write_super(struct super_block *sb)
+static void flush_superblock(struct work_struct *work)
{
- lock_super(sb);
- if (!(sb->s_flags & MS_RDONLY))
- affs_commit_super(sb, 1, 2);
- sb->s_dirt = 0;
- unlock_super(sb);
+ struct affs_sb_info *sbi;
+ struct super_block *sb;
+
+ sbi = container_of(work, struct affs_sb_info, sb_work.work);
+ sb = sbi->sb;
+
+ spin_lock(&sbi->work_lock);
+ sbi->work_queued = 0;
+ spin_unlock(&sbi->work_lock);
- pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds());
+ affs_commit_super(sb, 1);
}
-static int
-affs_sync_fs(struct super_block *sb, int wait)
+void affs_mark_sb_dirty(struct super_block *sb)
{
- lock_super(sb);
- affs_commit_super(sb, wait, 2);
- sb->s_dirt = 0;
- unlock_super(sb);
- return 0;
+ struct affs_sb_info *sbi = AFFS_SB(sb);
+ unsigned long delay;
+
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ spin_lock(&sbi->work_lock);
+ if (!sbi->work_queued) {
+ delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+ queue_delayed_work(system_long_wq, &sbi->sb_work, delay);
+ sbi->work_queued = 1;
+ }
+ spin_unlock(&sbi->work_lock);
}
static struct kmem_cache * affs_inode_cachep;
@@ -98,7 +111,6 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
static void affs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
}
@@ -116,7 +128,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
affs_inode_cachep = kmem_cache_create("affs_inode_cache",
sizeof(struct affs_inode_info),
@@ -130,6 +142,11 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(affs_inode_cachep);
}
@@ -139,7 +156,6 @@ static const struct super_operations affs_sops = {
.write_inode = affs_write_inode,
.evict_inode = affs_evict_inode,
.put_super = affs_put_super,
- .write_super = affs_write_super,
.sync_fs = affs_sync_fs,
.statfs = affs_statfs,
.remount_fs = affs_remount,
@@ -147,7 +163,7 @@ static const struct super_operations affs_sops = {
};
enum {
- Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect,
+ Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
};
@@ -156,6 +172,7 @@ static const match_table_t tokens = {
{Opt_bs, "bs=%u"},
{Opt_mode, "mode=%o"},
{Opt_mufs, "mufs"},
+ {Opt_notruncate, "nofilenametruncate"},
{Opt_prefix, "prefix=%s"},
{Opt_protect, "protect"},
{Opt_reserved, "reserved=%u"},
@@ -172,7 +189,7 @@ static const match_table_t tokens = {
};
static int
-parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s32 *root,
+parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root,
int *blocksize, char **prefix, char *volume, unsigned long *mount_opts)
{
char *p;
@@ -203,7 +220,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
return 0;
if (n != 512 && n != 1024 && n != 2048
&& n != 4096) {
- printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
+ pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
return 0;
}
*blocksize = n;
@@ -217,6 +234,9 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
case Opt_mufs:
*mount_opts |= SF_MUFS;
break;
+ case Opt_notruncate:
+ *mount_opts |= SF_NO_TRUNCATE;
+ break;
case Opt_prefix:
*prefix = match_strdup(&args[0]);
if (!*prefix)
@@ -237,13 +257,17 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
case Opt_setgid:
if (match_int(&args[0], &option))
return 0;
- *gid = option;
+ *gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(*gid))
+ return 0;
*mount_opts |= SF_SETGID;
break;
case Opt_setuid:
if (match_int(&args[0], &option))
return 0;
- *uid = option;
+ *uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(*uid))
+ return 0;
*mount_opts |= SF_SETUID;
break;
case Opt_verbose:
@@ -261,8 +285,8 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
/* Silently ignore the quota options */
break;
default:
- printk("AFFS: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ pr_warn("Unrecognized mount option \"%s\" or missing value\n",
+ p);
return 0;
}
}
@@ -285,17 +309,17 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
int num_bm;
int i, j;
s32 key;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
int reserved;
unsigned long mount_flags;
int tmp_flags; /* fix remount prototype... */
u8 sig[4];
- int ret = -EINVAL;
+ int ret;
save_mount_options(sb, data);
- pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
+ pr_debug("read_super(%s)\n", data ? (const char *)data : "no options");
sb->s_magic = AFFS_SUPER_MAGIC;
sb->s_op = &affs_sops;
@@ -306,15 +330,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
return -ENOMEM;
sb->s_fs_info = sbi;
+ sbi->sb = sb;
mutex_init(&sbi->s_bmlock);
spin_lock_init(&sbi->symlink_lock);
+ spin_lock_init(&sbi->work_lock);
+ INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock);
if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
&blocksize,&sbi->s_prefix,
sbi->s_volume, &mount_flags)) {
- printk(KERN_ERR "AFFS: Error parsing options\n");
- kfree(sbi->s_prefix);
- kfree(sbi);
+ pr_err("Error parsing options\n");
return -EINVAL;
}
/* N.B. after this point s_prefix must be released */
@@ -331,7 +356,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
*/
size = sb->s_bdev->bd_inode->i_size >> 9;
- pr_debug("AFFS: initial blocksize=%d, #blocks=%d\n", 512, size);
+ pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
/* Try to find root block. Its location depends on the block size. */
@@ -346,7 +371,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_root_block = root_block;
if (root_block < 0)
sbi->s_root_block = (reserved + size - 1) / 2;
- pr_debug("AFFS: setting blocksize to %d\n", blocksize);
+ pr_debug("setting blocksize to %d\n", blocksize);
affs_set_blocksize(sb, blocksize);
sbi->s_partition_size = size;
@@ -361,7 +386,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* block behind the calculated one. So we check this one, too.
*/
for (num_bm = 0; num_bm < 2; num_bm++) {
- pr_debug("AFFS: Dev %s, trying root=%u, bs=%d, "
+ pr_debug("Dev %s, trying root=%u, bs=%d, "
"size=%d, reserved=%d\n",
sb->s_id,
sbi->s_root_block + num_bm,
@@ -382,19 +407,20 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
}
}
if (!silent)
- printk(KERN_ERR "AFFS: No valid root block on device %s\n",
- sb->s_id);
- goto out_error;
+ pr_err("No valid root block on device %s\n", sb->s_id);
+ return -EINVAL;
/* N.B. after this point bh must be released */
got_root:
+ /* Keep super block in cache */
+ sbi->s_root_bh = root_bh;
root_block = sbi->s_root_block;
/* Find out which kind of FS we have */
boot_bh = sb_bread(sb, 0);
if (!boot_bh) {
- printk(KERN_ERR "AFFS: Cannot read boot block\n");
- goto out_error;
+ pr_err("Cannot read boot block\n");
+ return -EINVAL;
}
memcpy(sig, boot_bh->b_data, 4);
brelse(boot_bh);
@@ -406,8 +432,7 @@ got_root:
*/
if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS
|| chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) {
- printk(KERN_NOTICE "AFFS: Dircache FS - mounting %s read only\n",
- sb->s_id);
+ pr_notice("Dircache FS - mounting %s read only\n", sb->s_id);
sb->s_flags |= MS_RDONLY;
}
switch (chksum) {
@@ -441,14 +466,14 @@ got_root:
sb->s_flags |= MS_NOEXEC;
break;
default:
- printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n",
- sb->s_id, chksum);
- goto out_error;
+ pr_err("Unknown filesystem on device %s: %08X\n",
+ sb->s_id, chksum);
+ return -EINVAL;
}
if (mount_flags & SF_VERBOSE) {
u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
- printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
+ pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
len > 31 ? 31 : len,
AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1,
sig, sig[3] + '0', blocksize);
@@ -460,50 +485,31 @@ got_root:
if (sbi->s_flags & SF_OFS)
sbi->s_data_blksize -= 24;
- /* Keep super block in cache */
- sbi->s_root_bh = root_bh;
- /* N.B. after this point s_root_bh must be released */
-
tmp_flags = sb->s_flags;
- if (affs_init_bitmap(sb, &tmp_flags))
- goto out_error;
+ ret = affs_init_bitmap(sb, &tmp_flags);
+ if (ret)
+ return ret;
sb->s_flags = tmp_flags;
/* set up enough so that it can read an inode */
root_inode = affs_iget(sb, root_block);
- if (IS_ERR(root_inode)) {
- ret = PTR_ERR(root_inode);
- goto out_error_noinode;
- }
+ if (IS_ERR(root_inode))
+ return PTR_ERR(root_inode);
if (AFFS_SB(sb)->s_flags & SF_INTL)
sb->s_d_op = &affs_intl_dentry_operations;
else
sb->s_d_op = &affs_dentry_operations;
- sb->s_root = d_alloc_root(root_inode);
+ sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
- printk(KERN_ERR "AFFS: Get root inode failed\n");
- goto out_error;
+ pr_err("AFFS: Get root inode failed\n");
+ return -ENOMEM;
}
- pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
+ pr_debug("s_flags=%lX\n", sb->s_flags);
return 0;
-
- /*
- * Begin the cascaded cleanup ...
- */
-out_error:
- if (root_inode)
- iput(root_inode);
-out_error_noinode:
- kfree(sbi->s_bitmap);
- affs_brelse(root_bh);
- kfree(sbi->s_prefix);
- kfree(sbi);
- sb->s_fs_info = NULL;
- return ret;
}
static int
@@ -511,8 +517,8 @@ affs_remount(struct super_block *sb, int *flags, char *data)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
int blocksize;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
int mode;
int reserved;
int root_block;
@@ -522,8 +528,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
char volume[32];
char *prefix = NULL;
- pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
+ pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
+ sync_filesystem(sb);
*flags |= MS_NODIRATIME;
memcpy(volume, sbi->s_volume, 32);
@@ -535,6 +542,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
return -EINVAL;
}
+ flush_delayed_work(&sbi->sb_work);
replace_mount_options(sb, new_opts);
sbi->s_flags = mount_flags;
@@ -553,10 +561,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
- if (*flags & MS_RDONLY) {
- affs_write_super(sb);
+ if (*flags & MS_RDONLY)
affs_free_bitmap(sb);
- } else
+ else
res = affs_init_bitmap(sb, flags);
return res;
@@ -569,8 +576,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
int free;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
- pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
- AFFS_SB(sb)->s_reserved);
+ pr_debug("%s() partsize=%d, reserved=%d\n",
+ __func__, AFFS_SB(sb)->s_partition_size,
+ AFFS_SB(sb)->s_reserved);
free = affs_count_free_blocks(sb);
buf->f_type = AFFS_SUPER_MAGIC;
@@ -590,13 +598,26 @@ static struct dentry *affs_mount(struct file_system_type *fs_type,
return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
}
+static void affs_kill_sb(struct super_block *sb)
+{
+ struct affs_sb_info *sbi = AFFS_SB(sb);
+ kill_block_super(sb);
+ if (sbi) {
+ affs_free_bitmap(sb);
+ affs_brelse(sbi->s_root_bh);
+ kfree(sbi->s_prefix);
+ kfree(sbi);
+ }
+}
+
static struct file_system_type affs_fs_type = {
.owner = THIS_MODULE,
.name = "affs",
.mount = affs_mount,
- .kill_sb = kill_block_super,
+ .kill_sb = affs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("affs");
static int __init init_affs_fs(void)
{
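The super.c rework drops ->write_super()/s_dirt in favour of a delayed workqueue item: affs_mark_sb_dirty() schedules a flush on system_long_wq after roughly dirty_writeback_interval, repeated marks are coalesced, and remount/unmount paths flush or cancel the work explicitly. A minimal sketch of the scheduling half (field names follow the patch; the struct is abridged and assumed to be initialized in fill_super):

/* Illustrative sketch of the deferred superblock flush. */
struct sbi_sketch {
	spinlock_t work_lock;		/* protects work_queued */
	int work_queued;
	struct delayed_work sb_work;	/* runs the flush callback */
};

static void mark_dirty_sketch(struct sbi_sketch *sbi)
{
	/* dirty_writeback_interval is in centiseconds, hence the * 10 */
	unsigned long delay = msecs_to_jiffies(dirty_writeback_interval * 10);

	spin_lock(&sbi->work_lock);
	if (!sbi->work_queued) {	/* coalesce repeated marks into one flush */
		queue_delayed_work(system_long_wq, &sbi->sb_work, delay);
		sbi->work_queued = 1;
	}
	spin_unlock(&sbi->work_lock);
}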
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index ee00f08c4f5..f39b71c3981 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -21,7 +21,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
char c;
char lc;
- pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino);
+ pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
err = -EIO;
bh = affs_bread(inode->i_sb, inode->i_ino);
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 8f975f25b48..ebba3b18e5d 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -1,6 +1,6 @@
config AFS_FS
- tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
+ tristate "Andrew File System support (AFS)"
+ depends on INET
select AF_RXRPC
select DNS_RESOLVER
help
@@ -22,8 +22,7 @@ config AFS_DEBUG
If unsure, say N.
config AFS_FSCACHE
- bool "Provide AFS client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Provide AFS client caching support"
depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
help
Say Y here if you want AFS data to be cached locally on disk through
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index c548aa346f0..3c462ff6db6 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -119,8 +119,8 @@ struct afs_file_status {
u64 size; /* file size */
afs_dataversion_t data_version; /* current data version */
u32 author; /* author ID */
- u32 owner; /* owner ID */
- u32 group; /* group ID */
+ kuid_t owner; /* owner ID */
+ kgid_t group; /* group ID */
afs_access_t caller_access; /* access rights for authenticated caller */
afs_access_t anon_access; /* access rights for unauthenticated caller */
umode_t mode; /* UNIX mode */
@@ -133,13 +133,6 @@ struct afs_file_status {
/*
* AFS file status change request
*/
-struct afs_store_status {
- u32 mask; /* which bits of the struct are set */
- u32 mtime_client; /* last time client changed data */
- u32 owner; /* owner ID */
- u32 group; /* group ID */
- umode_t mode; /* UNIX mode */
-};
#define AFS_SET_MTIME 0x01 /* set the mtime */
#define AFS_SET_OWNER 0x02 /* set the owner ID */
diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h
index 8bbefe009ed..800f607ffaf 100644
--- a/fs/afs/afs_vl.h
+++ b/fs/afs/afs_vl.h
@@ -49,7 +49,7 @@ enum AFSVL_Errors {
AFSVL_BADVOLOPER = 363542, /* Bad volume operation code */
AFSVL_BADRELLOCKTYPE = 363543, /* Bad release lock type */
AFSVL_RERELEASE = 363544, /* Status report: last release was aborted */
- AFSVL_BADSERVERFLAG = 363545, /* Invalid replication site server °ag */
+ AFSVL_BADSERVERFLAG = 363545, /* Invalid replication site server flag */
AFSVL_PERM = 363546, /* No permission access */
AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */
};
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index 0fb315dd4d2..577763c3d88 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -98,7 +98,7 @@ static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
}
/*
- * provide new auxilliary cache data
+ * provide new auxiliary cache data
*/
static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
void *buffer, uint16_t bufmax)
@@ -117,7 +117,7 @@ static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
}
/*
- * check that the auxilliary data indicates that the entry is still valid
+ * check that the auxiliary data indicates that the entry is still valid
*/
static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
const void *buffer,
@@ -150,7 +150,7 @@ static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
}
/*
- * provide new auxilliary cache data
+ * provide new auxiliary cache data
*/
static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
void *buffer, uint16_t bufmax)
@@ -172,7 +172,7 @@ static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
}
/*
- * check that the auxilliary data indicates that the entry is still valid
+ * check that the auxiliary data indicates that the entry is still valid
*/
static
enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
@@ -283,7 +283,7 @@ static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
}
/*
- * provide new auxilliary cache data
+ * provide new auxiliary cache data
*/
static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
void *buffer, uint16_t bufmax)
@@ -309,7 +309,7 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
}
/*
- * check that the auxilliary data indicates that the entry is still valid
+ * check that the auxiliary data indicates that the entry is still valid
*/
static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
const void *buffer,
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 587ef5123cd..7ef637d7f3a 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -351,9 +351,7 @@ void afs_dispatch_give_up_callbacks(struct work_struct *work)
*/
void afs_flush_callback_breaks(struct afs_server *server)
{
- cancel_delayed_work(&server->cb_break_work);
- queue_delayed_work(afs_callback_update_worker,
- &server->cb_break_work, 0);
+ mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0);
}
#if 0
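The afs_flush_callback_breaks() hunk collapses a cancel-then-requeue pair into mod_delayed_work(), which reschedules the item to the new delay (here 0, i.e. run as soon as possible) whether or not it was already pending, with no window in which the work is neither queued nor running. A one-function usage sketch:

/* Illustrative only. */
static void flush_now_sketch(struct workqueue_struct *wq, struct delayed_work *dw)
{
	/* replaces: cancel_delayed_work(dw); queue_delayed_work(wq, dw, 0); */
	mod_delayed_work(wq, dw, 0);
}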
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 0d5eeadf612..ca0a3cf9379 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -179,7 +179,7 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
/* put it up for caching (this never returns an error) */
cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
&afs_cell_cache_index_def,
- cell);
+ cell, true);
#endif
/* add to the cell lists */
@@ -293,7 +293,7 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
if (!cell) {
/* this should not happen unless user tries to mount
* when root cell is not set. Return an impossibly
- * bizzare errno to alert the user. Things like
+ * bizarre errno to alert the user. Things like
* ENOENT might be "more appropriate" but they happen
* for other reasons.
*/
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 1c8c6cc6de3..4b0eff6da67 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call)
{
_enter("");
+ /* Break the callbacks here so that we do it after the final ACK is
+ * received. The step number here must match the final number in
+ * afs_deliver_cb_callback().
+ */
+ if (call->unmarshall == 6) {
+ ASSERT(call->server && call->count && call->request);
+ afs_break_callbacks(call->server, call->count, call->request);
+ }
+
afs_put_server(call->server);
call->server = NULL;
kfree(call->buffer);
@@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
_debug("trailer");
if (skb->len != 0)
return -EBADMSG;
+
+ /* Record that the message was unmarshalled successfully so
+ * that the call destructor can know do the callback breaking
+ * work, even if the final ACK isn't received.
+ *
+ * If the step number changes, then afs_cm_destructor() must be
+ * updated also.
+ */
+ call->unmarshall++;
+ case 6:
break;
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 20c106f2492..529300327f4 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -20,17 +20,17 @@
#include "internal.h"
static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd);
+ unsigned int flags);
static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
-static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
+static int afs_readdir(struct file *file, struct dir_context *ctx);
+static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_release(struct dentry *dentry);
static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
-static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd);
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl);
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
static int afs_rmdir(struct inode *dir, struct dentry *dentry);
static int afs_unlink(struct inode *dir, struct dentry *dentry);
static int afs_link(struct dentry *from, struct inode *dir,
@@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
const struct file_operations afs_dir_file_operations = {
.open = afs_dir_open,
.release = afs_release,
- .readdir = afs_readdir,
+ .iterate = afs_readdir,
.lock = afs_lock,
.llseek = generic_file_llseek,
};
@@ -119,9 +119,9 @@ struct afs_dir_page {
};
struct afs_lookup_cookie {
+ struct dir_context ctx;
struct afs_fid fid;
- const char *name;
- size_t nlen;
+ struct qstr name;
int found;
};
@@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file)
/*
* deal with one block in an AFS directory
*/
-static int afs_dir_iterate_block(unsigned *fpos,
+static int afs_dir_iterate_block(struct dir_context *ctx,
union afs_dir_block *block,
- unsigned blkoff,
- void *cookie,
- filldir_t filldir)
+ unsigned blkoff)
{
union afs_dirent *dire;
unsigned offset, next, curr;
size_t nlen;
- int tmp, ret;
+ int tmp;
- _enter("%u,%x,%p,,",*fpos,blkoff,block);
+ _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
- curr = (*fpos - blkoff) / sizeof(union afs_dirent);
+ curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);
/* walk through the block, an entry at a time */
for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
@@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
_debug("ENT[%Zu.%u]: unused",
blkoff / sizeof(union afs_dir_block), offset);
if (offset >= curr)
- *fpos = blkoff +
+ ctx->pos = blkoff +
next * sizeof(union afs_dirent);
continue;
}
@@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
continue;
/* found the next entry */
- ret = filldir(cookie,
- dire->u.name,
- nlen,
- blkoff + offset * sizeof(union afs_dirent),
+ if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
- filldir == afs_lookup_filldir ?
- ntohl(dire->u.unique) : DT_UNKNOWN);
- if (ret < 0) {
+ ctx->actor == afs_lookup_filldir ?
+ ntohl(dire->u.unique) : DT_UNKNOWN)) {
_leave(" = 0 [full]");
return 0;
}
- *fpos = blkoff + next * sizeof(union afs_dirent);
+ ctx->pos = blkoff + next * sizeof(union afs_dirent);
}
_leave(" = 1 [more]");
@@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos,
/*
* iterate through the data blob that lists the contents of an AFS directory
*/
-static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
- filldir_t filldir, struct key *key)
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
+ struct key *key)
{
union afs_dir_block *dblock;
struct afs_dir_page *dbuf;
@@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
unsigned blkoff, limit;
int ret;
- _enter("{%lu},%u,,", dir->i_ino, *fpos);
+ _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
_leave(" = -ESTALE");
@@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
}
/* round the file position up to the next entry boundary */
- *fpos += sizeof(union afs_dirent) - 1;
- *fpos &= ~(sizeof(union afs_dirent) - 1);
+ ctx->pos += sizeof(union afs_dirent) - 1;
+ ctx->pos &= ~(sizeof(union afs_dirent) - 1);
/* walk through the blocks in sequence */
ret = 0;
- while (*fpos < dir->i_size) {
- blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
+ while (ctx->pos < dir->i_size) {
+ blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);
/* fetch the appropriate page from the directory */
page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
@@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
do {
dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
sizeof(union afs_dir_block)];
- ret = afs_dir_iterate_block(fpos, dblock, blkoff,
- cookie, filldir);
+ ret = afs_dir_iterate_block(ctx, dblock, blkoff);
if (ret != 1) {
afs_dir_put_page(page);
goto out;
@@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
blkoff += sizeof(union afs_dir_block);
- } while (*fpos < dir->i_size && blkoff < limit);
+ } while (ctx->pos < dir->i_size && blkoff < limit);
afs_dir_put_page(page);
ret = 0;
@@ -387,23 +380,10 @@ out:
/*
* read an AFS directory
*/
-static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned fpos;
- int ret;
-
- _enter("{%Ld,{%lu}}",
- file->f_pos, file->f_path.dentry->d_inode->i_ino);
-
- ASSERT(file->private_data != NULL);
-
- fpos = file->f_pos;
- ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos,
- cookie, filldir, file->private_data);
- file->f_pos = fpos;
-
- _leave(" = %d", ret);
- return ret;
+ return afs_dir_iterate(file_inode(file),
+ ctx, file->private_data);
}
/*
@@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
{
struct afs_lookup_cookie *cookie = _cookie;
- _enter("{%s,%Zu},%s,%u,,%llu,%u",
- cookie->name, cookie->nlen, name, nlen,
+ _enter("{%s,%u},%s,%u,,%llu,%u",
+ cookie->name.name, cookie->name.len, name, nlen,
(unsigned long long) ino, dtype);
/* insanity checks first */
BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
- if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
+ if (cookie->name.len != nlen ||
+ memcmp(cookie->name.name, name, nlen) != 0) {
_leave(" = 0 [no]");
return 0;
}
@@ -444,24 +425,18 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
struct afs_fid *fid, struct key *key)
{
- struct afs_lookup_cookie cookie;
- struct afs_super_info *as;
- unsigned fpos;
+ struct afs_super_info *as = dir->i_sb->s_fs_info;
+ struct afs_lookup_cookie cookie = {
+ .ctx.actor = afs_lookup_filldir,
+ .name = dentry->d_name,
+ .fid.vid = as->volume->vid
+ };
int ret;
_enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
- as = dir->i_sb->s_fs_info;
-
/* search the directory */
- cookie.name = dentry->d_name.name;
- cookie.nlen = dentry->d_name.len;
- cookie.fid.vid = as->volume->vid;
- cookie.found = 0;
-
- fpos = 0;
- ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
- key);
+ ret = afs_dir_iterate(dir, &cookie.ctx, key);
if (ret < 0) {
_leave(" = %d [iter]", ret);
return ret;
@@ -516,7 +491,7 @@ out:
* look up an entry in a directory
*/
static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct afs_vnode *vnode;
struct afs_fid fid;
@@ -584,11 +559,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
success:
d_add(dentry, inode);
- _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
+ _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }",
fid.vnode,
fid.unique,
dentry->d_inode->i_ino,
- (unsigned long long)dentry->d_inode->i_version);
+ dentry->d_inode->i_generation);
return NULL;
}
@@ -598,7 +573,7 @@ success:
* - NOTE! the hit can be a negative hit too, so we can't assume we have an
* inode
*/
-static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct afs_vnode *vnode, *dir;
struct afs_fid uninitialized_var(fid);
@@ -607,7 +582,7 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
void *dir_version;
int ret;
- if (nd->flags & LOOKUP_RCU)
+ if (flags & LOOKUP_RCU)
return -ECHILD;
vnode = AFS_FS_I(dentry->d_inode);
@@ -625,9 +600,6 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
/* lock down the parent dentry so we can peer at it */
parent = dget_parent(dentry);
- if (!parent->d_inode)
- goto out_bad;
-
dir = AFS_FS_I(parent->d_inode);
/* validate the parent directory */
@@ -671,10 +643,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
* been deleted and replaced, and the original vnode ID has
* been reused */
if (fid.unique != vnode->fid.unique) {
- _debug("%s: file deleted (uq %u -> %u I:%llu)",
+ _debug("%s: file deleted (uq %u -> %u I:%u)",
dentry->d_name.name, fid.unique,
vnode->fid.unique,
- (unsigned long long)dentry->d_inode->i_version);
+ dentry->d_inode->i_generation);
spin_lock(&vnode->lock);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
spin_unlock(&vnode->lock);
@@ -710,16 +682,12 @@ not_found:
spin_unlock(&dentry->d_lock);
out_bad:
- if (dentry->d_inode) {
- /* don't unhash if we have submounts */
- if (have_submounts(dentry))
- goto out_skip;
- }
+ /* don't unhash if we have submounts */
+ if (check_submounts_and_drop(dentry) != 0)
+ goto out_skip;
_debug("dropping dentry %s/%s",
parent->d_name.name, dentry->d_name.name);
- shrink_dcache_parent(dentry);
- d_drop(dentry);
dput(parent);
key_put(key);
@@ -764,7 +732,7 @@ static void afs_d_release(struct dentry *dentry)
/*
* create a directory on an AFS filesystem
*/
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct afs_file_status status;
struct afs_callback cb;
@@ -777,13 +745,9 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%s},%o",
+ _enter("{%x:%u},{%s},%ho",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
- ret = -ENAMETOOLONG;
- if (dentry->d_name.len >= AFSNAMEMAX)
- goto error;
-
key = afs_request_key(dvnode->volume->cell);
if (IS_ERR(key)) {
ret = PTR_ERR(key);
@@ -845,10 +809,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
_enter("{%x:%u},{%s}",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
- ret = -ENAMETOOLONG;
- if (dentry->d_name.len >= AFSNAMEMAX)
- goto error;
-
key = afs_request_key(dvnode->volume->cell);
if (IS_ERR(key)) {
ret = PTR_ERR(key);
@@ -948,8 +908,8 @@ error:
/*
* create a regular file on an AFS filesystem
*/
-static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
+static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
struct afs_file_status status;
struct afs_callback cb;
@@ -962,13 +922,9 @@ static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
dvnode = AFS_FS_I(dir);
- _enter("{%x:%u},{%s},%o,",
+ _enter("{%x:%u},{%s},%ho,",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
- ret = -ENAMETOOLONG;
- if (dentry->d_name.len >= AFSNAMEMAX)
- goto error;
-
key = afs_request_key(dvnode->volume->cell);
if (IS_ERR(key)) {
ret = PTR_ERR(key);
@@ -1034,10 +990,6 @@ static int afs_link(struct dentry *from, struct inode *dir,
dvnode->fid.vid, dvnode->fid.vnode,
dentry->d_name.name);
- ret = -ENAMETOOLONG;
- if (dentry->d_name.len >= AFSNAMEMAX)
- goto error;
-
key = afs_request_key(dvnode->volume->cell);
if (IS_ERR(key)) {
ret = PTR_ERR(key);
@@ -1082,10 +1034,6 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
content);
- ret = -ENAMETOOLONG;
- if (dentry->d_name.len >= AFSNAMEMAX)
- goto error;
-
ret = -EINVAL;
if (strlen(content) >= AFSPATHMAX)
goto error;
@@ -1156,10 +1104,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dvnode->fid.vid, new_dvnode->fid.vnode,
new_dentry->d_name.name);
- ret = -ENAMETOOLONG;
- if (new_dentry->d_name.len >= AFSNAMEMAX)
- goto error;
-
key = afs_request_key(orig_dvnode->volume->cell);
if (IS_ERR(key)) {
ret = PTR_ERR(key);
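Note (illustrative, not part of the patch): the dir.c conversion above swaps the old readdir/filldir callback pair for the ->iterate model. The VFS hands the filesystem a struct dir_context, the filesystem advances ctx->pos itself and calls dir_emit() for each entry, stopping as soon as dir_emit() returns false. A minimal sketch of that shape for a hypothetical filesystem:

	static int examplefs_iterate(struct file *file, struct dir_context *ctx)
	{
		struct inode *dir = file_inode(file);

		/* emit "." and ".." at positions 0 and 1 */
		if (!dir_emit_dots(file, ctx))
			return 0;

		/* one illustrative entry; dir_emit() returns false when the
		 * caller's buffer is full, so stop and leave ctx->pos as-is */
		if (ctx->pos == 2) {
			if (!dir_emit(ctx, "example", 7, dir->i_ino + 1, DT_REG))
				return 0;
			ctx->pos++;
		}
		return 0;
	}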
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 14d89fa58fe..932ce07948b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
#include "internal.h"
static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
static int afs_launder_page(struct page *page);
@@ -30,10 +31,10 @@ const struct file_operations afs_file_operations = {
.open = afs_open,
.release = afs_release,
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = afs_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = afs_file_write,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
.fsync = afs_fsync,
@@ -251,7 +252,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
ASSERT(key != NULL);
vnode = AFS_FS_I(mapping->host);
- if (vnode->flags & AFS_VNODE_DELETED) {
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
_leave(" = -ESTALE");
return -ESTALE;
}
@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
- _enter("{%lu},%lu", page->index, offset);
+ _enter("{%lu},%u,%u", page->index, offset, length);
BUG_ON(!PageLocked(page));
/* we clean up only if the entire page is being invalidated */
- if (offset == 0) {
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 757d664575d..4baf1d2b39e 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
*/
static int afs_do_setlk(struct file *file, struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+ struct inode *inode = file_inode(file);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
afs_lock_type_t type;
struct key *key = file->private_data;
int ret;
@@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
- lock_flocks();
+ spin_lock(&inode->i_lock);
/* make sure we've got a callback on this file and that our view of the
* data version is up to date */
@@ -420,7 +421,7 @@ given_lock:
afs_vnode_fetch_status(vnode, NULL, key);
error:
- unlock_flocks();
+ spin_unlock(&inode->i_lock);
_leave(" = %d", ret);
return ret;
@@ -514,7 +515,7 @@ error:
*/
int afs_lock(struct file *file, int cmd, struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
vnode->fid.vid, vnode->fid.vnode, cmd,
@@ -537,7 +538,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
*/
int afs_flock(struct file *file, int cmd, struct file_lock *fl)
{
- struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
_enter("{%x:%u},%d,{t=%x,fl=%x}",
vnode->fid.vid, vnode->fid.vnode, cmd,
@@ -554,10 +555,6 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
return -ENOLCK;
/* we're simulating flock() locks using posix locks on the server */
- fl->fl_owner = (fl_owner_t) file;
- fl->fl_start = 0;
- fl->fl_end = OFFSET_MAX;
-
if (fl->fl_type == F_UNLCK)
return afs_do_unlk(file, fl);
return afs_do_setlk(file, fl);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 4bd0218473a..c2e930ec288 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -42,6 +42,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
umode_t mode;
u64 data_version, size;
u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+ kuid_t owner;
+ kgid_t group;
#define EXTRACT(DST) \
do { \
@@ -56,7 +58,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
size = ntohl(*bp++);
data_version = ntohl(*bp++);
EXTRACT(status->author);
- EXTRACT(status->owner);
+ owner = make_kuid(&init_user_ns, ntohl(*bp++));
+ changed |= !uid_eq(owner, status->owner);
+ status->owner = owner;
EXTRACT(status->caller_access); /* call ticket dependent */
EXTRACT(status->anon_access);
EXTRACT(status->mode);
@@ -65,7 +69,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
bp++; /* seg size */
status->mtime_client = ntohl(*bp++);
status->mtime_server = ntohl(*bp++);
- EXTRACT(status->group);
+ group = make_kgid(&init_user_ns, ntohl(*bp++));
+ changed |= !gid_eq(group, status->group);
+ status->group = group;
bp++; /* sync counter */
data_version |= (u64) ntohl(*bp++) << 32;
EXTRACT(status->lock_count);
@@ -89,8 +95,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
i_size_write(&vnode->vfs_inode, size);
vnode->vfs_inode.i_uid = status->owner;
vnode->vfs_inode.i_gid = status->group;
- vnode->vfs_inode.i_version = vnode->fid.unique;
- vnode->vfs_inode.i_nlink = status->nlink;
+ vnode->vfs_inode.i_generation = vnode->fid.unique;
+ set_nlink(&vnode->vfs_inode, status->nlink);
mode = vnode->vfs_inode.i_mode;
mode &= ~S_IALLUGO;
@@ -102,6 +108,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server;
vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime;
vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
+ vnode->vfs_inode.i_version = data_version;
}
expected_version = status->data_version;
@@ -180,12 +187,12 @@ static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)
if (attr->ia_valid & ATTR_UID) {
mask |= AFS_SET_OWNER;
- owner = attr->ia_uid;
+ owner = from_kuid(&init_user_ns, attr->ia_uid);
}
if (attr->ia_valid & ATTR_GID) {
mask |= AFS_SET_GROUP;
- group = attr->ia_gid;
+ group = from_kgid(&init_user_ns, attr->ia_gid);
}
if (attr->ia_valid & ATTR_MODE) {
@@ -364,10 +371,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
_debug("extract data");
if (call->count > 0) {
page = call->reply3;
- buffer = kmap_atomic(page, KM_USER0);
+ buffer = kmap_atomic(page);
ret = afs_extract_data(call, skb, last, buffer,
call->count);
- kunmap_atomic(buffer, KM_USER0);
+ kunmap_atomic(buffer);
switch (ret) {
case 0: break;
case -EAGAIN: return 0;
@@ -410,9 +417,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
if (call->count < PAGE_SIZE) {
_debug("clear");
page = call->reply3;
- buffer = kmap_atomic(page, KM_USER0);
+ buffer = kmap_atomic(page);
memset(buffer + call->count, 0, PAGE_SIZE - call->count);
- kunmap_atomic(buffer, KM_USER0);
+ kunmap_atomic(buffer);
}
_leave(" = 0 [done]");
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index db66c520147..29467128844 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -67,15 +67,16 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
fscache_attr_changed(vnode->cache);
#endif
- inode->i_nlink = vnode->status.nlink;
+ set_nlink(inode, vnode->status.nlink);
inode->i_uid = vnode->status.owner;
- inode->i_gid = 0;
+ inode->i_gid = GLOBAL_ROOT_GID;
inode->i_size = vnode->status.size;
inode->i_ctime.tv_sec = vnode->status.mtime_server;
inode->i_ctime.tv_nsec = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime;
inode->i_blocks = 0;
- inode->i_version = vnode->fid.unique;
+ inode->i_generation = vnode->fid.unique;
+ inode->i_version = vnode->status.data_version;
inode->i_mapping->a_ops = &afs_fs_aops;
/* check to see whether a symbolic link is really a mountpoint */
@@ -100,7 +101,7 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
struct afs_iget_data *data = opaque;
return inode->i_ino == data->fid.vnode &&
- inode->i_version == data->fid.unique;
+ inode->i_generation == data->fid.unique;
}
/*
@@ -122,7 +123,7 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
struct afs_vnode *vnode = AFS_FS_I(inode);
inode->i_ino = data->fid.vnode;
- inode->i_version = data->fid.unique;
+ inode->i_generation = data->fid.unique;
vnode->fid = data->fid;
vnode->volume = data->volume;
@@ -173,9 +174,9 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
inode->i_op = &afs_autocell_inode_operations;
- inode->i_nlink = 2;
- inode->i_uid = 0;
- inode->i_gid = 0;
+ set_nlink(inode, 2);
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
inode->i_ctime.tv_sec = get_seconds();
inode->i_ctime.tv_nsec = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime;
@@ -258,7 +259,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
#ifdef CONFIG_AFS_FSCACHE
vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
&afs_vnode_cache_index_def,
- vnode);
+ vnode, true);
#endif
ret = afs_inode_map_status(vnode, key);
@@ -380,8 +381,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
inode = dentry->d_inode;
- _enter("{ ino=%lu v=%llu }", inode->i_ino,
- (unsigned long long)inode->i_version);
+ _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
generic_fillattr(inode, stat);
return 0;
@@ -422,8 +422,8 @@ void afs_evict_inode(struct inode *inode)
ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
- truncate_inode_pages(&inode->i_data, 0);
- end_writeback(inode);
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
afs_give_up_callback(vnode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5a9b6843bac..71d5982312f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -75,6 +75,7 @@ struct afs_call {
const struct afs_call_type *type; /* type of call */
const struct afs_wait_mode *wait_mode; /* completion wait mode */
wait_queue_head_t waitq; /* processes awaiting completion */
+ void (*async_workfn)(struct afs_call *call); /* asynchronous work function */
struct work_struct async_work; /* asynchronous work processor */
struct work_struct work; /* actual work processor */
struct sk_buff_head rx_queue; /* received packets */
@@ -109,7 +110,7 @@ struct afs_call {
unsigned reply_size; /* current size of reply */
unsigned first_offset; /* offset into mapping[first] */
unsigned last_to; /* amount of mapping[last] */
- unsigned short offset; /* offset into received data store */
+ unsigned offset; /* offset into received data store */
unsigned char unmarshall; /* unmarshalling phase */
bool incoming; /* T if incoming call */
bool send_pages; /* T if data from mapping should be sent */
@@ -195,7 +196,6 @@ struct afs_cell {
struct list_head link; /* main cell list link */
struct key *anonymous_key; /* anonymous user key for this cell */
struct list_head proc_link; /* /proc cell list link */
- struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_cookie *cache; /* caching cookie */
#endif
@@ -627,7 +627,7 @@ extern void afs_clear_permits(struct afs_vnode *);
extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
extern void afs_zap_permits(struct rcu_head *);
extern struct key *afs_request_key(struct afs_cell *);
-extern int afs_permission(struct inode *, int, unsigned int);
+extern int afs_permission(struct inode *, int);
/*
* server.c
@@ -747,10 +747,9 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
-extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
+extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_writeback_all(struct afs_vnode *);
-extern int afs_fsync(struct file *, int);
+extern int afs_fsync(struct file *, loff_t, loff_t, int);
/*****************************************************************************/
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 42dd2e499ed..35de0c04729 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -55,13 +55,13 @@ static int __init afs_get_client_UUID(void)
afs_uuid.time_low = uuidtime;
afs_uuid.time_mid = uuidtime >> 32;
afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
- afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME;
+ afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME;
get_random_bytes(&clockseq, 2);
afs_uuid.clock_seq_low = clockseq;
afs_uuid.clock_seq_hi_and_reserved =
(clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
- afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD;
+ afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD;
_debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
afs_uuid.time_low,
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index aa59184151d..9682c33d5da 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -22,7 +22,7 @@
static struct dentry *afs_mntpt_lookup(struct inode *dir,
struct dentry *dentry,
- struct nameidata *nd);
+ unsigned int flags);
static int afs_mntpt_open(struct inode *inode, struct file *file);
static void afs_mntpt_expiry_timed_out(struct work_struct *work);
@@ -104,7 +104,7 @@ out:
*/
static struct dentry *afs_mntpt_lookup(struct inode *dir,
struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
_enter("%p,%p{%p{%s},%s}",
dir,
@@ -200,9 +200,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
if (PageError(page))
goto error;
- buf = kmap_atomic(page, KM_USER0);
+ buf = kmap_atomic(page);
memcpy(devname, buf, size);
- kunmap_atomic(buf, KM_USER0);
+ kunmap_atomic(buf);
page_cache_release(page);
page = NULL;
}
@@ -242,7 +242,7 @@ struct vfsmount *afs_d_automount(struct path *path)
{
struct vfsmount *newmnt;
- _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
+ _enter("{%s}", path->dentry->d_name.name);
newmnt = afs_mntpt_do_automount(path->dentry);
if (IS_ERR(newmnt))
@@ -252,7 +252,7 @@ struct vfsmount *afs_d_automount(struct path *path)
mnt_set_expiry(newmnt, &afs_vfsmounts);
queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
afs_mntpt_expiry_timeout * HZ);
- _leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
+ _leave(" = %p", newmnt);
return newmnt;
}
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 096b23f821a..24a905b076f 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -41,11 +41,8 @@ static const struct file_operations afs_proc_cells_fops = {
.write = afs_proc_cells_write,
.llseek = seq_lseek,
.release = seq_release,
- .owner = THIS_MODULE,
};
-static int afs_proc_rootcell_open(struct inode *inode, struct file *file);
-static int afs_proc_rootcell_release(struct inode *inode, struct file *file);
static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
size_t size, loff_t *_pos);
static ssize_t afs_proc_rootcell_write(struct file *file,
@@ -53,17 +50,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
size_t size, loff_t *_pos);
static const struct file_operations afs_proc_rootcell_fops = {
- .open = afs_proc_rootcell_open,
.read = afs_proc_rootcell_read,
.write = afs_proc_rootcell_write,
.llseek = no_llseek,
- .release = afs_proc_rootcell_release,
- .owner = THIS_MODULE,
};
static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file);
-static int afs_proc_cell_volumes_release(struct inode *inode,
- struct file *file);
static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos);
static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
loff_t *pos);
@@ -81,14 +73,11 @@ static const struct file_operations afs_proc_cell_volumes_fops = {
.open = afs_proc_cell_volumes_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = afs_proc_cell_volumes_release,
- .owner = THIS_MODULE,
+ .release = seq_release,
};
static int afs_proc_cell_vlservers_open(struct inode *inode,
struct file *file);
-static int afs_proc_cell_vlservers_release(struct inode *inode,
- struct file *file);
static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos);
static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
loff_t *pos);
@@ -106,13 +95,10 @@ static const struct file_operations afs_proc_cell_vlservers_fops = {
.open = afs_proc_cell_vlservers_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = afs_proc_cell_vlservers_release,
- .owner = THIS_MODULE,
+ .release = seq_release,
};
static int afs_proc_cell_servers_open(struct inode *inode, struct file *file);
-static int afs_proc_cell_servers_release(struct inode *inode,
- struct file *file);
static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos);
static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
loff_t *pos);
@@ -130,8 +116,7 @@ static const struct file_operations afs_proc_cell_servers_fops = {
.open = afs_proc_cell_servers_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = afs_proc_cell_servers_release,
- .owner = THIS_MODULE,
+ .release = seq_release,
};
/*
@@ -139,29 +124,21 @@ static const struct file_operations afs_proc_cell_servers_fops = {
*/
int afs_proc_init(void)
{
- struct proc_dir_entry *p;
-
_enter("");
proc_afs = proc_mkdir("fs/afs", NULL);
if (!proc_afs)
goto error_dir;
- p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops);
- if (!p)
- goto error_cells;
-
- p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops);
- if (!p)
- goto error_rootcell;
+ if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) ||
+ !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops))
+ goto error_tree;
_leave(" = 0");
return 0;
-error_rootcell:
- remove_proc_entry("cells", proc_afs);
-error_cells:
- remove_proc_entry("fs/afs", NULL);
+error_tree:
+ remove_proc_subtree("fs/afs", NULL);
error_dir:
_leave(" = -ENOMEM");
return -ENOMEM;
@@ -172,9 +149,7 @@ error_dir:
*/
void afs_proc_cleanup(void)
{
- remove_proc_entry("rootcell", proc_afs);
- remove_proc_entry("cells", proc_afs);
- remove_proc_entry("fs/afs", NULL);
+ remove_proc_subtree("fs/afs", NULL);
}
/*
@@ -190,7 +165,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
return ret;
m = file->private_data;
- m->private = PDE(inode)->data;
+ m->private = PDE_DATA(inode);
return 0;
}
@@ -319,19 +294,6 @@ inval:
goto done;
}
-/*
- * Stubs for /proc/fs/afs/rootcell
- */
-static int afs_proc_rootcell_open(struct inode *inode, struct file *file)
-{
- return 0;
-}
-
-static int afs_proc_rootcell_release(struct inode *inode, struct file *file)
-{
- return 0;
-}
-
static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
size_t size, loff_t *_pos)
{
@@ -387,38 +349,27 @@ nomem:
*/
int afs_proc_cell_setup(struct afs_cell *cell)
{
- struct proc_dir_entry *p;
+ struct proc_dir_entry *dir;
_enter("%p{%s}", cell, cell->name);
- cell->proc_dir = proc_mkdir(cell->name, proc_afs);
- if (!cell->proc_dir)
+ dir = proc_mkdir(cell->name, proc_afs);
+ if (!dir)
goto error_dir;
- p = proc_create_data("servers", 0, cell->proc_dir,
- &afs_proc_cell_servers_fops, cell);
- if (!p)
- goto error_servers;
-
- p = proc_create_data("vlservers", 0, cell->proc_dir,
- &afs_proc_cell_vlservers_fops, cell);
- if (!p)
- goto error_vlservers;
-
- p = proc_create_data("volumes", 0, cell->proc_dir,
- &afs_proc_cell_volumes_fops, cell);
- if (!p)
- goto error_volumes;
+ if (!proc_create_data("servers", 0, dir,
+ &afs_proc_cell_servers_fops, cell) ||
+ !proc_create_data("vlservers", 0, dir,
+ &afs_proc_cell_vlservers_fops, cell) ||
+ !proc_create_data("volumes", 0, dir,
+ &afs_proc_cell_volumes_fops, cell))
+ goto error_tree;
_leave(" = 0");
return 0;
-error_volumes:
- remove_proc_entry("vlservers", cell->proc_dir);
-error_vlservers:
- remove_proc_entry("servers", cell->proc_dir);
-error_servers:
- remove_proc_entry(cell->name, proc_afs);
+error_tree:
+ remove_proc_subtree(cell->name, proc_afs);
error_dir:
_leave(" = -ENOMEM");
return -ENOMEM;
@@ -431,10 +382,7 @@ void afs_proc_cell_remove(struct afs_cell *cell)
{
_enter("");
- remove_proc_entry("volumes", cell->proc_dir);
- remove_proc_entry("vlservers", cell->proc_dir);
- remove_proc_entry("servers", cell->proc_dir);
- remove_proc_entry(cell->name, proc_afs);
+ remove_proc_subtree(cell->name, proc_afs);
_leave("");
}
@@ -448,7 +396,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- cell = PDE(inode)->data;
+ cell = PDE_DATA(inode);
if (!cell)
return -ENOENT;
@@ -463,14 +411,6 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
}
/*
- * close the file and release the ref to the cell
- */
-static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
-{
- return seq_release(inode, file);
-}
-
-/*
* set up the iterator to start reading from the cells list and return the
* first item
*/
@@ -554,7 +494,7 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- cell = PDE(inode)->data;
+ cell = PDE_DATA(inode);
if (!cell)
return -ENOENT;
@@ -569,15 +509,6 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
}
/*
- * close the file and release the ref to the cell
- */
-static int afs_proc_cell_vlservers_release(struct inode *inode,
- struct file *file)
-{
- return seq_release(inode, file);
-}
-
-/*
* set up the iterator to start reading from the cells list and return the
* first item
*/
@@ -659,7 +590,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
struct seq_file *m;
int ret;
- cell = PDE(inode)->data;
+ cell = PDE_DATA(inode);
if (!cell)
return -ENOENT;
@@ -673,15 +604,6 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
}
/*
- * close the file and release the ref to the cell
- */
-static int afs_proc_cell_servers_release(struct inode *inode,
- struct file *file)
-{
- return seq_release(inode, file);
-}
-
-/*
* set up the iterator to start reading from the cells list and return the
* first item
*/
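Note (sketch only, not part of the patch): the proc.c simplification leans on remove_proc_subtree(), which tears down a /proc directory and everything beneath it in one call, so the per-file error unwinding and the cached proc_dir pointer are no longer needed. The pattern, with invented names (example_fops is assumed to exist elsewhere):

	static int example_proc_setup(struct proc_dir_entry *parent, void *data)
	{
		struct proc_dir_entry *dir;

		dir = proc_mkdir("example", parent);
		if (!dir)
			return -ENOMEM;

		if (!proc_create_data("stats", 0, dir, &example_fops, data) ||
		    !proc_create_data("config", 0, dir, &example_fops, data)) {
			/* removes the directory and any children created so far */
			remove_proc_subtree("example", parent);
			return -ENOMEM;
		}
		return 0;
	}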
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index e45a323aebb..03a3beb1700 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *);
static int afs_wait_for_call_to_complete(struct afs_call *);
static void afs_wake_up_async_call(struct afs_call *);
static int afs_dont_wait_for_call_to_complete(struct afs_call *);
-static void afs_process_async_call(struct work_struct *);
+static void afs_process_async_call(struct afs_call *);
static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
@@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *);
static struct sk_buff_head afs_incoming_calls;
static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
+static void afs_async_workfn(struct work_struct *work)
+{
+ struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+ call->async_workfn(call);
+}
+
/*
* open an RxRPC socket and bind it to be a server for callback notifications
* - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
@@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call)
}
/*
+ * End a call but do not free it
+ */
+static void afs_end_call_nofree(struct afs_call *call)
+{
+ if (call->rxcall) {
+ rxrpc_kernel_end_call(call->rxcall);
+ call->rxcall = NULL;
+ }
+ if (call->type->destructor)
+ call->type->destructor(call);
+}
+
+/*
+ * End a call and free it
+ */
+static void afs_end_call(struct afs_call *call)
+{
+ afs_end_call_nofree(call);
+ afs_free_call(call);
+}
+
+/*
* allocate a call with flat request and reply buffers
*/
struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
@@ -314,6 +343,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
struct msghdr msg;
struct kvec iov[1];
int ret;
+ struct sk_buff *skb;
_enter("%x,{%d},", addr->s_addr, ntohs(call->port));
@@ -325,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
atomic_read(&afs_outstanding_calls));
call->wait_mode = wait_mode;
- INIT_WORK(&call->async_work, afs_process_async_call);
+ call->async_workfn = afs_process_async_call;
+ INIT_WORK(&call->async_work, afs_async_workfn);
memset(&srx, 0, sizeof(srx));
srx.srx_family = AF_RXRPC;
@@ -380,11 +411,10 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
error_do_abort:
rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
- rxrpc_kernel_end_call(rxcall);
- call->rxcall = NULL;
+ while ((skb = skb_dequeue(&call->rx_queue)))
+ afs_free_skb(skb);
error_kill_call:
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" = %d", ret);
return ret;
}
@@ -506,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call)
if (call->state >= AFS_CALL_COMPLETE) {
while ((skb = skb_dequeue(&call->rx_queue)))
afs_free_skb(skb);
- if (call->incoming) {
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
- }
+ if (call->incoming)
+ afs_end_call(call);
}
_leave("");
@@ -561,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
}
_debug("call complete");
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" = %d", ret);
return ret;
}
@@ -600,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
/*
* delete an asynchronous call
*/
-static void afs_delete_async_call(struct work_struct *work)
+static void afs_delete_async_call(struct afs_call *call)
{
- struct afs_call *call =
- container_of(work, struct afs_call, async_work);
-
_enter("");
afs_free_call(call);
@@ -617,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work)
* - on a multiple-thread workqueue this work item may try to run on several
* CPUs at the same time
*/
-static void afs_process_async_call(struct work_struct *work)
+static void afs_process_async_call(struct afs_call *call)
{
- struct afs_call *call =
- container_of(work, struct afs_call, async_work);
-
_enter("");
if (!skb_queue_empty(&call->rx_queue))
@@ -634,14 +651,11 @@ static void afs_process_async_call(struct work_struct *work)
call->reply = NULL;
/* kill the call */
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- if (call->type->destructor)
- call->type->destructor(call);
+ afs_end_call_nofree(call);
/* we can't just delete the call because the work item may be
* queued */
- PREPARE_WORK(&call->async_work, afs_delete_async_call);
+ call->async_workfn = afs_delete_async_call;
queue_work(afs_async_calls, &call->async_work);
}
@@ -682,7 +696,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
return;
}
- INIT_WORK(&call->async_work, afs_process_async_call);
+ call->async_workfn = afs_process_async_call;
+ INIT_WORK(&call->async_work, afs_async_workfn);
call->wait_mode = &afs_async_incoming_call;
call->type = &afs_RXCMxxxx;
init_waitqueue_head(&call->waitq);
@@ -779,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call)
_debug("oom");
rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
default:
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" [error]");
return;
}
@@ -812,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
call->state = AFS_CALL_AWAIT_ACK;
n = rxrpc_kernel_send_data(call->rxcall, &msg, len);
if (n >= 0) {
+ /* Success */
_leave(" [replied]");
return;
}
+
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
}
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" [error]");
}
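Note (illustrative, not from the patch): the rxrpc.c changes replace PREPARE_WORK() (since removed from the workqueue API) with a fixed work function that dispatches through a per-call function pointer; changing which routine runs next is then just a pointer assignment. The general pattern, sketched with a hypothetical structure:

	struct my_call {
		struct work_struct	work;
		void (*workfn)(struct my_call *call);	/* swapped at runtime */
	};

	static void my_call_work(struct work_struct *work)
	{
		struct my_call *call = container_of(work, struct my_call, work);

		call->workfn(call);
	}

	/*
	 * Setup:  INIT_WORK(&call->work, my_call_work);
	 * Later:  call->workfn = my_call_cleanup;   (instead of PREPARE_WORK)
	 *         queue_work(my_wq, &call->work);
	 */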
diff --git a/fs/afs/security.c b/fs/afs/security.c
index f44b9d35537..8d010422dc8 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -285,14 +285,14 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
* - AFS ACLs are attached to directories only, and a file is controlled by its
* parent directory's ACL
*/
-int afs_permission(struct inode *inode, int mask, unsigned int flags)
+int afs_permission(struct inode *inode, int mask)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
afs_access_t uninitialized_var(access);
struct key *key;
int ret;
- if (flags & IPERM_FLAG_RCU)
+ if (mask & MAY_NOT_BLOCK)
return -ECHILD;
_enter("{{%x:%u},%lx},%x,",
@@ -350,7 +350,7 @@ int afs_permission(struct inode *inode, int mask, unsigned int flags)
}
key_put(key);
- ret = generic_permission(inode, mask, flags, NULL);
+ ret = generic_permission(inode, mask);
_leave(" = %d", ret);
return ret;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index d59b7516e94..f342acf3547 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -285,12 +285,7 @@ static void afs_reap_server(struct work_struct *work)
expiry = server->time_of_death + afs_server_timeout;
if (expiry > now) {
delay = (expiry - now) * HZ;
- if (!queue_delayed_work(afs_wq, &afs_server_reaper,
- delay)) {
- cancel_delayed_work(&afs_server_reaper);
- queue_delayed_work(afs_wq, &afs_server_reaper,
- delay);
- }
+ mod_delayed_work(afs_wq, &afs_server_reaper, delay);
break;
}
@@ -323,6 +318,5 @@ static void afs_reap_server(struct work_struct *work)
void __exit afs_purge_servers(void)
{
afs_server_timeout = 0;
- cancel_delayed_work(&afs_server_reaper);
- queue_delayed_work(afs_wq, &afs_server_reaper, 0);
+ mod_delayed_work(afs_wq, &afs_server_reaper, 0);
}
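Note (not part of the patch): the server.c and vlocation.c reaper changes depend on mod_delayed_work(), which either adjusts the timer of an already-pending delayed work item or queues it, closing the window the old cancel-then-requeue dance left open. A sketch with invented names:

	static void example_schedule_reaper(struct workqueue_struct *wq,
					    struct delayed_work *reaper,
					    unsigned long delay)
	{
		/*
		 * Previously: queue_delayed_work(); if that failed because the
		 * work was already pending, cancel_delayed_work() and requeue,
		 * leaving a gap in which the old timer could still fire.
		 */
		mod_delayed_work(wq, reaper, delay);	/* modify-or-queue, atomically */
	}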
diff --git a/fs/afs/super.c b/fs/afs/super.c
index fb240e8766d..c4861557e38 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -24,6 +24,8 @@
#include <linux/parser.h>
#include <linux/statfs.h>
#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
#include "internal.h"
#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
@@ -31,8 +33,8 @@
static void afs_i_init_once(void *foo);
static struct dentry *afs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data);
+static void afs_kill_super(struct super_block *sb);
static struct inode *afs_alloc_inode(struct super_block *sb);
-static void afs_put_super(struct super_block *sb);
static void afs_destroy_inode(struct inode *inode);
static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -40,9 +42,10 @@ struct file_system_type afs_fs_type = {
.owner = THIS_MODULE,
.name = "afs",
.mount = afs_mount,
- .kill_sb = kill_anon_super,
+ .kill_sb = afs_kill_super,
.fs_flags = 0,
};
+MODULE_ALIAS_FS("afs");
static const struct super_operations afs_super_ops = {
.statfs = afs_statfs,
@@ -50,7 +53,6 @@ static const struct super_operations afs_super_ops = {
.drop_inode = afs_drop_inode,
.destroy_inode = afs_destroy_inode,
.evict_inode = afs_evict_inode,
- .put_super = afs_put_super,
.show_options = generic_show_options,
};
@@ -124,6 +126,11 @@ void __exit afs_fs_exit(void)
BUG();
}
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(afs_inode_cachep);
_leave("");
}
@@ -282,43 +289,38 @@ static int afs_parse_device_name(struct afs_mount_params *params,
*/
static int afs_test_super(struct super_block *sb, void *data)
{
- struct afs_mount_params *params = data;
+ struct afs_super_info *as1 = data;
struct afs_super_info *as = sb->s_fs_info;
- return as->volume == params->volume;
+ return as->volume == as1->volume;
+}
+
+static int afs_set_super(struct super_block *sb, void *data)
+{
+ sb->s_fs_info = data;
+ return set_anon_super(sb, NULL);
}
/*
* fill in the superblock
*/
-static int afs_fill_super(struct super_block *sb, void *data)
+static int afs_fill_super(struct super_block *sb,
+ struct afs_mount_params *params)
{
- struct afs_mount_params *params = data;
- struct afs_super_info *as = NULL;
+ struct afs_super_info *as = sb->s_fs_info;
struct afs_fid fid;
- struct dentry *root = NULL;
struct inode *inode = NULL;
int ret;
_enter("");
- /* allocate a superblock info record */
- as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
- if (!as) {
- _leave(" = -ENOMEM");
- return -ENOMEM;
- }
-
- afs_get_volume(params->volume);
- as->volume = params->volume;
-
/* fill in the superblock */
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
- sb->s_fs_info = as;
sb->s_bdi = &as->volume->bdi;
+ strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
/* allocate the root inode and dentry */
fid.vid = as->volume->vid;
@@ -326,32 +328,22 @@ static int afs_fill_super(struct super_block *sb, void *data)
fid.unique = 1;
inode = afs_iget(sb, params->key, &fid, NULL, NULL);
if (IS_ERR(inode))
- goto error_inode;
+ return PTR_ERR(inode);
if (params->autocell)
set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
ret = -ENOMEM;
- root = d_alloc_root(inode);
- if (!root)
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
goto error;
sb->s_d_op = &afs_fs_dentry_operations;
- sb->s_root = root;
_leave(" = 0");
return 0;
-error_inode:
- ret = PTR_ERR(inode);
- inode = NULL;
error:
- iput(inode);
- afs_put_volume(as->volume);
- kfree(as);
-
- sb->s_fs_info = NULL;
-
_leave(" = %d", ret);
return ret;
}
@@ -367,12 +359,17 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
struct afs_volume *vol;
struct key *key;
char *new_opts = kstrdup(options, GFP_KERNEL);
+ struct afs_super_info *as;
int ret;
_enter(",,%s,%p", dev_name, options);
memset(&params, 0, sizeof(params));
+ ret = -EINVAL;
+ if (current->nsproxy->net_ns != &init_net)
+ goto error;
+
/* parse the options and device name */
if (options) {
ret = afs_parse_options(&params, options, &dev_name);
@@ -399,19 +396,28 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
ret = PTR_ERR(vol);
goto error;
}
- params.volume = vol;
+
+ /* allocate a superblock info record */
+ as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
+ if (!as) {
+ ret = -ENOMEM;
+ afs_put_volume(vol);
+ goto error;
+ }
+ as->volume = vol;
/* allocate a deviceless superblock */
- sb = sget(fs_type, afs_test_super, set_anon_super, &params);
+ sb = sget(fs_type, afs_test_super, afs_set_super, flags, as);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
+ afs_put_volume(vol);
+ kfree(as);
goto error;
}
if (!sb->s_root) {
/* initial superblock/root creation */
_debug("create");
- sb->s_flags = flags;
ret = afs_fill_super(sb, &params);
if (ret < 0) {
deactivate_locked_super(sb);
@@ -422,16 +428,16 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
} else {
_debug("reuse");
ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+ afs_put_volume(vol);
+ kfree(as);
}
- afs_put_volume(params.volume);
afs_put_cell(params.cell);
kfree(new_opts);
_leave(" = 0 [%p]", sb);
return dget(sb->s_root);
error:
- afs_put_volume(params.volume);
afs_put_cell(params.cell);
key_put(params.key);
kfree(new_opts);
@@ -439,18 +445,12 @@ error:
return ERR_PTR(ret);
}
-/*
- * finish the unmounting process on the superblock
- */
-static void afs_put_super(struct super_block *sb)
+static void afs_kill_super(struct super_block *sb)
{
struct afs_super_info *as = sb->s_fs_info;
-
- _enter("");
-
+ kill_anon_super(sb);
afs_put_volume(as->volume);
-
- _leave("");
+ kfree(as);
}
/*
@@ -503,7 +503,6 @@ static void afs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
struct afs_vnode *vnode = AFS_FS_I(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(afs_inode_cachep, vnode);
}
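Note (a sketch under stated assumptions, not the patch itself): the super.c rework allocates the fs-private info record before sget(), hands it over through a set_super callback, and frees it from ->kill_sb only after kill_anon_super() has evicted the inodes, so s_fs_info stays valid for the superblock's whole lifetime. The skeleton, with hypothetical names:

	static int examplefs_set_super(struct super_block *sb, void *data)
	{
		sb->s_fs_info = data;		/* take ownership before fill_super runs */
		return set_anon_super(sb, NULL);
	}

	static void examplefs_kill_super(struct super_block *sb)
	{
		void *info = sb->s_fs_info;

		kill_anon_super(sb);		/* evicts inodes while s_fs_info is valid */
		kfree(info);			/* only then free the private record */
	}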
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 431984d2e37..b6df2e83809 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -308,7 +308,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
/* see if we have an in-cache copy (will set vl->valid if there is) */
#ifdef CONFIG_AFS_FSCACHE
vl->cache = fscache_acquire_cookie(vl->cell->cache,
- &afs_vlocation_cache_index_def, vl);
+ &afs_vlocation_cache_index_def, vl,
+ true);
#endif
if (vl->valid) {
@@ -561,12 +562,7 @@ static void afs_vlocation_reaper(struct work_struct *work)
if (expiry > now) {
delay = (expiry - now) * HZ;
_debug("delay %lu", delay);
- if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
- delay)) {
- cancel_delayed_work(&afs_vlocation_reap);
- queue_delayed_work(afs_wq, &afs_vlocation_reap,
- delay);
- }
+ mod_delayed_work(afs_wq, &afs_vlocation_reap, delay);
break;
}
@@ -614,13 +610,10 @@ void afs_vlocation_purge(void)
spin_lock(&afs_vlocation_updates_lock);
list_del_init(&afs_vlocation_updates);
spin_unlock(&afs_vlocation_updates_lock);
- cancel_delayed_work(&afs_vlocation_update);
- queue_delayed_work(afs_vlocation_update_worker,
- &afs_vlocation_update, 0);
+ mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0);
destroy_workqueue(afs_vlocation_update_worker);
- cancel_delayed_work(&afs_vlocation_reap);
- queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
+ mod_delayed_work(afs_wq, &afs_vlocation_reap, 0);
}
/*
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 401eeb21869..2b607257820 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -131,7 +131,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
#ifdef CONFIG_AFS_FSCACHE
volume->cache = fscache_acquire_cookie(vlocation->cache,
&afs_volume_cache_index_def,
- volume);
+ volume, true);
#endif
afs_get_vlocation(vlocation);
volume->vlocation = vlocation;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 789b3afb342..ab6adfd5251 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/aio.h>
#include "internal.h"
static int afs_write_back_from_locked_page(struct afs_writeback *wb,
@@ -84,23 +85,21 @@ void afs_put_writeback(struct afs_writeback *wb)
* partly or wholly fill a page that's under preparation for writing
*/
static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
- loff_t pos, unsigned len, struct page *page)
+ loff_t pos, struct page *page)
{
loff_t i_size;
- unsigned eof;
int ret;
+ int len;
- _enter(",,%llu,%u", (unsigned long long)pos, len);
-
- ASSERTCMP(len, <=, PAGE_CACHE_SIZE);
+ _enter(",,%llu", (unsigned long long)pos);
i_size = i_size_read(&vnode->vfs_inode);
- if (pos + len > i_size)
- eof = i_size;
+ if (pos + PAGE_CACHE_SIZE > i_size)
+ len = i_size - pos;
else
- eof = PAGE_CACHE_SIZE;
+ len = PAGE_CACHE_SIZE;
- ret = afs_vnode_fetch_data(vnode, key, 0, eof, page);
+ ret = afs_vnode_fetch_data(vnode, key, pos, len, page);
if (ret < 0) {
if (ret == -ENOENT) {
_debug("got NOENT from server"
@@ -122,7 +121,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct afs_writeback *candidate, *wb;
- struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
struct page *page;
struct key *key = file->private_data;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
@@ -153,9 +152,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
*pagep = page;
/* page won't leak in error case: it eventually gets cleaned off LRU */
- if (!PageUptodate(page)) {
- _debug("not up to date");
- ret = afs_fill_page(vnode, key, pos, len, page);
+ if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) {
+ ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page);
if (ret < 0) {
kfree(candidate);
_leave(" = %d [prep]", ret);
@@ -248,7 +246,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
loff_t i_size, maybe_i_size;
_enter("{%x:%u},{%lx}",
@@ -627,16 +625,14 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
/*
* write to an AFS file
*/
-ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
- struct dentry *dentry = iocb->ki_filp->f_path.dentry;
- struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
ssize_t result;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(from);
- _enter("{%x.%u},{%zu},%lu,",
- vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
+ _enter("{%x.%u},{%zu},",
+ vnode->fid.vid, vnode->fid.vnode, count);
if (IS_SWAPFILE(&vnode->vfs_inode)) {
printk(KERN_INFO
@@ -647,7 +643,7 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (!count)
return 0;
- result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ result = generic_file_write_iter(iocb, from);
if (IS_ERR_VALUE(result)) {
_leave(" = %zd", result);
return result;
@@ -684,9 +680,10 @@ int afs_writeback_all(struct afs_vnode *vnode)
* - the return status from this call provides a reliable indication of
* whether any write errors occurred for this process.
*/
-int afs_fsync(struct file *file, int datasync)
+int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = file->f_mapping->host;
struct afs_writeback *wb, *xwb;
struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
int ret;
@@ -695,12 +692,19 @@ int afs_fsync(struct file *file, int datasync)
vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
datasync);
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+ mutex_lock(&inode->i_mutex);
+
/* use a writeback record as a marker in the queue - when this reaches
* the front of the queue, all the outstanding writes are either
* completed or rejected */
wb = kzalloc(sizeof(*wb), GFP_KERNEL);
- if (!wb)
- return -ENOMEM;
+ if (!wb) {
+ ret = -ENOMEM;
+ goto out;
+ }
wb->vnode = vnode;
wb->first = 0;
wb->last = -1;
@@ -723,7 +727,7 @@ int afs_fsync(struct file *file, int datasync)
if (ret < 0) {
afs_put_writeback(wb);
_leave(" = %d [wb]", ret);
- return ret;
+ goto out;
}
/* wait for the preceding writes to actually complete */
@@ -732,6 +736,8 @@ int afs_fsync(struct file *file, int datasync)
vnode->writebacks.next == &wb->link);
afs_put_writeback(wb);
_leave(" = %d", ret);
+out:
+ mutex_unlock(&inode->i_mutex);
return ret;
}
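Note (illustration, not from the patch): the new afs_fsync() follows the ranged ->fsync convention: write out and wait on the requested byte range with filemap_write_and_wait_range(), then take i_mutex for any fs-specific waiting. A minimal skeleton for a hypothetical filesystem:

	static int examplefs_fsync(struct file *file, loff_t start, loff_t end,
				   int datasync)
	{
		struct inode *inode = file->f_mapping->host;
		int ret;

		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
		if (ret)
			return ret;

		mutex_lock(&inode->i_mutex);
		/* ... fs-specific work: wait for server acks, flush metadata ... */
		mutex_unlock(&inode->i_mutex);
		return 0;
	}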
diff --git a/fs/aio.c b/fs/aio.c
index ebb6a22e4e1..1c9c5f0a9e2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -8,24 +8,25 @@
*
* See ../COPYING for licensing terms.
*/
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
-#define DEBUG 0
-
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mmu_context.h>
+#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
@@ -34,18 +35,120 @@
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
-#include <linux/mempool.h>
-#include <linux/hash.h>
#include <linux/compat.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>
+#include <linux/percpu-refcount.h>
+#include <linux/mount.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
-#if DEBUG > 1
-#define dprintk printk
-#else
-#define dprintk(x...) do { ; } while (0)
-#endif
+#include "internal.h"
+
+#define AIO_RING_MAGIC 0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES 1
+#define AIO_RING_INCOMPAT_FEATURES 0
+struct aio_ring {
+ unsigned id; /* kernel internal index number */
+ unsigned nr; /* number of io_events */
+ unsigned head; /* Written to by userland or under ring_lock
+ * mutex by aio_read_events_ring(). */
+ unsigned tail;
+
+ unsigned magic;
+ unsigned compat_features;
+ unsigned incompat_features;
+ unsigned header_length; /* size of aio_ring */
+
+
+ struct io_event io_events[0];
+}; /* 128 bytes + ring size */
+
+#define AIO_RING_PAGES 8
+
+struct kioctx_table {
+ struct rcu_head rcu;
+ unsigned nr;
+ struct kioctx *table[];
+};
+
+struct kioctx_cpu {
+ unsigned reqs_available;
+};
+
+struct kioctx {
+ struct percpu_ref users;
+ atomic_t dead;
+
+ struct percpu_ref reqs;
+
+ unsigned long user_id;
+
+ struct __percpu kioctx_cpu *cpu;
+
+ /*
+ * For percpu reqs_available, number of slots we move to/from global
+ * counter at a time:
+ */
+ unsigned req_batch;
+ /*
+ * This is what userspace passed to io_setup(), it's not used for
+ * anything but counting against the global max_reqs quota.
+ *
+ * The real limit is nr_events - 1, which will be larger (see
+ * aio_setup_ring())
+ */
+ unsigned max_reqs;
+
+ /* Size of ringbuffer, in units of struct io_event */
+ unsigned nr_events;
+
+ unsigned long mmap_base;
+ unsigned long mmap_size;
+
+ struct page **ring_pages;
+ long nr_pages;
+
+ struct work_struct free_work;
+
+ /*
+ * signals when all in-flight requests are done
+ */
+ struct completion *requests_done;
+
+ struct {
+ /*
+ * This counts the number of available slots in the ringbuffer,
+ * so we avoid overflowing it: it's decremented (if positive)
+ * when allocating a kiocb and incremented when the resulting
+ * io_event is pulled off the ringbuffer.
+ *
+ * We batch accesses to it with a percpu version.
+ */
+ atomic_t reqs_available;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ spinlock_t ctx_lock;
+ struct list_head active_reqs; /* used for cancellation */
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ struct mutex ring_lock;
+ wait_queue_head_t wait;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ unsigned tail;
+ spinlock_t completion_lock;
+ } ____cacheline_aligned_in_smp;
+
+ struct page *internal_pages[AIO_RING_PAGES];
+ struct file *aio_ring_file;
+
+ unsigned id;
+};
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
@@ -56,25 +159,51 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
-static struct workqueue_struct *aio_wq;
+static struct vfsmount *aio_mnt;
-/* Used for rare fput completion. */
-static void aio_fput_routine(struct work_struct *);
-static DECLARE_WORK(fput_work, aio_fput_routine);
+static const struct file_operations aio_ring_fops;
+static const struct address_space_operations aio_ctx_aops;
-static DEFINE_SPINLOCK(fput_lock);
-static LIST_HEAD(fput_head);
+static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
+{
+ struct qstr this = QSTR_INIT("[aio]", 5);
+ struct file *file;
+ struct path path;
+ struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ inode->i_mapping->a_ops = &aio_ctx_aops;
+ inode->i_mapping->private_data = ctx;
+ inode->i_size = PAGE_SIZE * nr_pages;
+
+ path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
+ if (!path.dentry) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ path.mnt = mntget(aio_mnt);
-#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
-#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
-struct aio_batch_entry {
- struct hlist_node list;
- struct address_space *mapping;
-};
-mempool_t *abe_pool;
+ d_instantiate(path.dentry, inode);
+ file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
+ if (IS_ERR(file)) {
+ path_put(&path);
+ return file;
+ }
-static void aio_kick_handler(struct work_struct *);
-static void aio_queue_work(struct kioctx *);
+ file->f_flags = O_RDWR;
+ file->private_data = ctx;
+ return file;
+}
+
+static struct dentry *aio_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ static const struct dentry_operations ops = {
+ .d_dname = simple_dname,
+ };
+ return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+}
/* aio_setup
* Creates the slab caches used by the aio routines, panic on
@@ -82,179 +211,410 @@ static void aio_queue_work(struct kioctx *);
*/
static int __init aio_setup(void)
{
+ static struct file_system_type aio_fs = {
+ .name = "aio",
+ .mount = aio_mount,
+ .kill_sb = kill_anon_super,
+ };
+ aio_mnt = kern_mount(&aio_fs);
+ if (IS_ERR(aio_mnt))
+ panic("Failed to create aio fs mount.");
+
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
- aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
- abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
- BUG_ON(!aio_wq || !abe_pool);
-
- pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+ pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
return 0;
}
__initcall(aio_setup);
+static void put_aio_ring_file(struct kioctx *ctx)
+{
+ struct file *aio_ring_file = ctx->aio_ring_file;
+ if (aio_ring_file) {
+ truncate_setsize(aio_ring_file->f_inode, 0);
+
+ /* Prevent further access to the kioctx from migratepages */
+ spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
+ aio_ring_file->f_inode->i_mapping->private_data = NULL;
+ ctx->aio_ring_file = NULL;
+ spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
+
+ fput(aio_ring_file);
+ }
+}
+
static void aio_free_ring(struct kioctx *ctx)
{
- struct aio_ring_info *info = &ctx->ring_info;
- long i;
+ int i;
+
+ /* Disconnect the kioctx from the ring file. This prevents future
+ * accesses to the kioctx from page migration.
+ */
+ put_aio_ring_file(ctx);
+
+ for (i = 0; i < ctx->nr_pages; i++) {
+ struct page *page;
+ pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+ page_count(ctx->ring_pages[i]));
+ page = ctx->ring_pages[i];
+ if (!page)
+ continue;
+ ctx->ring_pages[i] = NULL;
+ put_page(page);
+ }
+
+ if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
+ kfree(ctx->ring_pages);
+ ctx->ring_pages = NULL;
+ }
+}
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
+static const struct file_operations aio_ring_fops = {
+ .mmap = aio_ring_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_MIGRATION)
+static int aio_migratepage(struct address_space *mapping, struct page *new,
+ struct page *old, enum migrate_mode mode)
+{
+ struct kioctx *ctx;
+ unsigned long flags;
+ pgoff_t idx;
+ int rc;
- for (i=0; i<info->nr_pages; i++)
- put_page(info->ring_pages[i]);
+ rc = 0;
- if (info->mmap_size) {
- down_write(&ctx->mm->mmap_sem);
- do_munmap(ctx->mm, info->mmap_base, info->mmap_size);
- up_write(&ctx->mm->mmap_sem);
+ /* mapping->private_lock here protects against the kioctx teardown. */
+ spin_lock(&mapping->private_lock);
+ ctx = mapping->private_data;
+ if (!ctx) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* The ring_lock mutex. This prevents aio_read_events() from writing
+ * to the ring's head, and prevents page migration from mucking with
+ * a partially initialized kioctx.
+ */
+ if (!mutex_trylock(&ctx->ring_lock)) {
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ idx = old->index;
+ if (idx < (pgoff_t)ctx->nr_pages) {
+ /* Make sure the old page hasn't already been changed */
+ if (ctx->ring_pages[idx] != old)
+ rc = -EAGAIN;
+ } else
+ rc = -EINVAL;
+
+ if (rc != 0)
+ goto out_unlock;
+
+ /* Writeback must be complete */
+ BUG_ON(PageWriteback(old));
+ get_page(new);
+
+ rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
+ if (rc != MIGRATEPAGE_SUCCESS) {
+ put_page(new);
+ goto out_unlock;
}
- if (info->ring_pages && info->ring_pages != info->internal_pages)
- kfree(info->ring_pages);
- info->ring_pages = NULL;
- info->nr = 0;
+ /* Take completion_lock to prevent other writes to the ring buffer
+ * while the old page is copied to the new. This prevents new
+ * events from being lost.
+ */
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ migrate_page_copy(new, old);
+ BUG_ON(ctx->ring_pages[idx] != old);
+ ctx->ring_pages[idx] = new;
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ /* The old page is no longer accessible. */
+ put_page(old);
+
+out_unlock:
+ mutex_unlock(&ctx->ring_lock);
+out:
+ spin_unlock(&mapping->private_lock);
+ return rc;
}
+#endif
+
+static const struct address_space_operations aio_ctx_aops = {
+ .set_page_dirty = aio_set_page_dirty,
+#if IS_ENABLED(CONFIG_MIGRATION)
+ .migratepage = aio_migratepage,
+#endif
+};
static int aio_setup_ring(struct kioctx *ctx)
{
struct aio_ring *ring;
- struct aio_ring_info *info = &ctx->ring_info;
unsigned nr_events = ctx->max_reqs;
- unsigned long size;
+ struct mm_struct *mm = current->mm;
+ unsigned long size, unused;
int nr_pages;
+ int i;
+ struct file *file;
/* Compensate for the ring buffer's head/tail overlap entry */
nr_events += 2; /* 1 is required, 2 for good luck */
size = sizeof(struct aio_ring);
size += sizeof(struct io_event) * nr_events;
- nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
+ nr_pages = PFN_UP(size);
if (nr_pages < 0)
return -EINVAL;
- nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+ file = aio_private_file(ctx, nr_pages);
+ if (IS_ERR(file)) {
+ ctx->aio_ring_file = NULL;
+ return -ENOMEM;
+ }
- info->nr = 0;
- info->ring_pages = info->internal_pages;
+ ctx->aio_ring_file = file;
+ nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+ / sizeof(struct io_event);
+
+ ctx->ring_pages = ctx->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
- info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!info->ring_pages)
+ ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!ctx->ring_pages) {
+ put_aio_ring_file(ctx);
return -ENOMEM;
+ }
}
- info->mmap_size = nr_pages * PAGE_SIZE;
- dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
- down_write(&ctx->mm->mmap_sem);
- info->mmap_base = do_mmap(NULL, 0, info->mmap_size,
- PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE,
- 0);
- if (IS_ERR((void *)info->mmap_base)) {
- up_write(&ctx->mm->mmap_sem);
- info->mmap_size = 0;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page;
+ page = find_or_create_page(file->f_inode->i_mapping,
+ i, GFP_HIGHUSER | __GFP_ZERO);
+ if (!page)
+ break;
+ pr_debug("pid(%d) page[%d]->count=%d\n",
+ current->pid, i, page_count(page));
+ SetPageUptodate(page);
+ SetPageDirty(page);
+ unlock_page(page);
+
+ ctx->ring_pages[i] = page;
+ }
+ ctx->nr_pages = i;
+
+ if (unlikely(i != nr_pages)) {
aio_free_ring(ctx);
- return -EAGAIN;
+ return -ENOMEM;
}
- dprintk("mmap address: 0x%08lx\n", info->mmap_base);
- info->nr_pages = get_user_pages(current, ctx->mm,
- info->mmap_base, nr_pages,
- 1, 0, info->ring_pages, NULL);
- up_write(&ctx->mm->mmap_sem);
+ ctx->mmap_size = nr_pages * PAGE_SIZE;
+ pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
- if (unlikely(info->nr_pages != nr_pages)) {
+ down_write(&mm->mmap_sem);
+ ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED, 0, &unused);
+ up_write(&mm->mmap_sem);
+ if (IS_ERR((void *)ctx->mmap_base)) {
+ ctx->mmap_size = 0;
aio_free_ring(ctx);
- return -EAGAIN;
+ return -ENOMEM;
}
- ctx->user_id = info->mmap_base;
+ pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
- info->nr = nr_events; /* trusted copy */
+ ctx->user_id = ctx->mmap_base;
+ ctx->nr_events = nr_events; /* trusted copy */
- ring = kmap_atomic(info->ring_pages[0], KM_USER0);
+ ring = kmap_atomic(ctx->ring_pages[0]);
ring->nr = nr_events; /* user copy */
- ring->id = ctx->user_id;
+ ring->id = ~0U;
ring->head = ring->tail = 0;
ring->magic = AIO_RING_MAGIC;
ring->compat_features = AIO_RING_COMPAT_FEATURES;
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
- kunmap_atomic(ring, KM_USER0);
+ kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
return 0;
}
-
-/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic(, km). Release the pointer with put_aio_ring_event();
- */
#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
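These three macros encode how a logical event index maps onto the ring pages: page 0 loses AIO_EVENTS_OFFSET slots to the struct aio_ring header, so an index is shifted by that offset before being split into a page number and an in-page slot. They replace the aio_ring_event()/put_aio_ring_event() kmap helpers deleted just below; a small sketch of the arithmetic, assuming 4 KiB pages and 32-byte io_events (so 128 events per page, 127 on the first):

/* Illustrative sketch of the index math used by aio_complete() and
 * aio_read_events_ring() further down; not part of this patch. */
static void event_slot(struct kioctx *ctx, unsigned nr,
		       struct page **page, unsigned *slot)
{
	unsigned pos = nr + AIO_EVENTS_OFFSET;	/* skip the header's slots */

	*page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
	*slot = pos % AIO_EVENTS_PER_PAGE;	/* io_event within that page */
}
/* With the assumptions above AIO_EVENTS_OFFSET == 1, so event 126 lands in
 * page 0, slot 127 (the last slot after the header) and event 127 lands in
 * page 1, slot 0. */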
-#define aio_ring_event(info, nr, km) ({ \
- unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
- struct io_event *__event; \
- __event = kmap_atomic( \
- (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
- __event += pos % AIO_EVENTS_PER_PAGE; \
- __event; \
-})
-
-#define put_aio_ring_event(event, km) do { \
- struct io_event *__event = (event); \
- (void)__event; \
- kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
-} while(0)
-
-static void ctx_rcu_free(struct rcu_head *head)
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
{
- struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
- unsigned nr_events = ctx->max_reqs;
+ struct kioctx *ctx = req->ki_ctx;
+ unsigned long flags;
- kmem_cache_free(kioctx_cachep, ctx);
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
- if (nr_events) {
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - nr_events > aio_nr);
- aio_nr -= nr_events;
- spin_unlock(&aio_nr_lock);
- }
+ if (!req->ki_list.next)
+ list_add(&req->ki_list, &ctx->active_reqs);
+
+ req->ki_cancel = cancel;
+
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
+EXPORT_SYMBOL(kiocb_set_cancel_fn);
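kiocb_set_cancel_fn() is the exported hook a driver can use to make its queued kiocbs cancellable; kiocb_cancel() below then claims ->ki_cancel with cmpxchg() before invoking it, so a cancel function runs at most once. A hypothetical driver-side sketch (my_cancel() and my_aio_read() are invented names; note that the cancel callback in this tree takes only the kiocb):

/* Hypothetical usage sketch, not part of this patch. */
static int my_cancel(struct kiocb *iocb)
{
	/*
	 * Abort whatever hardware or queue work backs this iocb (driver
	 * specific).  The normal completion path is still expected to call
	 * aio_complete() once the abort takes effect.
	 */
	return 0;
}

static ssize_t my_aio_read(struct kiocb *iocb, const struct iovec *iov,
			   unsigned long nr_segs, loff_t pos)
{
	kiocb_set_cancel_fn(iocb, my_cancel);	/* request is now cancellable */
	/* ... queue the I/O asynchronously ... */
	return -EIOCBQUEUED;
}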
-/* __put_ioctx
- * Called when the last user of an aio context has gone away,
- * and the struct needs to be freed.
- */
-static void __put_ioctx(struct kioctx *ctx)
+static int kiocb_cancel(struct kiocb *kiocb)
{
- BUG_ON(ctx->reqs_active);
+ kiocb_cancel_fn *old, *cancel;
+
+ /*
+ * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
+ * actually has a cancel function, hence the cmpxchg()
+ */
+
+ cancel = ACCESS_ONCE(kiocb->ki_cancel);
+ do {
+ if (!cancel || cancel == KIOCB_CANCELLED)
+ return -EINVAL;
+
+ old = cancel;
+ cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
+ } while (cancel != old);
+
+ return cancel(kiocb);
+}
+
+static void free_ioctx(struct work_struct *work)
+{
+ struct kioctx *ctx = container_of(work, struct kioctx, free_work);
+
+ pr_debug("freeing %p\n", ctx);
- cancel_delayed_work(&ctx->wq);
- cancel_work_sync(&ctx->wq.work);
aio_free_ring(ctx);
- mmdrop(ctx->mm);
- ctx->mm = NULL;
- pr_debug("__put_ioctx: freeing %p\n", ctx);
- call_rcu(&ctx->rcu_head, ctx_rcu_free);
+ free_percpu(ctx->cpu);
+ kmem_cache_free(kioctx_cachep, ctx);
+}
+
+static void free_ioctx_reqs(struct percpu_ref *ref)
+{
+ struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
+
+ /* At this point we know that there are no in-flight requests left */
+ if (ctx->requests_done)
+ complete(ctx->requests_done);
+
+ INIT_WORK(&ctx->free_work, free_ioctx);
+ schedule_work(&ctx->free_work);
}
-static inline void get_ioctx(struct kioctx *kioctx)
+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
+ */
+static void free_ioctx_users(struct percpu_ref *ref)
{
- BUG_ON(atomic_read(&kioctx->users) <= 0);
- atomic_inc(&kioctx->users);
+ struct kioctx *ctx = container_of(ref, struct kioctx, users);
+ struct kiocb *req;
+
+ spin_lock_irq(&ctx->ctx_lock);
+
+ while (!list_empty(&ctx->active_reqs)) {
+ req = list_first_entry(&ctx->active_reqs,
+ struct kiocb, ki_list);
+
+ list_del_init(&req->ki_list);
+ kiocb_cancel(req);
+ }
+
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ percpu_ref_kill(&ctx->reqs);
+ percpu_ref_put(&ctx->reqs);
}
-static inline int try_get_ioctx(struct kioctx *kioctx)
+static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
{
- return atomic_inc_not_zero(&kioctx->users);
+ unsigned i, new_nr;
+ struct kioctx_table *table, *old;
+ struct aio_ring *ring;
+
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
+
+ while (1) {
+ if (table)
+ for (i = 0; i < table->nr; i++)
+ if (!table->table[i]) {
+ ctx->id = i;
+ table->table[i] = ctx;
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
+
+ /* While kioctx setup is in progress,
+ * we are protected from page migration
+ * changing ring_pages by ->ring_lock.
+ */
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->id = ctx->id;
+ kunmap_atomic(ring);
+ return 0;
+ }
+
+ new_nr = (table ? table->nr : 1) * 4;
+
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
+
+ table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
+ new_nr, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ table->nr = new_nr;
+
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ old = rcu_dereference(mm->ioctx_table);
+
+ if (!old) {
+ rcu_assign_pointer(mm->ioctx_table, table);
+ } else if (table->nr > old->nr) {
+ memcpy(table->table, old->table,
+ old->nr * sizeof(struct kioctx *));
+
+ rcu_assign_pointer(mm->ioctx_table, table);
+ kfree_rcu(old, rcu);
+ } else {
+ kfree(table);
+ table = old;
+ }
+ }
}
-static inline void put_ioctx(struct kioctx *kioctx)
+static void aio_nr_sub(unsigned nr)
{
- BUG_ON(atomic_read(&kioctx->users) <= 0);
- if (unlikely(atomic_dec_and_test(&kioctx->users)))
- __put_ioctx(kioctx);
+ spin_lock(&aio_nr_lock);
+ if (WARN_ON(aio_nr - nr > aio_nr))
+ aio_nr = 0;
+ else
+ aio_nr -= nr;
+ spin_unlock(&aio_nr_lock);
}
/* ioctx_alloc
@@ -262,9 +622,21 @@ static inline void put_ioctx(struct kioctx *kioctx)
*/
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
- struct mm_struct *mm;
+ struct mm_struct *mm = current->mm;
struct kioctx *ctx;
- int did_sync = 0;
+ int err = -ENOMEM;
+
+ /*
+ * We keep track of the number of available ringbuffer slots to prevent
+ * overflow (reqs_available), and we also use percpu counters for this.
+ *
+ * Since up to half the slots might be parked on other CPUs' percpu
+ * counters and thus unavailable, double nr_events so userspace gets the
+ * number of slots it asked for. Additionally, we move req_batch slots
+ * to/from the percpu counters at a time, so make sure req_batch isn't 0:
+ */
+ nr_events = max(nr_events, num_possible_cpus() * 4);
+ nr_events *= 2;
/* Prevent overflows */
if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -273,7 +645,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ERR_PTR(-EINVAL);
}
- if ((unsigned long)nr_events > aio_max_nr)
+ if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
return ERR_PTR(-EAGAIN);
ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -281,639 +653,303 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ERR_PTR(-ENOMEM);
ctx->max_reqs = nr_events;
- mm = ctx->mm = current->mm;
- atomic_inc(&mm->mm_count);
- atomic_set(&ctx->users, 1);
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
+ spin_lock_init(&ctx->completion_lock);
+ mutex_init(&ctx->ring_lock);
+ /* Protect against page migration throughout kioctx setup by keeping
+ * the ring_lock mutex held until setup is complete. */
+ mutex_lock(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
- INIT_LIST_HEAD(&ctx->run_list);
- INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
- if (aio_setup_ring(ctx) < 0)
- goto out_freectx;
+ if (percpu_ref_init(&ctx->users, free_ioctx_users))
+ goto err;
- /* limit the number of system wide aios */
- do {
- spin_lock_bh(&aio_nr_lock);
- if (aio_nr + nr_events > aio_max_nr ||
- aio_nr + nr_events < aio_nr)
- ctx->max_reqs = 0;
- else
- aio_nr += ctx->max_reqs;
- spin_unlock_bh(&aio_nr_lock);
- if (ctx->max_reqs || did_sync)
- break;
+ if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+ goto err;
- /* wait for rcu callbacks to have completed before giving up */
- synchronize_rcu();
- did_sync = 1;
- ctx->max_reqs = nr_events;
- } while (1);
+ ctx->cpu = alloc_percpu(struct kioctx_cpu);
+ if (!ctx->cpu)
+ goto err;
- if (ctx->max_reqs == 0)
- goto out_cleanup;
+ err = aio_setup_ring(ctx);
+ if (err < 0)
+ goto err;
- /* now link into global list. */
- spin_lock(&mm->ioctx_lock);
- hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
- spin_unlock(&mm->ioctx_lock);
+ atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
+ ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
+ if (ctx->req_batch < 1)
+ ctx->req_batch = 1;
- dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
- ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
- return ctx;
+ /* limit the number of system wide aios */
+ spin_lock(&aio_nr_lock);
+ if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
+ aio_nr + nr_events < aio_nr) {
+ spin_unlock(&aio_nr_lock);
+ err = -EAGAIN;
+ goto err_ctx;
+ }
+ aio_nr += ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
-out_cleanup:
- __put_ioctx(ctx);
- return ERR_PTR(-EAGAIN);
+ percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */
+ percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */
-out_freectx:
- mmdrop(mm);
- kmem_cache_free(kioctx_cachep, ctx);
- ctx = ERR_PTR(-ENOMEM);
+ err = ioctx_add_table(ctx, mm);
+ if (err)
+ goto err_cleanup;
+
+ /* Release the ring_lock mutex now that all setup is complete. */
+ mutex_unlock(&ctx->ring_lock);
- dprintk("aio: error allocating ioctx %p\n", ctx);
+ pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+ ctx, ctx->user_id, mm, ctx->nr_events);
return ctx;
+
+err_cleanup:
+ aio_nr_sub(ctx->max_reqs);
+err_ctx:
+ aio_free_ring(ctx);
+err:
+ mutex_unlock(&ctx->ring_lock);
+ free_percpu(ctx->cpu);
+ free_percpu(ctx->reqs.pcpu_count);
+ free_percpu(ctx->users.pcpu_count);
+ kmem_cache_free(kioctx_cachep, ctx);
+ pr_debug("error allocating ioctx %d\n", err);
+ return ERR_PTR(err);
}
-/* aio_cancel_all
- * Cancels all outstanding aio requests on an aio context. Used
- * when the processes owning a context have all exited to encourage
+/* kill_ioctx
+ * Cancels all outstanding aio requests on an aio context. Used
+ * when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void aio_cancel_all(struct kioctx *ctx)
+static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+ struct completion *requests_done)
{
- int (*cancel)(struct kiocb *, struct io_event *);
- struct io_event res;
- spin_lock_irq(&ctx->ctx_lock);
- ctx->dead = 1;
- while (!list_empty(&ctx->active_reqs)) {
- struct list_head *pos = ctx->active_reqs.next;
- struct kiocb *iocb = list_kiocb(pos);
- list_del_init(&iocb->ki_list);
- cancel = iocb->ki_cancel;
- kiocbSetCancelled(iocb);
- if (cancel) {
- iocb->ki_users++;
- spin_unlock_irq(&ctx->ctx_lock);
- cancel(iocb, &res);
- spin_lock_irq(&ctx->ctx_lock);
- }
- }
- spin_unlock_irq(&ctx->ctx_lock);
-}
+ struct kioctx_table *table;
-static void wait_for_all_aios(struct kioctx *ctx)
-{
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
+ if (atomic_xchg(&ctx->dead, 1))
+ return -EINVAL;
- spin_lock_irq(&ctx->ctx_lock);
- if (!ctx->reqs_active)
- goto out;
- add_wait_queue(&ctx->wait, &wait);
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- while (ctx->reqs_active) {
- spin_unlock_irq(&ctx->ctx_lock);
- io_schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- spin_lock_irq(&ctx->ctx_lock);
- }
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
-out:
- spin_unlock_irq(&ctx->ctx_lock);
+ WARN_ON(ctx != table->table[ctx->id]);
+ table->table[ctx->id] = NULL;
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
+
+ /* percpu_ref_kill() will do the necessary call_rcu() */
+ wake_up_all(&ctx->wait);
+
+ /*
+ * It'd be more correct to do this in free_ioctx(), after all
+ * the outstanding kiocbs have finished - but by then io_destroy
+ * has already returned, so io_setup() could potentially return
+ * -EAGAIN with no ioctxs actually in use (as far as userspace
+ * could tell).
+ */
+ aio_nr_sub(ctx->max_reqs);
+
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
+
+ ctx->requests_done = requests_done;
+ percpu_ref_kill(&ctx->users);
+ return 0;
}
/* wait_on_sync_kiocb:
* Waits on the given sync kiocb to complete.
*/
-ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
+ssize_t wait_on_sync_kiocb(struct kiocb *req)
{
- while (iocb->ki_users) {
+ while (!req->ki_ctx) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (!iocb->ki_users)
+ if (req->ki_ctx)
break;
io_schedule();
}
__set_current_state(TASK_RUNNING);
- return iocb->ki_user_data;
+ return req->ki_user_data;
}
EXPORT_SYMBOL(wait_on_sync_kiocb);
-/* exit_aio: called when the last user of mm goes away. At this point,
- * there is no way for any new requests to be submited or any of the
- * io_* syscalls to be called on the context. However, there may be
- * outstanding requests which hold references to the context; as they
- * go away, they will call put_ioctx and release any pinned memory
- * associated with the request (held via struct page * references).
+/*
+ * exit_aio: called when the last user of mm goes away. At this point, there is
+ * no way for any new requests to be submitted or any of the io_* syscalls to be
+ * called on the context.
+ *
+ * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
+ * them.
*/
void exit_aio(struct mm_struct *mm)
{
+ struct kioctx_table *table;
struct kioctx *ctx;
+ unsigned i = 0;
+
+ while (1) {
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
+
+ do {
+ if (!table || i >= table->nr) {
+ rcu_read_unlock();
+ rcu_assign_pointer(mm->ioctx_table, NULL);
+ if (table)
+ kfree(table);
+ return;
+ }
- while (!hlist_empty(&mm->ioctx_list)) {
- ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
- hlist_del_rcu(&ctx->list);
+ ctx = table->table[i++];
+ } while (!ctx);
- aio_cancel_all(ctx);
+ rcu_read_unlock();
- wait_for_all_aios(ctx);
/*
- * Ensure we don't leave the ctx on the aio_wq
+ * We don't need to bother with munmap() here -
+ * exit_mmap(mm) is coming and it'll unmap everything.
+ * Since kill_ioctx() uses a non-zero ->mmap_size
+ * as an indicator that it needs to unmap the area,
+ * just set it to 0; kill_ioctx() is the only
+ * place that tests ->mmap_size, so it's safe.
*/
- cancel_work_sync(&ctx->wq.work);
-
- if (1 != atomic_read(&ctx->users))
- printk(KERN_DEBUG
- "exit_aio:ioctx still alive: %d %d %d\n",
- atomic_read(&ctx->users), ctx->dead,
- ctx->reqs_active);
- put_ioctx(ctx);
+ ctx->mmap_size = 0;
+
+ kill_ioctx(mm, ctx, NULL);
}
}
-/* aio_get_req
- * Allocate a slot for an aio request. Increments the users count
- * of the kioctx so that the kioctx stays around until all requests are
- * complete. Returns NULL if no requests are free.
- *
- * Returns with kiocb->users set to 2. The io submit code path holds
- * an extra reference while submitting the i/o.
- * This prevents races between the aio code path referencing the
- * req (after submitting it) and aio_complete() freeing the req.
- */
-static struct kiocb *__aio_get_req(struct kioctx *ctx)
+static void put_reqs_available(struct kioctx *ctx, unsigned nr)
{
- struct kiocb *req = NULL;
- struct aio_ring *ring;
- int okay = 0;
+ struct kioctx_cpu *kcpu;
+ unsigned long flags;
- req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
- if (unlikely(!req))
- return NULL;
+ preempt_disable();
+ kcpu = this_cpu_ptr(ctx->cpu);
- req->ki_flags = 0;
- req->ki_users = 2;
- req->ki_key = 0;
- req->ki_ctx = ctx;
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
- req->ki_dtor = NULL;
- req->private = NULL;
- req->ki_iovec = NULL;
- INIT_LIST_HEAD(&req->ki_run_list);
- req->ki_eventfd = NULL;
-
- /* Check if the completion queue has enough free space to
- * accept an event from this io.
- */
- spin_lock_irq(&ctx->ctx_lock);
- ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0);
- if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) {
- list_add(&req->ki_list, &ctx->active_reqs);
- ctx->reqs_active++;
- okay = 1;
- }
- kunmap_atomic(ring, KM_USER0);
- spin_unlock_irq(&ctx->ctx_lock);
+ local_irq_save(flags);
+ kcpu->reqs_available += nr;
- if (!okay) {
- kmem_cache_free(kiocb_cachep, req);
- req = NULL;
+ while (kcpu->reqs_available >= ctx->req_batch * 2) {
+ kcpu->reqs_available -= ctx->req_batch;
+ atomic_add(ctx->req_batch, &ctx->reqs_available);
}
- return req;
-}
-
-static inline struct kiocb *aio_get_req(struct kioctx *ctx)
-{
- struct kiocb *req;
- /* Handle a potential starvation case -- should be exceedingly rare as
- * requests will be stuck on fput_head only if the aio_fput_routine is
- * delayed and the requests were the last user of the struct file.
- */
- req = __aio_get_req(ctx);
- if (unlikely(NULL == req)) {
- aio_fput_routine(NULL);
- req = __aio_get_req(ctx);
- }
- return req;
+ local_irq_restore(flags);
+ preempt_enable();
}
-static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
+static bool get_reqs_available(struct kioctx *ctx)
{
- assert_spin_locked(&ctx->ctx_lock);
-
- if (req->ki_eventfd != NULL)
- eventfd_ctx_put(req->ki_eventfd);
- if (req->ki_dtor)
- req->ki_dtor(req);
- if (req->ki_iovec != &req->ki_inline_vec)
- kfree(req->ki_iovec);
- kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
+ struct kioctx_cpu *kcpu;
+ bool ret = false;
+ unsigned long flags;
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
-}
+ preempt_disable();
+ kcpu = this_cpu_ptr(ctx->cpu);
-static void aio_fput_routine(struct work_struct *data)
-{
- spin_lock_irq(&fput_lock);
- while (likely(!list_empty(&fput_head))) {
- struct kiocb *req = list_kiocb(fput_head.next);
- struct kioctx *ctx = req->ki_ctx;
+ local_irq_save(flags);
+ if (!kcpu->reqs_available) {
+ int old, avail = atomic_read(&ctx->reqs_available);
- list_del(&req->ki_list);
- spin_unlock_irq(&fput_lock);
-
- /* Complete the fput(s) */
- if (req->ki_filp != NULL)
- fput(req->ki_filp);
+ do {
+ if (avail < ctx->req_batch)
+ goto out;
- /* Link the iocb into the context's free list */
- spin_lock_irq(&ctx->ctx_lock);
- really_put_req(ctx, req);
- spin_unlock_irq(&ctx->ctx_lock);
+ old = avail;
+ avail = atomic_cmpxchg(&ctx->reqs_available,
+ avail, avail - ctx->req_batch);
+ } while (avail != old);
- put_ioctx(ctx);
- spin_lock_irq(&fput_lock);
+ kcpu->reqs_available += ctx->req_batch;
}
- spin_unlock_irq(&fput_lock);
+
+ ret = true;
+ kcpu->reqs_available--;
+out:
+ local_irq_restore(flags);
+ preempt_enable();
+ return ret;
}
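To make the batching in get_reqs_available()/put_reqs_available() concrete, here is a worked example (illustrative only), assuming 4 possible CPUs, 4 KiB pages and the 32-byte struct aio_ring / struct io_event sizes implied by the layout above, with a user calling io_setup(128); the numbers follow ioctx_alloc() and aio_setup_ring() elsewhere in this patch:

/*
 * ioctx_alloc():    nr_events = max(128, 4*4) * 2              = 256
 * aio_setup_ring(): 256 + 2 events need 3 pages, so
 *                   nr_events = (3*4096 - 32) / 32             = 383
 * ioctx_alloc():    reqs_available = 383 - 1                   = 382
 *                   req_batch      = 382 / (4*4)               = 23
 *
 * get_reqs_available(): a CPU whose local counter is empty pulls req_batch
 * (23) slots from the global atomic with one cmpxchg loop.
 * put_reqs_available(): once a CPU has accumulated 2*req_batch (46) free
 * slots it hands req_batch of them back, so at most about 2*req_batch slots
 * per CPU sit parked locally - which is why ioctx_alloc() doubles nr_events
 * up front.
 */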
-/* __aio_put_req
- * Returns true if this put was the last user of the request.
+/* aio_get_req
+ * Allocate a slot for an aio request.
+ * Returns NULL if no requests are free.
*/
-static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
- dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
- req, atomic_long_read(&req->ki_filp->f_count));
+ struct kiocb *req;
- assert_spin_locked(&ctx->ctx_lock);
+ if (!get_reqs_available(ctx))
+ return NULL;
- req->ki_users--;
- BUG_ON(req->ki_users < 0);
- if (likely(req->ki_users))
- return 0;
- list_del(&req->ki_list); /* remove from active_reqs */
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
+ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+ if (unlikely(!req))
+ goto out_put;
- /*
- * Try to optimize the aio and eventfd file* puts, by avoiding to
- * schedule work in case it is not final fput() time. In normal cases,
- * we would not be holding the last reference to the file*, so
- * this function will be executed w/out any aio kthread wakeup.
- */
- if (unlikely(!fput_atomic(req->ki_filp))) {
- get_ioctx(ctx);
- spin_lock(&fput_lock);
- list_add(&req->ki_list, &fput_head);
- spin_unlock(&fput_lock);
- schedule_work(&fput_work);
- } else {
- req->ki_filp = NULL;
- really_put_req(ctx, req);
- }
- return 1;
+ percpu_ref_get(&ctx->reqs);
+
+ req->ki_ctx = ctx;
+ return req;
+out_put:
+ put_reqs_available(ctx, 1);
+ return NULL;
}
-/* aio_put_req
- * Returns true if this put was the last user of the kiocb,
- * false if the request is still in use.
- */
-int aio_put_req(struct kiocb *req)
+static void kiocb_free(struct kiocb *req)
{
- struct kioctx *ctx = req->ki_ctx;
- int ret;
- spin_lock_irq(&ctx->ctx_lock);
- ret = __aio_put_req(ctx, req);
- spin_unlock_irq(&ctx->ctx_lock);
- return ret;
+ if (req->ki_filp)
+ fput(req->ki_filp);
+ if (req->ki_eventfd != NULL)
+ eventfd_ctx_put(req->ki_eventfd);
+ kmem_cache_free(kiocb_cachep, req);
}
-EXPORT_SYMBOL(aio_put_req);
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
+ struct aio_ring __user *ring = (void __user *)ctx_id;
struct mm_struct *mm = current->mm;
struct kioctx *ctx, *ret = NULL;
- struct hlist_node *n;
-
- rcu_read_lock();
-
- hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
- /*
- * RCU protects us against accessing freed memory but
- * we have to be careful not to get a reference when the
- * reference count already dropped to 0 (ctx->dead test
- * is unreliable because of races).
- */
- if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
- ret = ctx;
- break;
- }
- }
-
- rcu_read_unlock();
- return ret;
-}
-
-/*
- * Queue up a kiocb to be retried. Assumes that the kiocb
- * has already been marked as kicked, and places it on
- * the retry run list for the corresponding ioctx, if it
- * isn't already queued. Returns 1 if it actually queued
- * the kiocb (to tell the caller to activate the work
- * queue to process it), or 0, if it found that it was
- * already queued.
- */
-static inline int __queue_kicked_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
-
- assert_spin_locked(&ctx->ctx_lock);
+ struct kioctx_table *table;
+ unsigned id;
- if (list_empty(&iocb->ki_run_list)) {
- list_add_tail(&iocb->ki_run_list,
- &ctx->run_list);
- return 1;
- }
- return 0;
-}
-
-/* aio_run_iocb
- * This is the core aio execution routine. It is
- * invoked both for initial i/o submission and
- * subsequent retries via the aio_kick_handler.
- * Expects to be invoked with iocb->ki_ctx->lock
- * already held. The lock is released and reacquired
- * as needed during processing.
- *
- * Calls the iocb retry method (already setup for the
- * iocb on initial submission) for operation specific
- * handling, but takes care of most of common retry
- * execution details for a given iocb. The retry method
- * needs to be non-blocking as far as possible, to avoid
- * holding up other iocbs waiting to be serviced by the
- * retry kernel thread.
- *
- * The trickier parts in this code have to do with
- * ensuring that only one retry instance is in progress
- * for a given iocb at any time. Providing that guarantee
- * simplifies the coding of individual aio operations as
- * it avoids various potential races.
- */
-static ssize_t aio_run_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- ssize_t (*retry)(struct kiocb *);
- ssize_t ret;
-
- if (!(retry = iocb->ki_retry)) {
- printk("aio_run_iocb: iocb->ki_retry = NULL\n");
- return 0;
- }
-
- /*
- * We don't want the next retry iteration for this
- * operation to start until this one has returned and
- * updated the iocb state. However, wait_queue functions
- * can trigger a kick_iocb from interrupt context in the
- * meantime, indicating that data is available for the next
- * iteration. We want to remember that and enable the
- * next retry iteration _after_ we are through with
- * this one.
- *
- * So, in order to be able to register a "kick", but
- * prevent it from being queued now, we clear the kick
- * flag, but make the kick code *think* that the iocb is
- * still on the run list until we are actually done.
- * When we are done with this iteration, we check if
- * the iocb was kicked in the meantime and if so, queue
- * it up afresh.
- */
-
- kiocbClearKicked(iocb);
+ if (get_user(id, &ring->id))
+ return NULL;
- /*
- * This is so that aio_complete knows it doesn't need to
- * pull the iocb off the run list (We can't just call
- * INIT_LIST_HEAD because we don't want a kick_iocb to
- * queue this on the run list yet)
- */
- iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
- spin_unlock_irq(&ctx->ctx_lock);
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
- /* Quit retrying if the i/o has been cancelled */
- if (kiocbIsCancelled(iocb)) {
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- /* must not access the iocb after this */
+ if (!table || id >= table->nr)
goto out;
- }
- /*
- * Now we are all set to call the retry method in async
- * context.
- */
- ret = retry(iocb);
-
- if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
- /*
- * There's no easy way to restart the syscall since other AIO's
- * may be already running. Just fail this IO with EINTR.
- */
- if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
- ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
+ ctx = table->table[id];
+ if (ctx && ctx->user_id == ctx_id) {
+ percpu_ref_get(&ctx->users);
+ ret = ctx;
}
out:
- spin_lock_irq(&ctx->ctx_lock);
-
- if (-EIOCBRETRY == ret) {
- /*
- * OK, now that we are done with this iteration
- * and know that there is more left to go,
- * this is where we let go so that a subsequent
- * "kick" can start the next iteration
- */
-
- /* will make __queue_kicked_iocb succeed from here on */
- INIT_LIST_HEAD(&iocb->ki_run_list);
- /* we must queue the next iteration ourselves, if it
- * has already been kicked */
- if (kiocbIsKicked(iocb)) {
- __queue_kicked_iocb(iocb);
-
- /*
- * __queue_kicked_iocb will always return 1 here, because
- * iocb->ki_run_list is empty at this point so it should
- * be safe to unconditionally queue the context into the
- * work queue.
- */
- aio_queue_work(ctx);
- }
- }
+ rcu_read_unlock();
return ret;
}
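With the kioctx table in place, lookup no longer walks an RCU list: the ctx_id userspace hands back is the ring's mmap address, get_user() reads ring->id out of its first page (written by ioctx_add_table() during setup and left at ~0U until then, so premature lookups fail the bounds check), that id indexes mm->ioctx_table, and the final user_id comparison rejects ids whose table slot has since been reused. A worked example with made-up numbers:

/*
 * Illustrative only.  Suppose io_setup() returned ctx_id = 0x7f1200000000
 * (the ring mapping base) and ioctx_add_table() stored the kioctx in slot 2,
 * so the mapped ring page now has ring->id == 2:
 *
 *   get_user(id, &ring->id)            ->  id = 2
 *   id < table->nr                     ->  ctx = table->table[2]
 *   ctx->user_id == 0x7f1200000000     ->  percpu_ref_get(&ctx->users), done
 *
 * A stale ctx_id whose slot was reused by a later io_setup() fails the
 * user_id comparison; one whose ring was never wired up still reads ~0U
 * and fails the bounds check instead.
 */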
-/*
- * __aio_run_iocbs:
- * Process all pending retries queued on the ioctx
- * run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static int __aio_run_iocbs(struct kioctx *ctx)
-{
- struct kiocb *iocb;
- struct list_head run_list;
-
- assert_spin_locked(&ctx->ctx_lock);
-
- list_replace_init(&ctx->run_list, &run_list);
- while (!list_empty(&run_list)) {
- iocb = list_entry(run_list.next, struct kiocb,
- ki_run_list);
- list_del(&iocb->ki_run_list);
- /*
- * Hold an extra reference while retrying i/o.
- */
- iocb->ki_users++; /* grab extra reference */
- aio_run_iocb(iocb);
- __aio_put_req(ctx, iocb);
- }
- if (!list_empty(&ctx->run_list))
- return 1;
- return 0;
-}
-
-static void aio_queue_work(struct kioctx * ctx)
-{
- unsigned long timeout;
- /*
- * if someone is waiting, get the work started right
- * away, otherwise, use a longer delay
- */
- smp_mb();
- if (waitqueue_active(&ctx->wait))
- timeout = 1;
- else
- timeout = HZ/10;
- queue_delayed_work(aio_wq, &ctx->wq, timeout);
-}
-
-/*
- * aio_run_all_iocbs:
- * Process all pending retries queued on the ioctx
- * run list, and keep running them until the list
- * stays empty.
- * Assumes it is operating within the aio issuer's mm context.
- */
-static inline void aio_run_all_iocbs(struct kioctx *ctx)
-{
- spin_lock_irq(&ctx->ctx_lock);
- while (__aio_run_iocbs(ctx))
- ;
- spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * aio_kick_handler:
- * Work queue handler triggered to process pending
- * retries on an ioctx. Takes on the aio issuer's
- * mm context before running the iocbs, so that
- * copy_xxx_user operates on the issuer's address
- * space.
- * Run on aiod's context.
- */
-static void aio_kick_handler(struct work_struct *work)
-{
- struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
- mm_segment_t oldfs = get_fs();
- struct mm_struct *mm;
- int requeue;
-
- set_fs(USER_DS);
- use_mm(ctx->mm);
- spin_lock_irq(&ctx->ctx_lock);
- requeue =__aio_run_iocbs(ctx);
- mm = ctx->mm;
- spin_unlock_irq(&ctx->ctx_lock);
- unuse_mm(mm);
- set_fs(oldfs);
- /*
- * we're in a worker thread already, don't use queue_delayed_work,
- */
- if (requeue)
- queue_delayed_work(aio_wq, &ctx->wq, 0);
-}
-
-
-/*
- * Called by kick_iocb to queue the kiocb for retry
- * and if required activate the aio work queue to process
- * it
- */
-static void try_queue_kicked_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- unsigned long flags;
- int run = 0;
-
- spin_lock_irqsave(&ctx->ctx_lock, flags);
- /* set this inside the lock so that we can't race with aio_run_iocb()
- * testing it and putting the iocb on the run list under the lock */
- if (!kiocbTryKick(iocb))
- run = __queue_kicked_iocb(iocb);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- if (run)
- aio_queue_work(ctx);
-}
-
-/*
- * kick_iocb:
- * Called typically from a wait queue callback context
- * to trigger a retry of the iocb.
- * The retry is usually executed by aio workqueue
- * threads (See aio_kick_handler).
- */
-void kick_iocb(struct kiocb *iocb)
-{
- /* sync iocbs are easy: they can only ever be executing from a
- * single context. */
- if (is_sync_kiocb(iocb)) {
- kiocbSetKicked(iocb);
- wake_up_process(iocb->ki_obj.tsk);
- return;
- }
-
- try_queue_kicked_iocb(iocb);
-}
-EXPORT_SYMBOL(kick_iocb);
-
/* aio_complete
* Called when the io request on the given iocb is complete.
- * Returns true if this is the last user of the request. The
- * only other user of the request can be the cancellation code.
*/
-int aio_complete(struct kiocb *iocb, long res, long res2)
+void aio_complete(struct kiocb *iocb, long res, long res2)
{
struct kioctx *ctx = iocb->ki_ctx;
- struct aio_ring_info *info;
struct aio_ring *ring;
- struct io_event *event;
+ struct io_event *ev_page, *event;
unsigned long flags;
- unsigned long tail;
- int ret;
+ unsigned tail, pos;
/*
* Special case handling for sync iocbs:
@@ -923,61 +959,64 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
* - the sync task helpfully left a reference to itself in the iocb
*/
if (is_sync_kiocb(iocb)) {
- BUG_ON(iocb->ki_users != 1);
iocb->ki_user_data = res;
- iocb->ki_users = 0;
+ smp_wmb();
+ iocb->ki_ctx = ERR_PTR(-EXDEV);
wake_up_process(iocb->ki_obj.tsk);
- return 1;
+ return;
}
- info = &ctx->ring_info;
+ if (iocb->ki_list.next) {
+ unsigned long flags;
- /* add a completion event to the ring buffer.
- * must be done holding ctx->ctx_lock to prevent
- * other code from messing with the tail
- * pointer since we might be called from irq
- * context.
- */
- spin_lock_irqsave(&ctx->ctx_lock, flags);
-
- if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
- list_del_init(&iocb->ki_run_list);
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_del(&iocb->ki_list);
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ }
/*
- * cancelled requests don't get events, userland was given one
- * when the event got cancelled.
+ * Add a completion event to the ring buffer. Must be done holding
+ * ctx->completion_lock to prevent other code from messing with the tail
+ * pointer since we might be called from irq context.
*/
- if (kiocbIsCancelled(iocb))
- goto put_rq;
+ spin_lock_irqsave(&ctx->completion_lock, flags);
- ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+ tail = ctx->tail;
+ pos = tail + AIO_EVENTS_OFFSET;
- tail = info->tail;
- event = aio_ring_event(info, tail, KM_IRQ0);
- if (++tail >= info->nr)
+ if (++tail >= ctx->nr_events)
tail = 0;
+ ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+
event->obj = (u64)(unsigned long)iocb->ki_obj.user;
event->data = iocb->ki_user_data;
event->res = res;
event->res2 = res2;
- dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
- res, res2);
+ kunmap_atomic(ev_page);
+ flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+
+ pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+ ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+ res, res2);
/* after flagging the request as done, we
* must never even look at it again
*/
smp_wmb(); /* make event visible before updating tail */
- info->tail = tail;
+ ctx->tail = tail;
+
+ ring = kmap_atomic(ctx->ring_pages[0]);
ring->tail = tail;
+ kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
- put_aio_ring_event(event, KM_IRQ0);
- kunmap_atomic(ring, KM_IRQ1);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ pr_debug("added to ring %p at [%u]\n", iocb, tail);
/*
* Check if the user asked us to deliver the result through an
@@ -987,9 +1026,9 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
if (iocb->ki_eventfd != NULL)
eventfd_signal(iocb->ki_eventfd, 1);
-put_rq:
/* everything turned out well, dispose of the aiocb. */
- ret = __aio_put_req(ctx, iocb);
+ kiocb_free(iocb);
+ put_reqs_available(ctx, 1);
/*
* We have to order our ring_info tail store above and test
@@ -1002,235 +1041,136 @@ put_rq:
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- return ret;
+ percpu_ref_put(&ctx->reqs);
}
EXPORT_SYMBOL(aio_complete);
-/* aio_read_evt
- * Pull an event off of the ioctx's event ring. Returns the number of
- * events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
- * TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events_ring
+ * Pull events off of the ioctx's event ring. Returns the number of
+ * events fetched.
*/
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+ struct io_event __user *event, long nr)
{
- struct aio_ring_info *info = &ioctx->ring_info;
struct aio_ring *ring;
- unsigned long head;
- int ret = 0;
-
- ring = kmap_atomic(info->ring_pages[0], KM_USER0);
- dprintk("in aio_read_evt h%lu t%lu m%lu\n",
- (unsigned long)ring->head, (unsigned long)ring->tail,
- (unsigned long)ring->nr);
-
- if (ring->head == ring->tail)
- goto out;
+ unsigned head, tail, pos;
+ long ret = 0;
+ int copy_ret;
- spin_lock(&info->ring_lock);
-
- head = ring->head % info->nr;
- if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head, KM_USER1);
- *ent = *evp;
- head = (head + 1) % info->nr;
- smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
- ret = 1;
- put_aio_ring_event(evp, KM_USER1);
- }
- spin_unlock(&info->ring_lock);
+ mutex_lock(&ctx->ring_lock);
-out:
- kunmap_atomic(ring, KM_USER0);
- dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
- (unsigned long)ring->head, (unsigned long)ring->tail);
- return ret;
-}
+ /* Access to ->ring_pages here is protected by ctx->ring_lock. */
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
+ tail = ring->tail;
+ kunmap_atomic(ring);
-struct aio_timeout {
- struct timer_list timer;
- int timed_out;
- struct task_struct *p;
-};
+ pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
-static void timeout_func(unsigned long data)
-{
- struct aio_timeout *to = (struct aio_timeout *)data;
-
- to->timed_out = 1;
- wake_up_process(to->p);
-}
+ if (head == tail)
+ goto out;
-static inline void init_timeout(struct aio_timeout *to)
-{
- setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
- to->timed_out = 0;
- to->p = current;
-}
+ head %= ctx->nr_events;
+ tail %= ctx->nr_events;
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
- const struct timespec *ts)
-{
- to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
- if (time_after(to->timer.expires, jiffies))
- add_timer(&to->timer);
- else
- to->timed_out = 1;
-}
+ while (ret < nr) {
+ long avail;
+ struct io_event *ev;
+ struct page *page;
-static inline void clear_timeout(struct aio_timeout *to)
-{
- del_singleshot_timer_sync(&to->timer);
-}
-
-static int read_events(struct kioctx *ctx,
- long min_nr, long nr,
- struct io_event __user *event,
- struct timespec __user *timeout)
-{
- long start_jiffies = jiffies;
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- int ret;
- int i = 0;
- struct io_event ent;
- struct aio_timeout to;
- int retry = 0;
-
- /* needed to zero any padding within an entry (there shouldn't be
- * any, but C is fun!
- */
- memset(&ent, 0, sizeof(ent));
-retry:
- ret = 0;
- while (likely(i < nr)) {
- ret = aio_read_evt(ctx, &ent);
- if (unlikely(ret <= 0))
+ avail = (head <= tail ? tail : ctx->nr_events) - head;
+ if (head == tail)
break;
- dprintk("read event: %Lx %Lx %Lx %Lx\n",
- ent.data, ent.obj, ent.res, ent.res2);
+ avail = min(avail, nr - ret);
+ avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+ ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
- /* Could we split the check in two? */
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
- }
- ret = 0;
+ pos = head + AIO_EVENTS_OFFSET;
+ page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+ pos %= AIO_EVENTS_PER_PAGE;
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
- }
+ ev = kmap(page);
+ copy_ret = copy_to_user(event + ret, ev + pos,
+ sizeof(*ev) * avail);
+ kunmap(page);
- if (min_nr <= i)
- return i;
- if (ret)
- return ret;
-
- /* End fast path */
+ if (unlikely(copy_ret)) {
+ ret = -EFAULT;
+ goto out;
+ }
- /* racey check, but it gets redone */
- if (!retry && unlikely(!list_empty(&ctx->run_list))) {
- retry = 1;
- aio_run_all_iocbs(ctx);
- goto retry;
+ ret += avail;
+ head += avail;
+ head %= ctx->nr_events;
}
- init_timeout(&to);
- if (timeout) {
- struct timespec ts;
- ret = -EFAULT;
- if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
- goto out;
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->head = head;
+ kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
- set_timeout(start_jiffies, &to, &ts);
- }
+ pr_debug("%li h%u t%u\n", ret, head, tail);
+out:
+ mutex_unlock(&ctx->ring_lock);
- while (likely(i < nr)) {
- add_wait_queue_exclusive(&ctx->wait, &wait);
- do {
- set_task_state(tsk, TASK_INTERRUPTIBLE);
- ret = aio_read_evt(ctx, &ent);
- if (ret)
- break;
- if (min_nr <= i)
- break;
- if (unlikely(ctx->dead)) {
- ret = -EINVAL;
- break;
- }
- if (to.timed_out) /* Only check after read evt */
- break;
- /* Try to only show up in io wait if there are ops
- * in flight */
- if (ctx->reqs_active)
- io_schedule();
- else
- schedule();
- if (signal_pending(tsk)) {
- ret = -EINTR;
- break;
- }
- /*ret = aio_read_evt(ctx, &ent);*/
- } while (1) ;
+ return ret;
+}
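The avail computation above copes with ring wrap-around by copying at most the run of events up to the end of the ring per loop iteration, then coming back for the remainder; it is additionally clamped to what the caller still wants and to the events sharing one ring page. A worked example (illustrative only), reusing nr_events = 383 from the sizing example earlier:

/*
 * nr_events = 383, head = 380, tail = 5   (8 events pending)
 *
 * iteration 1: head > tail, so avail = nr_events - head = 3
 *              (events 380..382); head becomes (380 + 3) % 383 = 0
 * iteration 2: head <= tail, so avail = tail - head = 5
 *              (events 0..4); head becomes 5 == tail, loop ends
 *
 * The new head is then written back to ring->head under ring_lock, which is
 * the "written under ring_lock mutex by aio_read_events_ring()" case noted
 * on the struct aio_ring definition.
 */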
- set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event, long *i)
+{
+ long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
- if (unlikely(ret <= 0))
- break;
+ if (ret > 0)
+ *i += ret;
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
- }
+ if (unlikely(atomic_read(&ctx->dead)))
+ ret = -EINVAL;
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
- }
+ if (!*i)
+ *i = ret;
- if (timeout)
- clear_timeout(&to);
-out:
- destroy_timer_on_stack(&to.timer);
- return i ? i : ret;
+ return ret < 0 || *i >= min_nr;
}
-/* Take an ioctx and remove it from the list of ioctx's. Protects
- * against races with itself via ->dead.
- */
-static void io_destroy(struct kioctx *ioctx)
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event,
+ struct timespec __user *timeout)
{
- struct mm_struct *mm = current->mm;
- int was_dead;
+ ktime_t until = { .tv64 = KTIME_MAX };
+ long ret = 0;
- /* delete the entry from the list is someone else hasn't already */
- spin_lock(&mm->ioctx_lock);
- was_dead = ioctx->dead;
- ioctx->dead = 1;
- hlist_del_rcu(&ioctx->list);
- spin_unlock(&mm->ioctx_lock);
+ if (timeout) {
+ struct timespec ts;
- dprintk("aio_release(%p)\n", ioctx);
- if (likely(!was_dead))
- put_ioctx(ioctx); /* twice for the list */
+ if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+ return -EFAULT;
- aio_cancel_all(ioctx);
- wait_for_all_aios(ioctx);
+ until = timespec_to_ktime(ts);
+ }
/*
- * Wake up any waiters. The setting of ctx->dead must be seen
- * by other CPUs at this point. Right now, we rely on the
- * locking done by the above calls to ensure this consistency.
+ * Note that aio_read_events() is being called as the conditional - i.e.
+ * we're calling it after prepare_to_wait() has set task state to
+ * TASK_INTERRUPTIBLE.
+ *
+ * But aio_read_events() can block, and if it blocks it's going to flip
+ * the task state back to TASK_RUNNING.
+ *
+ * This should be ok, provided it doesn't flip the state back to
+ * TASK_RUNNING and return 0 too much - that causes us to spin. That
+ * will only happen if the mutex_lock() call blocks, and we then find
+ * the ringbuffer empty. So in practice we should be ok, but it's
+ * something to be aware of when touching this code.
*/
- wake_up_all(&ioctx->wait);
- put_ioctx(ioctx); /* once for the lookup */
+ wait_event_interruptible_hrtimeout(ctx->wait,
+ aio_read_events(ctx, min_nr, nr, event, &ret), until);
+
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+
+ return ret;
}
/* sys_io_setup:
@@ -1267,11 +1207,9 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
ret = PTR_ERR(ioctx);
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
- if (!ret)
- return 0;
-
- get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
- io_destroy(ioctx);
+ if (ret)
+ kill_ioctx(current->mm, ioctx, NULL);
+ percpu_ref_put(&ioctx->users);
}
out:
@@ -1288,144 +1226,73 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- io_destroy(ioctx);
- return 0;
- }
- pr_debug("EINVAL: io_destroy: invalid context id\n");
- return -EINVAL;
-}
+ struct completion requests_done =
+ COMPLETION_INITIALIZER_ONSTACK(requests_done);
+ int ret;
-static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
-{
- struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];
-
- BUG_ON(ret <= 0);
-
- while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
- ssize_t this = min((ssize_t)iov->iov_len, ret);
- iov->iov_base += this;
- iov->iov_len -= this;
- iocb->ki_left -= this;
- ret -= this;
- if (iov->iov_len == 0) {
- iocb->ki_cur_seg++;
- iov++;
- }
- }
+ /* Pass requests_done to kill_ioctx() where it can be set
+ * in a thread-safe way. If we tried to set it here we would have
+ * a race condition if two io_destroy() calls ran simultaneously.
+ */
+ ret = kill_ioctx(current->mm, ioctx, &requests_done);
+ percpu_ref_put(&ioctx->users);
- /* the caller should not have done more io than what fit in
- * the remaining iovecs */
- BUG_ON(ret > 0 && iocb->ki_left == 0);
-}
+ /* Wait until all IO for the context is done. Otherwise the kernel
+ * keeps using user-space buffers even though the user thinks the
+ * context has been destroyed.
+ */
+ if (!ret)
+ wait_for_completion(&requests_done);
-static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
- ssize_t ret = 0;
- unsigned short opcode;
-
- if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
- (iocb->ki_opcode == IOCB_CMD_PREAD)) {
- rw_op = file->f_op->aio_read;
- opcode = IOCB_CMD_PREADV;
- } else {
- rw_op = file->f_op->aio_write;
- opcode = IOCB_CMD_PWRITEV;
+ return ret;
}
-
- /* This matches the pread()/pwrite() logic */
- if (iocb->ki_pos < 0)
- return -EINVAL;
-
- do {
- ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
- iocb->ki_nr_segs - iocb->ki_cur_seg,
- iocb->ki_pos);
- if (ret > 0)
- aio_advance_iovec(iocb, ret);
-
- /* retry all partial writes. retry partial reads as long as its a
- * regular file. */
- } while (ret > 0 && iocb->ki_left > 0 &&
- (opcode == IOCB_CMD_PWRITEV ||
- (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
-
- /* This means we must have transferred all that we could */
- /* No need to retry anymore */
- if ((ret == 0) || (iocb->ki_left == 0))
- ret = iocb->ki_nbytes - iocb->ki_left;
-
- /* If we managed to write some out we return that, rather than
- * the eventual error. */
- if (opcode == IOCB_CMD_PWRITEV
- && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY
- && iocb->ki_nbytes - iocb->ki_left)
- ret = iocb->ki_nbytes - iocb->ki_left;
-
- return ret;
-}
-
-static ssize_t aio_fdsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
-
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 1);
- return ret;
+ pr_debug("EINVAL: io_destroy: invalid context id\n");
+ return -EINVAL;
}
-static ssize_t aio_fsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
+typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 0);
- return ret;
-}
-
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
+static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
+ int rw, char __user *buf,
+ unsigned long *nr_segs,
+ struct iovec **iovec,
+ bool compat)
{
ssize_t ret;
+ *nr_segs = kiocb->ki_nbytes;
+
#ifdef CONFIG_COMPAT
if (compat)
- ret = compat_rw_copy_check_uvector(type,
- (struct compat_iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
- &kiocb->ki_iovec);
+ ret = compat_rw_copy_check_uvector(rw,
+ (struct compat_iovec __user *)buf,
+ *nr_segs, 1, *iovec, iovec);
else
#endif
- ret = rw_copy_check_uvector(type,
- (struct iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
- &kiocb->ki_iovec);
+ ret = rw_copy_check_uvector(rw,
+ (struct iovec __user *)buf,
+ *nr_segs, 1, *iovec, iovec);
if (ret < 0)
- goto out;
+ return ret;
- kiocb->ki_nr_segs = kiocb->ki_nbytes;
- kiocb->ki_cur_seg = 0;
- /* ki_nbytes/left now reflect bytes instead of segs */
+ /* ki_nbytes now reflect bytes instead of segs */
kiocb->ki_nbytes = ret;
- kiocb->ki_left = ret;
-
- ret = 0;
-out:
- return ret;
+ return 0;
}
-static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
+ int rw, char __user *buf,
+ unsigned long *nr_segs,
+ struct iovec *iovec)
{
- kiocb->ki_iovec = &kiocb->ki_inline_vec;
- kiocb->ki_iovec->iov_base = kiocb->ki_buf;
- kiocb->ki_iovec->iov_len = kiocb->ki_left;
- kiocb->ki_nr_segs = 1;
- kiocb->ki_cur_seg = 0;
+ if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
+ return -EFAULT;
+
+ iovec->iov_base = buf;
+ iovec->iov_len = kiocb->ki_nbytes;
+ *nr_segs = 1;
return 0;
}
@@ -1434,156 +1301,125 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
* Performs the initial checks and aio retry method
* setup for the kiocb at the time of io submission.
*/
-static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
+static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
+ char __user *buf, bool compat)
{
- struct file *file = kiocb->ki_filp;
- ssize_t ret = 0;
-
- switch (kiocb->ki_opcode) {
+ struct file *file = req->ki_filp;
+ ssize_t ret;
+ unsigned long nr_segs;
+ int rw;
+ fmode_t mode;
+ aio_rw_op *rw_op;
+ rw_iter_op *iter_op;
+ struct iovec inline_vec, *iovec = &inline_vec;
+ struct iov_iter iter;
+
+ switch (opcode) {
case IOCB_CMD_PREAD:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = security_file_permission(file, MAY_READ);
- if (unlikely(ret))
- break;
- ret = aio_setup_single_vector(kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
- case IOCB_CMD_PWRITE:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = security_file_permission(file, MAY_WRITE);
- if (unlikely(ret))
- break;
- ret = aio_setup_single_vector(kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
case IOCB_CMD_PREADV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = security_file_permission(file, MAY_READ);
- if (unlikely(ret))
- break;
- ret = aio_setup_vectored_rw(READ, kiocb, compat);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
+ mode = FMODE_READ;
+ rw = READ;
+ rw_op = file->f_op->aio_read;
+ iter_op = file->f_op->read_iter;
+ goto rw_common;
+
+ case IOCB_CMD_PWRITE:
case IOCB_CMD_PWRITEV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = security_file_permission(file, MAY_WRITE);
- if (unlikely(ret))
- break;
- ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
- if (ret)
+ mode = FMODE_WRITE;
+ rw = WRITE;
+ rw_op = file->f_op->aio_write;
+ iter_op = file->f_op->write_iter;
+ goto rw_common;
+rw_common:
+ if (unlikely(!(file->f_mode & mode)))
+ return -EBADF;
+
+ if (!rw_op && !iter_op)
+ return -EINVAL;
+
+ ret = (opcode == IOCB_CMD_PREADV ||
+ opcode == IOCB_CMD_PWRITEV)
+ ? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
+ &iovec, compat)
+ : aio_setup_single_vector(req, rw, buf, &nr_segs,
+ iovec);
+ if (!ret)
+ ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ if (ret < 0) {
+ if (iovec != &inline_vec)
+ kfree(iovec);
+ return ret;
+ }
+
+ req->ki_nbytes = ret;
+
+ /* XXX: move/kill - rw_verify_area()? */
+ /* This matches the pread()/pwrite() logic */
+ if (req->ki_pos < 0) {
+ ret = -EINVAL;
break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
+ }
+
+ if (rw == WRITE)
+ file_start_write(file);
+
+ if (iter_op) {
+ iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
+ ret = iter_op(req, &iter);
+ } else {
+ ret = rw_op(req, iovec, nr_segs, req->ki_pos);
+ }
+
+ if (rw == WRITE)
+ file_end_write(file);
break;
+
case IOCB_CMD_FDSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fdsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 1);
break;
+
case IOCB_CMD_FSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 0);
break;
+
default:
- dprintk("EINVAL: io_submit: no operation provided\n");
- ret = -EINVAL;
+ pr_debug("EINVAL: no operation provided\n");
+ return -EINVAL;
}
- if (!kiocb->ki_retry)
- return ret;
+ if (iovec != &inline_vec)
+ kfree(iovec);
- return 0;
-}
-
-static void aio_batch_add(struct address_space *mapping,
- struct hlist_head *batch_hash)
-{
- struct aio_batch_entry *abe;
- struct hlist_node *pos;
- unsigned bucket;
-
- bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
- hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
- if (abe->mapping == mapping)
- return;
+ if (ret != -EIOCBQUEUED) {
+ /*
+		 * There's no easy way to restart the syscall since other AIOs
+		 * may already be running. Just fail this IO with EINTR.
+ */
+ if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+ ret == -ERESTARTNOHAND ||
+ ret == -ERESTART_RESTARTBLOCK))
+ ret = -EINTR;
+ aio_complete(req, ret, 0);
}
- abe = mempool_alloc(abe_pool, GFP_KERNEL);
-
- /*
- * we should be using igrab here, but
- * we don't want to hammer on the global
- * inode spinlock just to take an extra
- * reference on a file that we must already
- * have a reference to.
- *
- * When we're called, we always have a reference
- * on the file, so we must always have a reference
- * on the inode, so ihold() is safe here.
- */
- ihold(mapping->host);
- abe->mapping = mapping;
- hlist_add_head(&abe->list, &batch_hash[bucket]);
- return;
-}
-
-static void aio_batch_free(struct hlist_head *batch_hash)
-{
- struct aio_batch_entry *abe;
- struct hlist_node *pos, *n;
- int i;
-
- for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
- hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
- blk_run_address_space(abe->mapping);
- iput(abe->mapping->host);
- hlist_del(&abe->list);
- mempool_free(abe, abe_pool);
- }
- }
+ return 0;
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, struct hlist_head *batch_hash,
- bool compat)
+ struct iocb *iocb, bool compat)
{
struct kiocb *req;
- struct file *file;
ssize_t ret;
/* enforce forwards compatibility on users */
if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
- pr_debug("EINVAL: io_submit: reserve field set\n");
+ pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
@@ -1597,16 +1433,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
return -EINVAL;
}
- file = fget(iocb->aio_fildes);
- if (unlikely(!file))
- return -EBADF;
-
- req = aio_get_req(ctx); /* returns with 2 references to req */
- if (unlikely(!req)) {
- fput(file);
+ req = aio_get_req(ctx);
+ if (unlikely(!req))
return -EAGAIN;
+
+ req->ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->ki_filp)) {
+ ret = -EBADF;
+ goto out_put_req;
}
- req->ki_filp = file;
+
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1622,62 +1458,28 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
}
}
- ret = put_user(req->ki_key, &user_iocb->aio_key);
+ ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
- dprintk("EFAULT: aio_key\n");
+ pr_debug("EFAULT: aio_key\n");
goto out_put_req;
}
req->ki_obj.user = user_iocb;
req->ki_user_data = iocb->aio_data;
req->ki_pos = iocb->aio_offset;
+ req->ki_nbytes = iocb->aio_nbytes;
- req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
- req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
- req->ki_opcode = iocb->aio_lio_opcode;
-
- ret = aio_setup_iocb(req, compat);
-
+ ret = aio_run_iocb(req, iocb->aio_lio_opcode,
+ (char __user *)(unsigned long)iocb->aio_buf,
+ compat);
if (ret)
goto out_put_req;
- spin_lock_irq(&ctx->ctx_lock);
- /*
- * We could have raced with io_destroy() and are currently holding a
- * reference to ctx which should be destroyed. We cannot submit IO
- * since ctx gets freed as soon as io_submit() puts its reference. The
- * check here is reliable: io_destroy() sets ctx->dead before waiting
- * for outstanding IO and the barrier between these two is realized by
- * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
- * increment ctx->reqs_active before checking for ctx->dead and the
- * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
- * don't see ctx->dead set here, io_destroy() waits for our IO to
- * finish.
- */
- if (ctx->dead) {
- spin_unlock_irq(&ctx->ctx_lock);
- ret = -EINVAL;
- goto out_put_req;
- }
- aio_run_iocb(req);
- if (!list_empty(&ctx->run_list)) {
- /* drain the run list */
- while (__aio_run_iocbs(ctx))
- ;
- }
- spin_unlock_irq(&ctx->ctx_lock);
- if (req->ki_opcode == IOCB_CMD_PREAD ||
- req->ki_opcode == IOCB_CMD_PREADV ||
- req->ki_opcode == IOCB_CMD_PWRITE ||
- req->ki_opcode == IOCB_CMD_PWRITEV)
- aio_batch_add(file->f_mapping, batch_hash);
-
- aio_put_req(req); /* drop extra ref to req */
return 0;
-
out_put_req:
- aio_put_req(req); /* drop extra ref to req */
- aio_put_req(req); /* drop i/o ref to req */
+ put_reqs_available(ctx, 1);
+ percpu_ref_put(&ctx->reqs);
+ kiocb_free(req);
return ret;
}
@@ -1686,8 +1488,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
{
struct kioctx *ctx;
long ret = 0;
- int i;
- struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
+ int i = 0;
+ struct blk_plug plug;
if (unlikely(nr < 0))
return -EINVAL;
@@ -1700,10 +1502,12 @@ long do_io_submit(aio_context_t ctx_id, long nr,
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx)) {
- pr_debug("EINVAL: io_submit: invalid context id\n");
+ pr_debug("EINVAL: invalid context id\n");
return -EINVAL;
}
+ blk_start_plug(&plug);
+
/*
* AKPM: should this return a partial result if some of the IOs were
* successfully submitted?
@@ -1722,13 +1526,13 @@ long do_io_submit(aio_context_t ctx_id, long nr,
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
+ ret = io_submit_one(ctx, user_iocb, &tmp, compat);
if (ret)
break;
}
- aio_batch_free(batch_hash);
+ blk_finish_plug(&plug);
- put_ioctx(ctx);
+ percpu_ref_put(&ctx->users);
return i ? i : ret;
}
@@ -1760,10 +1564,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
assert_spin_locked(&ctx->ctx_lock);
+ if (key != KIOCB_KEY)
+ return NULL;
+
/* TODO: use a hash or array, this sucks. */
list_for_each(pos, &ctx->active_reqs) {
struct kiocb *kiocb = list_kiocb(pos);
- if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
+ if (kiocb->ki_obj.user == iocb)
return kiocb;
}
return NULL;
@@ -1782,7 +1589,6 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
{
- int (*cancel)(struct kiocb *iocb, struct io_event *res);
struct kioctx *ctx;
struct kiocb *kiocb;
u32 key;
@@ -1797,34 +1603,25 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- ret = -EAGAIN;
+
kiocb = lookup_kiocb(ctx, iocb, key);
- if (kiocb && kiocb->ki_cancel) {
- cancel = kiocb->ki_cancel;
- kiocb->ki_users ++;
- kiocbSetCancelled(kiocb);
- } else
- cancel = NULL;
+ if (kiocb)
+ ret = kiocb_cancel(kiocb);
+ else
+ ret = -EINVAL;
+
spin_unlock_irq(&ctx->ctx_lock);
- if (NULL != cancel) {
- struct io_event tmp;
- pr_debug("calling cancel\n");
- memset(&tmp, 0, sizeof(tmp));
- tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
- tmp.data = kiocb->ki_user_data;
- ret = cancel(kiocb, &tmp);
- if (!ret) {
- /* Cancellation succeeded -- copy the result
- * into the user's buffer.
- */
- if (copy_to_user(result, &tmp, sizeof(tmp)))
- ret = -EFAULT;
- }
- } else
- ret = -EINVAL;
+ if (!ret) {
+ /*
+ * The result argument is no longer used - the io_event is
+ * always delivered via the ring buffer. -EINPROGRESS indicates
+	 * cancellation is in progress.
+ */
+ ret = -EINPROGRESS;
+ }
- put_ioctx(ctx);
+ percpu_ref_put(&ctx->users);
return ret;
}
@@ -1839,8 +1636,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
* < min_nr if the timeout specified by timeout has elapsed
* before sufficient events are available, where timeout == NULL
* specifies an infinite timeout. Note that the timeout pointed to by
- * timeout is relative and will be updated if not NULL and the
- * operation blocks. Will fail with -ENOSYS if not implemented.
+ * timeout is relative. Will fail with -ENOSYS if not implemented.
*/
SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
long, min_nr,
@@ -1854,9 +1650,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
if (likely(ioctx)) {
if (likely(min_nr <= nr && min_nr >= 0))
ret = read_events(ioctx, min_nr, nr, events, timeout);
- put_ioctx(ioctx);
+ percpu_ref_put(&ioctx->users);
}
-
- asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
return ret;
}
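
Illustrative aside, not part of the patch: the fs/aio.c rework above changes how io_submit(), io_cancel() and io_getevents() are serviced in the kernel, but the userspace ABI stays the same. A minimal raw-syscall sketch that exercises the IOCB_CMD_PREAD path dispatched by aio_run_iocb() could look like this (the file name is arbitrary and error handling is deliberately thin):

#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	char buf[4096];
	int fd = open("/etc/hostname", O_RDONLY);	/* any readable file will do */

	if (fd < 0 || syscall(__NR_io_setup, 1, &ctx) < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_lio_opcode = IOCB_CMD_PREAD;	/* handled by the rw_common path above */
	cb.aio_buf = (unsigned long)buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;

	if (syscall(__NR_io_submit, ctx, 1, cbs) != 1)
		return 1;

	/* The timeout argument is relative, as the io_getevents comment notes. */
	if (syscall(__NR_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
		printf("read %lld bytes\n", (long long)ev.res);

	syscall(__NR_io_destroy, ctx);
	return 0;
}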
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c5567cb7843..80ef38c73e5 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,7 +24,6 @@
static struct vfsmount *anon_inode_mnt __read_mostly;
static struct inode *anon_inode_inode;
-static const struct file_operations anon_inode_fops;
/*
* anon_inodefs_dname() is called from d_path().
@@ -52,19 +51,6 @@ static struct file_system_type anon_inode_fs_type = {
.kill_sb = kill_anon_super,
};
-/*
- * nop .set_page_dirty method so that people can use .page_mkwrite on
- * anon inodes.
- */
-static int anon_set_page_dirty(struct page *page)
-{
- return 0;
-};
-
-static const struct address_space_operations anon_aops = {
- .set_page_dirty = anon_set_page_dirty,
-};
-
/**
* anon_inode_getfile - creates a new file instance by hooking it up to an
* anonymous inode, and a dentry that describe the "class"
@@ -88,7 +74,6 @@ struct file *anon_inode_getfile(const char *name,
struct qstr this;
struct path path;
struct file *file;
- int error;
if (IS_ERR(anon_inode_inode))
return ERR_PTR(-ENODEV);
@@ -100,7 +85,7 @@ struct file *anon_inode_getfile(const char *name,
* Link the inode to a directory entry by creating a unique name
* using the inode sequence number.
*/
- error = -ENOMEM;
+ file = ERR_PTR(-ENOMEM);
this.name = name;
this.len = strlen(name);
this.hash = 0;
@@ -117,15 +102,12 @@ struct file *anon_inode_getfile(const char *name,
d_instantiate(path.dentry, anon_inode_inode);
- error = -ENFILE;
file = alloc_file(&path, OPEN_FMODE(flags), fops);
- if (!file)
+ if (IS_ERR(file))
goto err_dput;
file->f_mapping = anon_inode_inode->i_mapping;
- file->f_pos = 0;
file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
- file->f_version = 0;
file->private_data = priv;
return file;
@@ -134,7 +116,7 @@ err_dput:
path_put(&path);
err_module:
module_put(fops->owner);
- return ERR_PTR(error);
+ return file;
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);
@@ -180,64 +162,17 @@ err_put_unused_fd:
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);
-/*
- * A single inode exists for all anon_inode files. Contrary to pipes,
- * anon_inode inodes have no associated per-instance data, so we need
- * only allocate one of them.
- */
-static struct inode *anon_inode_mkinode(void)
-{
- struct inode *inode = new_inode(anon_inode_mnt->mnt_sb);
-
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- inode->i_ino = get_next_ino();
- inode->i_fop = &anon_inode_fops;
-
- inode->i_mapping->a_ops = &anon_aops;
-
- /*
- * Mark the inode dirty from the very beginning,
- * that way it will never be moved to the dirty
- * list because mark_inode_dirty() will think
- * that it already _is_ on the dirty list.
- */
- inode->i_state = I_DIRTY;
- inode->i_mode = S_IRUSR | S_IWUSR;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- return inode;
-}
-
static int __init anon_inode_init(void)
{
- int error;
-
- error = register_filesystem(&anon_inode_fs_type);
- if (error)
- goto err_exit;
anon_inode_mnt = kern_mount(&anon_inode_fs_type);
- if (IS_ERR(anon_inode_mnt)) {
- error = PTR_ERR(anon_inode_mnt);
- goto err_unregister_filesystem;
- }
- anon_inode_inode = anon_inode_mkinode();
- if (IS_ERR(anon_inode_inode)) {
- error = PTR_ERR(anon_inode_inode);
- goto err_mntput;
- }
+ if (IS_ERR(anon_inode_mnt))
+ panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));
- return 0;
+ anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+ if (IS_ERR(anon_inode_inode))
+ panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
-err_mntput:
- mntput(anon_inode_mnt);
-err_unregister_filesystem:
- unregister_filesystem(&anon_inode_fs_type);
-err_exit:
- panic(KERN_ERR "anon_inode_init() failed (%d)\n", error);
+ return 0;
}
fs_initcall(anon_inode_init);
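
For context on how the simplified anon_inodes code above is consumed: kernel subsystems such as eventfd and timerfd call anon_inode_getfd() to hand a private object back to userspace as a file descriptor backed by the single shared anonymous inode set up in anon_inode_init(). A hedged, illustrative fragment follows; the mydev_* names are placeholders, not real kernel symbols:

#include <linux/anon_inodes.h>
#include <linux/fcntl.h>

/* Placeholder driver state and file_operations, assumed defined elsewhere. */
struct mydev_state;
extern const struct file_operations mydev_fops;

static int mydev_create_fd(struct mydev_state *state)
{
	/*
	 * Returns a new descriptor referencing a file backed by the shared
	 * anonymous inode, or a negative errno on failure.
	 */
	return anon_inode_getfd("[mydev]", &mydev_fops, state, O_RDWR | O_CLOEXEC);
}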
diff --git a/fs/attr.c b/fs/attr.c
index 1007ed61631..6530ced1969 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -5,7 +5,7 @@
* changes by Thomas Schoebel-Theuer
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/string.h>
@@ -13,6 +13,8 @@
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
#include <linux/security.h>
+#include <linux/evm.h>
+#include <linux/ima.h>
/**
* inode_change_ok - check if attribute changes to an inode are allowed
@@ -46,15 +48,16 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
/* Make sure a caller can chown. */
if ((ia_valid & ATTR_UID) &&
- (current_fsuid() != inode->i_uid ||
- attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
+ (!uid_eq(current_fsuid(), inode->i_uid) ||
+ !uid_eq(attr->ia_uid, inode->i_uid)) &&
+ !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
return -EPERM;
/* Make sure caller can chgrp. */
if ((ia_valid & ATTR_GID) &&
- (current_fsuid() != inode->i_uid ||
- (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
- !capable(CAP_CHOWN))
+ (!uid_eq(current_fsuid(), inode->i_uid) ||
+ (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
+ !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
return -EPERM;
/* Make sure a caller can chmod. */
@@ -63,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
return -EPERM;
/* Also check the setgid bit! */
if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
- inode->i_gid) && !capable(CAP_FSETID))
+ inode->i_gid) &&
+ !capable_wrt_inode_uidgid(inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
@@ -128,7 +132,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
* setattr_copy must be called with i_mutex held.
*
* setattr_copy updates the inode's metadata with that specified
- * in attr. Noticably missing is inode size update, which is more complex
+ * in attr. Noticeably missing is inode size update, which is more complex
* as it requires pagecache updates.
*
* The inode is not marked as dirty after this operation. The rationale is
@@ -155,26 +159,56 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ if (!in_group_p(inode->i_gid) &&
+ !capable_wrt_inode_uidgid(inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
}
EXPORT_SYMBOL(setattr_copy);
-int notify_change(struct dentry * dentry, struct iattr * attr)
+/**
+ * notify_change - modify attributes of a filesystem object
+ * @dentry: object affected
+ * @iattr: new attributes
+ * @delegated_inode: returns inode, if the inode is delegated
+ *
+ * The caller must hold the i_mutex on the affected object.
+ *
+ * If notify_change discovers a delegation in need of breaking,
+ * it will return -EWOULDBLOCK and return a reference to the inode in
+ * delegated_inode. The caller should then break the delegation and
+ * retry. Because breaking a delegation may take a long time, the
+ * caller should drop the i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode. This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported. Also, passing NULL is fine for callers holding
+ * the file open for write, as there can be no conflicting delegation in
+ * that case.
+ */
+int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
{
struct inode *inode = dentry->d_inode;
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
int error;
struct timespec now;
unsigned int ia_valid = attr->ia_valid;
+ WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+
if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
}
+ if ((ia_valid & ATTR_MODE)) {
+ umode_t amode = attr->ia_mode;
+ /* Flag setting protected by i_mutex */
+ if (is_sxid(amode))
+ inode->i_flags &= ~S_NOSEC;
+ }
+
now = current_fs_time(inode->i_sb);
attr->ia_ctime = now;
@@ -224,22 +258,21 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
error = security_inode_setattr(dentry, attr);
if (error)
return error;
-
- if (ia_valid & ATTR_SIZE)
- down_write(&dentry->d_inode->i_alloc_sem);
+ error = try_break_deleg(inode, delegated_inode);
+ if (error)
+ return error;
if (inode->i_op->setattr)
error = inode->i_op->setattr(dentry, attr);
else
error = simple_setattr(dentry, attr);
- if (ia_valid & ATTR_SIZE)
- up_write(&dentry->d_inode->i_alloc_sem);
-
- if (!error)
+ if (!error) {
fsnotify_change(dentry, ia_valid);
+ ima_inode_post_setattr(dentry);
+ evm_inode_post_setattr(dentry, ia_valid);
+ }
return error;
}
-
EXPORT_SYMBOL(notify_change);
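
The delegation handling documented above changes the calling convention for notify_change(): a caller that might race with an NFS delegation passes a delegated_inode pointer, breaks the delegation with i_mutex dropped, and retries. A condensed sketch of that pattern, mirroring callers such as chmod_common(); the iattr setup is omitted and the function name is illustrative:

#include <linux/fs.h>
#include <linux/mutex.h>

static int example_setattr(struct dentry *dentry, struct iattr *newattrs)
{
	struct inode *inode = dentry->d_inode;
	struct inode *delegated_inode = NULL;
	int error;

retry:
	mutex_lock(&inode->i_mutex);
	error = notify_change(dentry, newattrs, &delegated_inode);
	mutex_unlock(&inode->i_mutex);
	if (delegated_inode) {
		/* Break the lease outside i_mutex, then try again. */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry;
	}
	return error;
}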
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 475f9c597cb..acf32054edd 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -39,27 +39,17 @@
/* #define DEBUG */
-#ifdef DEBUG
-#define DPRINTK(fmt, args...) \
-do { \
- printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##args); \
-} while (0)
-#else
-#define DPRINTK(fmt, args...) do {} while (0)
-#endif
-
-#define AUTOFS_WARN(fmt, args...) \
-do { \
+#define DPRINTK(fmt, ...) \
+ pr_debug("pid %d: %s: " fmt "\n", \
+ current->pid, __func__, ##__VA_ARGS__)
+
+#define AUTOFS_WARN(fmt, ...) \
printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##args); \
-} while (0)
+ current->pid, __func__, ##__VA_ARGS__)
-#define AUTOFS_ERROR(fmt, args...) \
-do { \
+#define AUTOFS_ERROR(fmt, ...) \
printk(KERN_ERR "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##args); \
-} while (0)
+ current->pid, __func__, ##__VA_ARGS__)
/* Unified info structure. This is pointed to by both the dentry and
inode structures. Each file in the filesystem has an instance of this
@@ -84,8 +74,8 @@ struct autofs_info {
unsigned long last_used;
atomic_t count;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
};
#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
@@ -99,8 +89,8 @@ struct autofs_wait_queue {
struct qstr name;
u32 dev;
u64 ino;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
pid_t pid;
pid_t tgid;
/* This is for status reporting upon return */
@@ -114,7 +104,7 @@ struct autofs_sb_info {
u32 magic;
int pipefd;
struct file *pipe;
- pid_t oz_pgrp;
+ struct pid *oz_pgrp;
int catatonic;
int version;
int sub_version;
@@ -126,11 +116,13 @@ struct autofs_sb_info {
int needs_reghost;
struct super_block *sb;
struct mutex wq_mutex;
+ struct mutex pipe_mutex;
spinlock_t fs_lock;
struct autofs_wait_queue *queues; /* Wait queue pointer */
spinlock_t lookup_lock;
struct list_head active_list;
struct list_head expiring_list;
+ struct rcu_head rcu;
};
static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
@@ -148,7 +140,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
filesystem without "magic".) */
static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
- return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp;
+ return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
/* Does a dentry have some pending activity? */
@@ -165,7 +157,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
return 0;
}
-struct inode *autofs4_get_inode(struct super_block *, mode_t);
+struct inode *autofs4_get_inode(struct super_block *, umode_t);
void autofs4_free_ino(struct autofs_info *);
/* Expiration */
@@ -278,6 +270,17 @@ int autofs4_fill_super(struct super_block *, void *, int);
struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
void autofs4_clean_ino(struct autofs_info *);
+static inline int autofs_prepare_pipe(struct file *pipe)
+{
+ if (!pipe->f_op->write)
+ return -EINVAL;
+ if (!S_ISFIFO(file_inode(pipe)->i_mode))
+ return -EINVAL;
+ /* We want a packet pipe */
+ pipe->f_flags |= O_DIRECT;
+ return 0;
+}
+
/* Queue management functions */
int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
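
The new autofs_prepare_pipe() helper above sets O_DIRECT on the daemon's pipe to get "packet mode", so each autofs packet the kernel writes comes back from exactly one read() in the automount daemon. A hedged userspace illustration of that pipe behaviour; plain strings stand in for struct autofs_v5_packet:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int p[2];
	char buf[64];

	if (pipe2(p, O_DIRECT) < 0)	/* packet-mode pipe, as autofs_prepare_pipe() arranges */
		return 1;

	write(p[1], "first", 5);
	write(p[1], "second", 6);

	/* Each read() returns one packet, even though buf could hold both. */
	printf("%zd\n", read(p[0], buf, sizeof(buf)));	/* prints 5 */
	printf("%zd\n", read(p[0], buf, sizeof(buf)));	/* prints 6 */
	return 0;
}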
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 509fe1eb66a..5b570b6efa2 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -103,6 +103,9 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
if (tmp.size < sizeof(tmp))
return ERR_PTR(-EINVAL);
+ if (tmp.size > (PATH_MAX + sizeof(tmp)))
+ return ERR_PTR(-ENAMETOOLONG);
+
return memdup_user(in, tmp.size);
}
@@ -159,7 +162,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
struct inode *inode;
if (f) {
- inode = f->f_path.dentry->d_inode;
+ inode = file_inode(f);
sbi = autofs4_sbi(inode->i_sb);
}
return sbi;
@@ -183,24 +186,24 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
return 0;
}
+/* Find the topmost mount satisfying test() */
static int find_autofs_mount(const char *pathname,
struct path *res,
int test(struct path *path, void *data),
void *data)
{
struct path path;
- int err = kern_path(pathname, 0, &path);
+ int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
if (err)
return err;
err = -ENOENT;
while (path.dentry == path.mnt->mnt_root) {
- if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) {
+ if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
if (test(&path, data)) {
path_get(&path);
- if (!err) /* already found some */
- path_put(res);
*res = path;
err = 0;
+ break;
}
}
if (!follow_up(&path))
@@ -212,7 +215,7 @@ static int find_autofs_mount(const char *pathname,
static int test_by_dev(struct path *path, void *p)
{
- return path->mnt->mnt_sb->s_dev == *(dev_t *)p;
+ return path->dentry->d_sb->s_dev == *(dev_t *)p;
}
static int test_by_type(struct path *path, void *p)
@@ -221,20 +224,6 @@ static int test_by_type(struct path *path, void *p)
return ino && ino->sbi->type & *(unsigned *)p;
}
-static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
-{
- struct files_struct *files = current->files;
- struct fdtable *fdt;
-
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- BUG_ON(fdt->fd[fd] != NULL);
- rcu_assign_pointer(fdt->fd[fd], file);
- FD_SET(fd, fdt->close_on_exec);
- spin_unlock(&files->file_lock);
-}
-
-
/*
* Open a file descriptor on the autofs mount point corresponding
* to the given path and device number (aka. new_encode_dev(sb->s_dev)).
@@ -243,7 +232,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
{
int err, fd;
- fd = get_unused_fd();
+ fd = get_unused_fd_flags(O_CLOEXEC);
if (likely(fd >= 0)) {
struct file *filp;
struct path path;
@@ -257,14 +246,14 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
* corresponding to the autofs fs we want to open.
*/
- filp = dentry_open(path.dentry, path.mnt, O_RDONLY,
- current_cred());
+ filp = dentry_open(&path, O_RDONLY, current_cred());
+ path_put(&path);
if (IS_ERR(filp)) {
err = PTR_ERR(filp);
goto out;
}
- autofs_dev_ioctl_fd_install(fd, filp);
+ fd_install(fd, filp);
}
return fd;
@@ -360,6 +349,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
{
int pipefd;
int err = 0;
+ struct pid *new_pid = NULL;
if (param->setpipefd.pipefd == -1)
return -EINVAL;
@@ -371,22 +361,33 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
mutex_unlock(&sbi->wq_mutex);
return -EBUSY;
} else {
- struct file *pipe = fget(pipefd);
+ struct file *pipe;
+
+ new_pid = get_task_pid(current, PIDTYPE_PGID);
+
+ if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
+ AUTOFS_WARN("Not allowed to change PID namespace");
+ err = -EINVAL;
+ goto out;
+ }
+
+ pipe = fget(pipefd);
if (!pipe) {
err = -EBADF;
goto out;
}
- if (!pipe->f_op || !pipe->f_op->write) {
+ if (autofs_prepare_pipe(pipe) < 0) {
err = -EPIPE;
fput(pipe);
goto out;
}
- sbi->oz_pgrp = task_pgrp_nr(current);
+ swap(sbi->oz_pgrp, new_pid);
sbi->pipefd = pipefd;
sbi->pipe = pipe;
sbi->catatonic = 0;
}
out:
+ put_pid(new_pid);
mutex_unlock(&sbi->wq_mutex);
return err;
}
@@ -451,8 +452,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
err = 0;
autofs4_expire_wait(path.dentry);
spin_lock(&sbi->fs_lock);
- param->requester.uid = ino->uid;
- param->requester.gid = ino->gid;
+ param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
+ param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
spin_unlock(&sbi->fs_lock);
}
path_put(&path);
@@ -500,12 +501,11 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
* mount if there is one or 0 if it isn't a mountpoint.
*
* If we aren't supplied with a file descriptor then we
- * lookup the nameidata of the path and check if it is the
- * root of a mount. If a type is given we are looking for
- * a particular autofs mount and if we don't find a match
- * we return fail. If the located nameidata path is the
- * root of a mount we return 1 along with the super magic
- * of the mount or 0 otherwise.
+ * lookup the path and check if it is the root of a mount.
+ * If a type is given we are looking for a particular autofs
+ * mount and if we don't find a match we return failure. If the
+ * located path is the root of a mount we return 1 along with
+ * the super magic of the mount or 0 otherwise.
*
 * In both cases the device number (as returned by
* new_encode_dev()) is also returned.
@@ -533,16 +533,18 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
if (!fp || param->ioctlfd == -1) {
if (autofs_type_any(type))
- err = kern_path(name, LOOKUP_FOLLOW, &path);
+ err = kern_path_mountpoint(AT_FDCWD,
+ name, &path, LOOKUP_FOLLOW);
else
- err = find_autofs_mount(name, &path, test_by_type, &type);
+ err = find_autofs_mount(name, &path,
+ test_by_type, &type);
if (err)
goto out;
- devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
+ devid = new_encode_dev(path.dentry->d_sb->s_dev);
err = 0;
if (path.mnt->mnt_root == path.dentry) {
err = 1;
- magic = path.mnt->mnt_sb->s_magic;
+ magic = path.dentry->d_sb->s_magic;
}
} else {
dev_t dev = sbi->sb->s_dev;
@@ -556,7 +558,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
err = have_submounts(path.dentry);
if (follow_down_one(&path))
- magic = path.mnt->mnt_sb->s_magic;
+ magic = path.dentry->d_sb->s_magic;
}
param->ismountpoint.out.devid = devid;
@@ -671,12 +673,6 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use
goto out;
}
- if (!fp->f_op) {
- err = -ENOTTY;
- fput(fp);
- goto out;
- }
-
sbi = autofs_dev_ioctl_sbi(fp);
if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
err = -EINVAL;
@@ -741,7 +737,7 @@ MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
MODULE_ALIAS("devname:autofs");
/* Register/deregister misc character device */
-int autofs_dev_ioctl_init(void)
+int __init autofs_dev_ioctl_init(void)
{
int r;
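
The dev-ioctl changes above drop the open-coded descriptor-table manipulation in favour of the generic get_unused_fd_flags()/fd_install() pair, asking for O_CLOEXEC at allocation time. A minimal sketch of that kernel pattern, assuming the struct file has already been obtained (for example from dentry_open()); the function name is illustrative:

#include <linux/file.h>
#include <linux/fcntl.h>

static int install_cloexec_fd(struct file *filp)
{
	int fd = get_unused_fd_flags(O_CLOEXEC);

	if (fd < 0) {
		fput(filp);		/* the caller's reference is consumed either way */
		return fd;
	}
	fd_install(fd, filp);		/* the fd table now owns the reference */
	return fd;
}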
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 450f529a4ea..394e90b02c5 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -61,15 +61,6 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
/* This is an autofs submount, we can't expire it */
if (autofs_type_indirect(sbi->type))
goto done;
-
- /*
- * Otherwise it's an offset mount and we need to check
- * if we can umount its mount, if there is one.
- */
- if (!d_mountpoint(path.dentry)) {
- status = 0;
- goto done;
- }
}
/* Update the expiry counter if fs is busy */
@@ -94,25 +85,21 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev,
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
- struct dentry *p, *q;
+ struct dentry *q;
spin_lock(&sbi->lookup_lock);
+ spin_lock(&root->d_lock);
- if (prev == NULL) {
- spin_lock(&root->d_lock);
+ if (prev)
+ next = prev->d_u.d_child.next;
+ else {
prev = dget_dlock(root);
next = prev->d_subdirs.next;
- p = prev;
- goto start;
}
- p = prev;
- spin_lock(&p->d_lock);
-again:
- next = p->d_u.d_child.next;
-start:
+cont:
if (next == &root->d_subdirs) {
- spin_unlock(&p->d_lock);
+ spin_unlock(&root->d_lock);
spin_unlock(&sbi->lookup_lock);
dput(prev);
return NULL;
@@ -121,15 +108,15 @@ start:
q = list_entry(next, struct dentry, d_u.d_child);
spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
- /* Negative dentry - try next */
- if (!simple_positive(q)) {
- spin_unlock(&p->d_lock);
- p = q;
- goto again;
+ /* Already gone or negative dentry (under construction) - try next */
+ if (!d_count(q) || !simple_positive(q)) {
+ spin_unlock(&q->d_lock);
+ next = q->d_u.d_child.next;
+ goto cont;
}
dget_dlock(q);
spin_unlock(&q->d_lock);
- spin_unlock(&p->d_lock);
+ spin_unlock(&root->d_lock);
spin_unlock(&sbi->lookup_lock);
dput(prev);
@@ -186,6 +173,7 @@ again:
/* Negative dentry - try next */
if (!simple_positive(ret)) {
spin_unlock(&p->d_lock);
+ lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_);
p = ret;
goto again;
}
@@ -279,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
else
ino_count++;
- if (p->d_count > ino_count) {
+ if (d_count(p) > ino_count) {
top_ino->last_used = jiffies;
dput(p);
return 1;
@@ -402,11 +390,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
DPRINTK("checking mountpoint %p %.*s",
dentry, (int)dentry->d_name.len, dentry->d_name.name);
- /* Path walk currently on this dentry? */
- ino_count = atomic_read(&ino->count) + 2;
- if (dentry->d_count > ino_count)
- goto next;
-
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, dentry))
goto next;
@@ -419,6 +402,20 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
goto next;
}
+ if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ DPRINTK("checking symlink %p %.*s",
+ dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ /*
+ * A symlink can't be "busy" in the usual sense so
+ * just check last used for expire timeout.
+ */
+ if (autofs4_can_expire(dentry, timeout, do_now)) {
+ expired = dentry;
+ goto found;
+ }
+ goto next;
+ }
+
if (simple_empty(dentry))
goto next;
@@ -426,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
if (!exp_leaves) {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
- if (dentry->d_count > ino_count)
+ if (d_count(dentry) > ino_count)
goto next;
if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -440,7 +437,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
} else {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
- if (dentry->d_count > ino_count)
+ if (d_count(dentry) > ino_count)
goto next;
expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
@@ -556,15 +553,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_EXPIRING;
- spin_lock(&dentry->d_lock);
- if (!ret) {
- if ((IS_ROOT(dentry) ||
- (autofs_type_indirect(sbi->type) &&
- IS_ROOT(dentry->d_parent))) &&
- !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
- __managed_dentry_set_automount(dentry);
- }
- spin_unlock(&dentry->d_lock);
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
dput(dentry);
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index c038727b405..b3db517e89e 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -26,16 +26,17 @@ static struct file_system_type autofs_fs_type = {
.mount = autofs_mount,
.kill_sb = autofs4_kill_sb,
};
+MODULE_ALIAS_FS("autofs");
static int __init init_autofs4_fs(void)
{
int err;
+ autofs_dev_ioctl_init();
+
err = register_filesystem(&autofs_fs_type);
if (err)
- return err;
-
- autofs_dev_ioctl_init();
+ autofs_dev_ioctl_exit();
return err;
}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 180fa2425e4..1c55388ae63 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
void autofs4_clean_ino(struct autofs_info *ino)
{
- ino->uid = 0;
- ino->gid = 0;
+ ino->uid = GLOBAL_ROOT_UID;
+ ino->gid = GLOBAL_ROOT_GID;
ino->last_used = jiffies;
}
@@ -56,34 +56,34 @@ void autofs4_kill_sb(struct super_block *sb)
* just call kill_anon_super when we are called from
* deactivate_super.
*/
- if (!sbi)
- goto out_kill_sb;
-
- /* Free wait queues, close pipe */
- autofs4_catatonic_mode(sbi);
-
- sb->s_fs_info = NULL;
- kfree(sbi);
+ if (sbi) {
+ /* Free wait queues, close pipe */
+ autofs4_catatonic_mode(sbi);
+ put_pid(sbi->oz_pgrp);
+ }
-out_kill_sb:
DPRINTK("shutting down");
kill_litter_super(sb);
+ if (sbi)
+ kfree_rcu(sbi, rcu);
}
-static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int autofs4_show_options(struct seq_file *m, struct dentry *root)
{
- struct autofs_sb_info *sbi = autofs4_sbi(mnt->mnt_sb);
- struct inode *root_inode = mnt->mnt_sb->s_root->d_inode;
+ struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+ struct inode *root_inode = root->d_sb->s_root->d_inode;
if (!sbi)
return 0;
seq_printf(m, ",fd=%d", sbi->pipefd);
- if (root_inode->i_uid != 0)
- seq_printf(m, ",uid=%u", root_inode->i_uid);
- if (root_inode->i_gid != 0)
- seq_printf(m, ",gid=%u", root_inode->i_gid);
- seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
+ if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, root_inode->i_uid));
+ if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, root_inode->i_gid));
+ seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
seq_printf(m, ",minproto=%d", sbi->min_proto);
seq_printf(m, ",maxproto=%d", sbi->max_proto);
@@ -100,7 +100,7 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
static void autofs4_evict_inode(struct inode *inode)
{
- end_writeback(inode);
+ clear_inode(inode);
kfree(inode->i_private);
}
@@ -126,8 +126,9 @@ static const match_table_t tokens = {
{Opt_err, NULL}
};
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
- pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
+static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
+ int *pgrp, bool *pgrp_set, unsigned int *type,
+ int *minproto, int *maxproto)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -135,7 +136,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
*uid = current_uid();
*gid = current_gid();
- *pgrp = task_pgrp_nr(current);
*minproto = AUTOFS_MIN_PROTO_VERSION;
*maxproto = AUTOFS_MAX_PROTO_VERSION;
@@ -159,17 +159,22 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
case Opt_uid:
if (match_int(args, &option))
return 1;
- *uid = option;
+ *uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(*uid))
+ return 1;
break;
case Opt_gid:
if (match_int(args, &option))
return 1;
- *gid = option;
+ *gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(*gid))
+ return 1;
break;
case Opt_pgrp:
if (match_int(args, &option))
return 1;
*pgrp = option;
+ *pgrp_set = true;
break;
case Opt_minproto:
if (match_int(args, &option))
@@ -205,10 +210,13 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
+ int pgrp = 0;
+ bool pgrp_set = false;
+ int ret = -EINVAL;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
- goto fail_unlock;
+ return -ENOMEM;
DPRINTK("starting up, sbi = %p",sbi);
s->s_fs_info = sbi;
@@ -217,7 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->pipe = NULL;
sbi->catatonic = 1;
sbi->exp_timeout = 0;
- sbi->oz_pgrp = task_pgrp_nr(current);
+ sbi->oz_pgrp = NULL;
sbi->sb = s;
sbi->version = 0;
sbi->sub_version = 0;
@@ -225,6 +233,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->min_proto = 0;
sbi->max_proto = 0;
mutex_init(&sbi->wq_mutex);
+ mutex_init(&sbi->pipe_mutex);
spin_lock_init(&sbi->fs_lock);
sbi->queues = NULL;
spin_lock_init(&sbi->lookup_lock);
@@ -241,27 +250,37 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
* Get the root inode and dentry, but defer checking for errors.
*/
ino = autofs4_new_ino(sbi);
- if (!ino)
+ if (!ino) {
+ ret = -ENOMEM;
goto fail_free;
+ }
root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
- if (!root_inode)
- goto fail_ino;
-
- root = d_alloc_root(root_inode);
+ root = d_make_root(root_inode);
if (!root)
- goto fail_iput;
+ goto fail_ino;
pipe = NULL;
root->d_fsdata = ino;
/* Can this call block? */
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
- &sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
- &sbi->max_proto)) {
+ &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
+ &sbi->max_proto)) {
printk("autofs: called with bogus options\n");
goto fail_dput;
}
+ if (pgrp_set) {
+ sbi->oz_pgrp = find_get_pid(pgrp);
+ if (!sbi->oz_pgrp) {
+ pr_warn("autofs: could not find process group %d\n",
+ pgrp);
+ goto fail_dput;
+ }
+ } else {
+ sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+ }
+
if (autofs_type_trigger(sbi->type))
__managed_dentry_set_managed(root);
@@ -285,14 +304,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
+ DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
-
+
if (!pipe) {
printk("autofs: could not open pipe file descriptor\n");
goto fail_dput;
}
- if (!pipe->f_op || !pipe->f_op->write)
+ ret = autofs_prepare_pipe(pipe);
+ if (ret < 0)
goto fail_fput;
sbi->pipe = pipe;
sbi->pipefd = pipefd;
@@ -314,19 +334,16 @@ fail_fput:
fail_dput:
dput(root);
goto fail_free;
-fail_iput:
- printk("autofs: get root dentry failed\n");
- iput(root_inode);
fail_ino:
kfree(ino);
fail_free:
+ put_pid(sbi->oz_pgrp);
kfree(sbi);
s->s_fs_info = NULL;
-fail_unlock:
- return -EINVAL;
+ return ret;
}
-struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
+struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
{
struct inode *inode = new_inode(sb);
@@ -342,7 +359,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
inode->i_ino = get_next_ino();
if (S_ISDIR(mode)) {
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_op = &autofs4_dir_inode_operations;
inode->i_fop = &autofs4_dir_operations;
} else if (S_ISLNK(mode)) {
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 96804a17bbd..cc87c1abac9 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -26,13 +26,13 @@
static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
static int autofs4_dir_unlink(struct inode *,struct dentry *);
static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
+static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
#ifdef CONFIG_COMPAT
static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
#endif
static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
+static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
static struct vfsmount *autofs4_d_automount(struct path *);
static int autofs4_d_manage(struct dentry *, bool);
static void autofs4_dentry_release(struct dentry *);
@@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = {
.open = dcache_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
- .readdir = dcache_readdir,
+ .iterate = dcache_readdir,
.llseek = dcache_dir_lseek,
.unlocked_ioctl = autofs4_root_ioctl,
#ifdef CONFIG_COMPAT
@@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = {
.open = autofs4_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
- .readdir = dcache_readdir,
+ .iterate = dcache_readdir,
.llseek = dcache_dir_lseek,
};
@@ -124,13 +124,10 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
* it.
*/
spin_lock(&sbi->lookup_lock);
- spin_lock(&dentry->d_lock);
- if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
- spin_unlock(&dentry->d_lock);
+ if (!d_mountpoint(dentry) && simple_empty(dentry)) {
spin_unlock(&sbi->lookup_lock);
return -ENOENT;
}
- spin_unlock(&dentry->d_lock);
spin_unlock(&sbi->lookup_lock);
out:
@@ -182,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
spin_lock(&active->d_lock);
/* Already gone? */
- if (active->d_count == 0)
+ if ((int) d_count(active) <= 0)
goto next;
qstr = &active->d_name;
@@ -233,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
spin_lock(&expiring->d_lock);
- /* Bad luck, we've already been dentry_iput */
+ /* We've already been dentry_iput or unlinked */
if (!expiring->d_inode)
goto next;
@@ -355,7 +352,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
status = autofs4_mount_wait(dentry);
if (status)
return ERR_PTR(status);
- spin_lock(&sbi->fs_lock);
goto done;
}
@@ -364,8 +360,11 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* having d_mountpoint() true, so there's no need to call back
* to the daemon.
*/
- if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
+ if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ spin_unlock(&sbi->fs_lock);
goto done;
+ }
+
if (!d_mountpoint(dentry)) {
/*
* It's possible that user space hasn't removed directories
@@ -379,46 +378,28 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* require user space behave.
*/
if (sbi->version > 4) {
- if (have_submounts(dentry))
+ if (have_submounts(dentry)) {
+ spin_unlock(&sbi->fs_lock);
goto done;
+ }
} else {
- spin_lock(&dentry->d_lock);
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&dentry->d_lock);
+ if (!simple_empty(dentry)) {
+ spin_unlock(&sbi->fs_lock);
goto done;
}
- spin_unlock(&dentry->d_lock);
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
status = autofs4_mount_wait(dentry);
- if (status)
- return ERR_PTR(status);
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_PENDING;
- }
-done:
- if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
- /*
- * Any needed mounting has been completed and the path
- * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
- * call ->d_automount() on rootless multi-mounts since
- * it can lead to an incorrect ELOOP error return.
- *
- * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
- * symlinks as in all other cases the dentry will be covered by
- * an actual mount so ->d_automount() won't be called during
- * the follow.
- */
- spin_lock(&dentry->d_lock);
- if ((!d_mountpoint(dentry) &&
- !list_empty(&dentry->d_subdirs)) ||
- (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
- __managed_dentry_clear_automount(dentry);
- spin_unlock(&dentry->d_lock);
+ if (status) {
+ spin_unlock(&sbi->fs_lock);
+ return ERR_PTR(status);
+ }
}
spin_unlock(&sbi->fs_lock);
-
+done:
/* Mount succeeded, check if we ended up with a new dentry */
dentry = autofs4_mountpoint_changed(path);
if (!dentry)
@@ -427,9 +408,11 @@ done:
return NULL;
}
-int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
+static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ int status;
DPRINTK("dentry=%p %.*s",
dentry, dentry->d_name.len, dentry->d_name.name);
@@ -454,11 +437,36 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
* This dentry may be under construction so wait on mount
* completion.
*/
- return autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry);
+ if (status)
+ return status;
+
+ spin_lock(&sbi->fs_lock);
+ /*
+ * If the dentry has been selected for expire while we slept
+ * on the lock then it might go away. We'll deal with that in
+ * ->d_automount() and wait on a new mount if the expire
+ * succeeds or return here if it doesn't (since there's no
+ * mount to follow with a rootless multi-mount).
+ */
+ if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
+ /*
+ * Any needed mounting has been completed and the path
+ * updated so check if this is a rootless multi-mount so
+ * we can avoid needless calls ->d_automount() and avoid
+ * an incorrect ELOOP error return.
+ */
+ if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
+ (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
+ status = -EISDIR;
+ }
+ spin_unlock(&sbi->fs_lock);
+
+ return status;
}
/* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
@@ -550,7 +558,7 @@ static int autofs4_dir_symlink(struct inode *dir,
dget(dentry);
atomic_inc(&ino->count);
p_ino = autofs4_dentry_ino(dentry->d_parent);
- if (p_ino && dentry->d_parent != dentry)
+ if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
dir->i_mtime = CURRENT_TIME;
@@ -581,11 +589,11 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
/* This allows root to remove symlinks */
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs4_dentry_ino(dentry->d_parent);
- if (p_ino && dentry->d_parent != dentry)
+ if (p_ino && !IS_ROOT(dentry))
atomic_dec(&p_ino->count);
}
dput(ino->dentry);
@@ -597,9 +605,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
spin_lock(&sbi->lookup_lock);
__autofs4_add_expiring(dentry);
- spin_lock(&dentry->d_lock);
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
+ d_drop(dentry);
spin_unlock(&sbi->lookup_lock);
return 0;
@@ -612,7 +618,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
* set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
* of the directory tree. There is no need to clear the automount flag
* following a mount or restore it after an expire because these mounts
- * are always covered. However, it is neccessary to ensure that these
+ * are always covered. However, it is necessary to ensure that these
* flags are clear on non-empty directories to avoid unnecessary calls
* during path walks.
*/
@@ -670,15 +676,12 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
return -EACCES;
spin_lock(&sbi->lookup_lock);
- spin_lock(&dentry->d_lock);
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&dentry->d_lock);
+ if (!simple_empty(dentry)) {
spin_unlock(&sbi->lookup_lock);
return -ENOTEMPTY;
}
__autofs4_add_expiring(dentry);
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
+ d_drop(dentry);
spin_unlock(&sbi->lookup_lock);
if (sbi->version < 5)
@@ -699,7 +702,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -729,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dget(dentry);
atomic_inc(&ino->count);
p_ino = autofs4_dentry_ino(dentry->d_parent);
- if (p_ino && dentry->d_parent != dentry)
+ if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
inc_nlink(dir);
dir->i_mtime = CURRENT_TIME;
@@ -873,7 +876,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
static long autofs4_root_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
}
@@ -881,7 +884,7 @@ static long autofs4_root_ioctl(struct file *filp,
static long autofs4_root_compat_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int ret;
if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index f27c094a191..1e8ea192be2 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,6 +14,10 @@
static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
{
+ struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ if (ino && !autofs4_oz_mode(sbi))
+ ino->last_used = jiffies;
nd_set_link(nd, dentry->d_inode->i_private);
return NULL;
}
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 25435987d6a..116fd38ee47 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -42,10 +42,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
while (wq) {
nwq = wq->next;
wq->status = -ENOENT; /* Magic is gone - report failure */
- if (wq->name.name) {
- kfree(wq->name.name);
- wq->name.name = NULL;
- }
+ kfree(wq->name.name);
+ wq->name.name = NULL;
wq->wait_ctr--;
wake_up_interruptible(&wq->queue);
wq = nwq;
@@ -56,26 +54,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
mutex_unlock(&sbi->wq_mutex);
}
-static int autofs4_write(struct file *file, const void *addr, int bytes)
+static int autofs4_write(struct autofs_sb_info *sbi,
+ struct file *file, const void *addr, int bytes)
{
unsigned long sigpipe, flags;
mm_segment_t fs;
const char *data = (const char *)addr;
ssize_t wr = 0;
- /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-
sigpipe = sigismember(&current->pending.signal, SIGPIPE);
/* Save pointer to user space and point back to kernel space */
fs = get_fs();
set_fs(KERNEL_DS);
+ mutex_lock(&sbi->pipe_mutex);
while (bytes &&
(wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
data += wr;
bytes -= wr;
}
+ mutex_unlock(&sbi->pipe_mutex);
set_fs(fs);
@@ -104,12 +103,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
size_t pktsz;
DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
- wq->wait_queue_token, wq->name.len, wq->name.name, type);
+ (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
memset(&pkt,0,sizeof pkt); /* For security reasons */
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = type;
+
switch (type) {
/* Kernel protocol v4 missing and expire packets */
case autofs_ptype_missing:
@@ -146,6 +146,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
case autofs_ptype_expire_direct:
{
struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
+ struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
pktsz = sizeof(*packet);
@@ -155,30 +156,25 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
packet->name[wq->name.len] = '\0';
packet->dev = wq->dev;
packet->ino = wq->ino;
- packet->uid = wq->uid;
- packet->gid = wq->gid;
+ packet->uid = from_kuid_munged(user_ns, wq->uid);
+ packet->gid = from_kgid_munged(user_ns, wq->gid);
packet->pid = wq->pid;
packet->tgid = wq->tgid;
break;
}
default:
printk("autofs4_notify_daemon: bad type %d!\n", type);
+ mutex_unlock(&sbi->wq_mutex);
return;
}
- /* Check if we have become catatonic */
- mutex_lock(&sbi->wq_mutex);
- if (!sbi->catatonic) {
- pipe = sbi->pipe;
- get_file(pipe);
- }
+ pipe = get_file(sbi->pipe);
+
mutex_unlock(&sbi->wq_mutex);
- if (pipe) {
- if (autofs4_write(pipe, &pkt, pktsz))
- autofs4_catatonic_mode(sbi);
- fput(pipe);
- }
+ if (autofs4_write(sbi, pipe, &pkt, pktsz))
+ autofs4_catatonic_mode(sbi);
+ fput(pipe);
}
static int autofs4_getpath(struct autofs_sb_info *sbi,
@@ -257,6 +253,9 @@ static int validate_request(struct autofs_wait_queue **wait,
struct autofs_wait_queue *wq;
struct autofs_info *ino;
+ if (sbi->catatonic)
+ return -ENOENT;
+
/* Wait in progress, continue; */
wq = autofs4_find_wait(sbi, qstr);
if (wq) {
@@ -289,6 +288,9 @@ static int validate_request(struct autofs_wait_queue **wait,
if (mutex_lock_interruptible(&sbi->wq_mutex))
return -EINTR;
+ if (sbi->catatonic)
+ return -ENOENT;
+
wq = autofs4_find_wait(sbi, qstr);
if (wq) {
*wait = wq;
@@ -345,11 +347,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
struct qstr qstr;
char *name;
int status, ret, type;
+ pid_t pid;
+ pid_t tgid;
/* In catatonic mode, we don't wait for nobody */
if (sbi->catatonic)
return -ENOENT;
+ /*
+ * Try translating pids to the namespace of the daemon.
+ *
+ * Zero means failure: we are in an unrelated pid namespace.
+ */
+ pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+ tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+ if (pid == 0 || tgid == 0)
+ return -ENOENT;
+
if (!dentry->d_inode) {
/*
* A wait for a negative dentry is invalid for certain
@@ -389,7 +403,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
ret = validate_request(&wq, sbi, &qstr, dentry, notify);
if (ret <= 0) {
- if (ret == 0)
+ if (ret != -EINTR)
mutex_unlock(&sbi->wq_mutex);
kfree(qstr.name);
return ret;
@@ -415,11 +429,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
wq->ino = autofs4_get_ino(sbi);
wq->uid = current_uid();
wq->gid = current_gid();
- wq->pid = current->pid;
- wq->tgid = current->tgid;
+ wq->pid = pid;
+ wq->tgid = tgid;
wq->status = -EINTR; /* Status return if interrupted */
wq->wait_ctr = 2;
- mutex_unlock(&sbi->wq_mutex);
if (sbi->version < 5) {
if (notify == NFY_MOUNT)
@@ -441,15 +454,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
(unsigned long) wq->wait_queue_token, wq->name.len,
wq->name.name, notify);
- /* autofs4_notify_daemon() may block */
+ /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
autofs4_notify_daemon(sbi, wq, type);
} else {
wq->wait_ctr++;
- mutex_unlock(&sbi->wq_mutex);
- kfree(qstr.name);
DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
(unsigned long) wq->wait_queue_token, wq->name.len,
wq->name.name, notify);
+ mutex_unlock(&sbi->wq_mutex);
+ kfree(qstr.name);
}
/*
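The waitq changes combine two namespace-translation patterns: credentials are converted into the daemon's user namespace with from_kuid_munged()/from_kgid_munged() before being copied into the packet, and pids are translated with task_pid_nr_ns() against the pid namespace owning oz_pgrp, bailing out when the caller is unrelated. A minimal sketch of the same idea follows; the structure and function names (demo_packet, demo_fill_packet) are hypothetical and not part of autofs4.

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/uidgid.h>

/* Hypothetical packet: ids as the daemon on the far end of the pipe
 * should see them, i.e. plain numeric values, not kuid_t/kgid_t. */
struct demo_packet {
	uid_t uid;
	gid_t gid;
	pid_t pid;
};

/* Fill the packet for a daemon described by @daemon_cred and the
 * process group @daemon_pgrp (both assumed to come from the sb info). */
static int demo_fill_packet(struct demo_packet *pkt,
			    const struct cred *daemon_cred,
			    struct pid *daemon_pgrp)
{
	struct user_namespace *user_ns = daemon_cred->user_ns;
	struct pid_namespace *pid_ns = ns_of_pid(daemon_pgrp);

	/* Zero means the caller lives in an unrelated pid namespace. */
	pkt->pid = task_pid_nr_ns(current, pid_ns);
	if (!pkt->pid)
		return -ENOENT;

	/* The munged conversions map unmappable ids to the overflow id
	 * instead of failing, matching what the packet fields expect. */
	pkt->uid = from_kuid_munged(user_ns, current_uid());
	pkt->gid = from_kgid_munged(user_ns, current_gid());
	return 0;
}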
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 9ad2369d9e3..7c93953030f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -9,14 +9,14 @@
*/
#include <linux/fs.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/stat.h>
#include <linux/time.h>
#include <linux/namei.h>
#include <linux/poll.h>
-static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence)
{
return -EIO;
}
@@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return -EIO;
}
-static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int bad_file_readdir(struct file *file, struct dir_context *ctx)
{
return -EIO;
}
@@ -87,7 +87,8 @@ static int bad_file_release(struct inode *inode, struct file *filp)
return -EIO;
}
-static int bad_file_fsync(struct file *file, int datasync)
+static int bad_file_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
{
return -EIO;
}
@@ -151,7 +152,7 @@ static const struct file_operations bad_file_ops =
.write = bad_file_write,
.aio_read = bad_file_aio_read,
.aio_write = bad_file_aio_write,
- .readdir = bad_file_readdir,
+ .iterate = bad_file_readdir,
.poll = bad_file_poll,
.unlocked_ioctl = bad_file_unlocked_ioctl,
.compat_ioctl = bad_file_compat_ioctl,
@@ -172,13 +173,13 @@ static const struct file_operations bad_file_ops =
};
static int bad_inode_create (struct inode *dir, struct dentry *dentry,
- int mode, struct nameidata *nd)
+ umode_t mode, bool excl)
{
return -EIO;
}
static struct dentry *bad_inode_lookup(struct inode *dir,
- struct dentry *dentry, struct nameidata *nd)
+ struct dentry *dentry, unsigned int flags)
{
return ERR_PTR(-EIO);
}
@@ -201,7 +202,7 @@ static int bad_inode_symlink (struct inode *dir, struct dentry *dentry,
}
static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry,
- int mode)
+ umode_t mode)
{
return -EIO;
}
@@ -212,7 +213,7 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
}
static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
return -EIO;
}
@@ -229,11 +230,8 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
return -EIO;
}
-static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
+static int bad_inode_permission(struct inode *inode, int mask)
{
- if (flags & IPERM_FLAG_RCU)
- return -ECHILD;
-
return -EIO;
}
@@ -294,7 +292,6 @@ static const struct inode_operations bad_inode_ops =
.getxattr = bad_inode_getxattr,
.listxattr = bad_inode_listxattr,
.removexattr = bad_inode_removexattr,
- /* truncate_range returns void */
};
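One of the prototype updates above is that ->fsync() now receives the byte range to flush in addition to the datasync flag. bad_inode only returns -EIO, but for a filesystem that actually writes data the minimal range-aware implementation usually defers to the page-cache helper; a sketch under that assumption (demo_fsync is a made-up name):

#include <linux/fs.h>

static int demo_fsync(struct file *file, loff_t start, loff_t end,
		      int datasync)
{
	/* The VFS now hands over the byte range; flush and wait on
	 * exactly that span of the file's page cache. */
	return filemap_write_and_wait_range(file->f_mapping, start, end);
}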
diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog
index ce8c787916b..75a461cfaca 100644
--- a/fs/befs/ChangeLog
+++ b/fs/befs/ChangeLog
@@ -24,7 +24,7 @@ Version 0.9 (2002-03-14)
Version 0.64 (2002-02-07)
==========
-* Did the string comparision really right this time (btree.c) [WD]
+* Did the string comparison really right this time (btree.c) [WD]
* Fixed up some places where I assumed that a long int could hold
a pointer value. (btree.c) [WD]
@@ -114,7 +114,7 @@ Version 0.6 (2001-12-15)
More flexible. Will soon be controllable at mount time
(see TODO). [WD]
-* Rewrote datastream positon lookups.
+* Rewrote datastream position lookups.
(datastream.c) [WD]
* Moved the TODO list to its own file.
@@ -150,7 +150,7 @@ Version 0.50 (2001-11-13)
* Anton also told me that the blocksize is not allowed to be larger than
the page size in linux, which is 4k i386. Oops. Added a test for
(blocksize > PAGE_SIZE), and refuse to mount in that case. What this
- practicaly means is that 8k blocksize volumes won't work without a major
+ practically means is that 8k blocksize volumes won't work without a major
restructuring of the driver (or an alpha or other 64bit hardware). [WD]
* Cleaned up the befs_count_blocks() function. Much smarter now.
@@ -183,7 +183,7 @@ Version 0.45 (2001-10-29)
structures into the generic pointer fields of the public structures
with kmalloc(). put_super and put_inode free them. This allows us not
to have to touch the definitions of the public structures in
- include/linux/fs.h. Also, befs_inode_info is huge (becuase of the
+ include/linux/fs.h. Also, befs_inode_info is huge (because of the
symlink string). (super.c, inode.c, befs_fs.h) [WD]
* Fixed a thinko that was corrupting file reads after the first block_run
@@ -404,7 +404,7 @@ Version 0.4 (2001-10-28)
* Fixed compile errors on 2.4.1 kernel (WD)
Resolve rejected patches
- Accomodate changed NLS interface (util.h)
+ Accommodate changed NLS interface (util.h)
Needed to include <linux/slab.h> in most files
Makefile changes
fs/Config.in changes
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index 7835d30f211..edc5cc2aefa 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -1,6 +1,6 @@
config BEFS_FS
- tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "BeOS file system (BeFS) support (read only)"
+ depends on BLOCK
select NLS
help
The BeOS File System (BeFS) is the native file system of Be, Inc's
diff --git a/fs/befs/Makefile b/fs/befs/Makefile
index 2f370bd7a50..8b9f66642a8 100644
--- a/fs/befs/Makefile
+++ b/fs/befs/Makefile
@@ -3,5 +3,5 @@
#
obj-$(CONFIG_BEFS_FS) += befs.o
-
+ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG
befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index d9a40abda6b..3a7813ab8c9 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -20,8 +20,8 @@ typedef u64 befs_blocknr_t;
*/
typedef struct befs_mount_options {
- gid_t gid;
- uid_t uid;
+ kgid_t gid;
+ kuid_t uid;
int use_gid;
int use_uid;
int debug;
@@ -88,8 +88,11 @@ enum befs_err {
/****************************/
/* debug.c */
+__printf(2, 3)
void befs_error(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
void befs_warning(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
void befs_debug(const struct super_block *sb, const char *fmt, ...);
void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
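The __printf(2, 3) annotations added to befs.h expand to gcc's format attribute, letting the compiler check the format string (argument 2) against the variadic arguments (from position 3 on). A small sketch of the same annotation on a hypothetical helper, demo_log:

#include <linux/fs.h>
#include <linux/kernel.h>

/* Argument 2 is the printf-style format; variadic arguments start at 3. */
__printf(2, 3)
void demo_log(const struct super_block *sb, const char *fmt, ...);

/*
 * With the attribute in place the compiler flags mismatches, e.g.
 *	demo_log(sb, "block %llu", 17);		// warns: %llu vs. int
 */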
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index 7893eaa1e58..eb557d9dc8b 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -234,7 +234,7 @@ typedef struct {
} PACKED befs_btree_super;
/*
- * Header stucture of each btree node
+ * Header structure of each btree node
*/
typedef struct {
fs64 left;
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 4202db7496c..9c7faa8a928 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -5,7 +5,7 @@
*
* Licensed under the GNU GPL. See the file COPYING for details.
*
- * 2002-02-05: Sergey S. Kostyliov added binary search withing
+ * 2002-02-05: Sergey S. Kostyliov added binary search within
* btree nodes.
*
* Many thanks to:
@@ -137,7 +137,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
struct buffer_head *bh = NULL;
befs_disk_btree_super *od_sup = NULL;
- befs_debug(sb, "---> befs_btree_read_super()");
+ befs_debug(sb, "---> %s", __func__);
bh = befs_read_datastream(sb, ds, 0, NULL);
@@ -162,11 +162,11 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- befs_debug(sb, "<--- befs_btree_read_super()");
+ befs_debug(sb, "<--- %s", __func__);
return BEFS_OK;
error:
- befs_debug(sb, "<--- befs_btree_read_super() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -195,16 +195,16 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
{
uint off = 0;
- befs_debug(sb, "---> befs_bt_read_node()");
+ befs_debug(sb, "---> %s", __func__);
if (node->bh)
brelse(node->bh);
node->bh = befs_read_datastream(sb, ds, node_off, &off);
if (!node->bh) {
- befs_error(sb, "befs_bt_read_node() failed to read "
- "node at %Lu", node_off);
- befs_debug(sb, "<--- befs_bt_read_node() ERROR");
+ befs_error(sb, "%s failed to read "
+ "node at %llu", __func__, node_off);
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -221,7 +221,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
node->head.all_key_length =
fs16_to_cpu(sb, node->od_node->all_key_length);
- befs_debug(sb, "<--- befs_btree_read_node()");
+ befs_debug(sb, "<--- %s", __func__);
return BEFS_OK;
}
@@ -252,7 +252,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
befs_off_t node_off;
int res;
- befs_debug(sb, "---> befs_btree_find() Key: %s", key);
+ befs_debug(sb, "---> %s Key: %s", __func__, key);
if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
befs_error(sb,
@@ -263,7 +263,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
this_node = kmalloc(sizeof (befs_btree_node),
GFP_NOFS);
if (!this_node) {
- befs_error(sb, "befs_btree_find() failed to allocate %u "
+ befs_error(sb, "befs_btree_find() failed to allocate %zu "
"bytes of memory", sizeof (befs_btree_node));
goto error;
}
@@ -274,7 +274,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
node_off = bt_super.root_node_ptr;
if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
befs_error(sb, "befs_btree_find() failed to read "
- "node at %Lu", node_off);
+ "node at %llu", node_off);
goto error_alloc;
}
@@ -285,7 +285,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
/* if no match, go to overflow node */
if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
befs_error(sb, "befs_btree_find() failed to read "
- "node at %Lu", node_off);
+ "node at %llu", node_off);
goto error_alloc;
}
}
@@ -298,11 +298,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
kfree(this_node);
if (res != BEFS_BT_MATCH) {
- befs_debug(sb, "<--- befs_btree_find() Key %s not found", key);
+ befs_debug(sb, "<--- %s Key %s not found", __func__, key);
*value = 0;
return BEFS_BT_NOT_FOUND;
}
- befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu",
+ befs_debug(sb, "<--- %s Found key %s, value %llu", __func__,
key, *value);
return BEFS_OK;
@@ -310,7 +310,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
kfree(this_node);
error:
*value = 0;
- befs_debug(sb, "<--- befs_btree_find() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -318,7 +318,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
* befs_find_key - Search for a key within a node
* @sb: Filesystem superblock
* @node: Node to find the key within
- * @key: Keystring to search for
+ * @findkey: Keystring to search for
* @value: If key is found, the value stored with the key is put here
*
* finds exact match if one exists, and returns BEFS_BT_MATCH
@@ -343,7 +343,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
char *thiskey;
fs64 *valarray;
- befs_debug(sb, "---> befs_find_key() %s", findkey);
+ befs_debug(sb, "---> %s %s", __func__, findkey);
*value = 0;
@@ -355,7 +355,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len);
if (eq < 0) {
- befs_debug(sb, "<--- befs_find_key() %s not found", findkey);
+ befs_debug(sb, "<--- %s %s not found", __func__, findkey);
return BEFS_BT_NOT_FOUND;
}
@@ -373,8 +373,8 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
findkey_len);
if (eq == 0) {
- befs_debug(sb, "<--- befs_find_key() found %s at %d",
- thiskey, mid);
+ befs_debug(sb, "<--- %s found %s at %d",
+ __func__, thiskey, mid);
*value = fs64_to_cpu(sb, valarray[mid]);
return BEFS_BT_MATCH;
@@ -388,7 +388,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
*value = fs64_to_cpu(sb, valarray[mid + 1]);
else
*value = fs64_to_cpu(sb, valarray[mid]);
- befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid);
+ befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid);
return BEFS_BT_PARMATCH;
}
@@ -405,7 +405,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 * Here's how it works: Key_no is the index of the key/value pair to
* return in keybuf/value.
* Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is
- * the number of charecters in the key (just a convenience).
+ * the number of characters in the key (just a convenience).
*
* Algorithm:
* Get the first leafnode of the tree. See if the requested key is in that
@@ -428,7 +428,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
uint key_sum = 0;
- befs_debug(sb, "---> befs_btree_read()");
+ befs_debug(sb, "---> %s", __func__);
if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
befs_error(sb,
@@ -436,9 +436,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- if ((this_node = (befs_btree_node *)
- kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
- befs_error(sb, "befs_btree_read() failed to allocate %u "
+ if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
+ befs_error(sb, "befs_btree_read() failed to allocate %zu "
"bytes of memory", sizeof (befs_btree_node));
goto error;
}
@@ -453,7 +452,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
kfree(this_node);
*value = 0;
*keysize = 0;
- befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY");
+ befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
return BEFS_BT_EMPTY;
} else if (res == BEFS_ERR) {
goto error_alloc;
@@ -468,7 +467,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
*keysize = 0;
*value = 0;
befs_debug(sb,
- "<--- befs_btree_read() END of keys at %Lu",
+ "<--- %s END of keys at %llu", __func__,
+ (unsigned long long)
key_sum + this_node->head.all_key_count);
brelse(this_node->bh);
kfree(this_node);
@@ -479,8 +479,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
node_off = this_node->head.right;
if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
- befs_error(sb, "befs_btree_read() failed to read "
- "node at %Lu", node_off);
+ befs_error(sb, "%s failed to read node at %llu",
+ __func__, (unsigned long long)node_off);
goto error_alloc;
}
}
@@ -493,27 +493,28 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen);
- befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen);
+ befs_debug(sb, "Read [%llu,%d]: keysize %d",
+ (long long unsigned int)node_off, (int)cur_key,
+ (int)keylen);
if (bufsize < keylen + 1) {
- befs_error(sb, "befs_btree_read() keybuf too small (%u) "
- "for key of size %d", bufsize, keylen);
+ befs_error(sb, "%s keybuf too small (%zu) "
+ "for key of size %d", __func__, bufsize, keylen);
brelse(this_node->bh);
goto error_alloc;
- };
+ }
- strncpy(keybuf, keystart, keylen);
+ strlcpy(keybuf, keystart, keylen + 1);
*value = fs64_to_cpu(sb, valarray[cur_key]);
*keysize = keylen;
- keybuf[keylen] = '\0';
- befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off,
+ befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off,
cur_key, keylen, keybuf, *value);
brelse(this_node->bh);
kfree(this_node);
- befs_debug(sb, "<--- befs_btree_read()");
+ befs_debug(sb, "<--- %s", __func__);
return BEFS_OK;
@@ -523,7 +524,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
error:
*keysize = 0;
*value = 0;
- befs_debug(sb, "<--- befs_btree_read() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -548,26 +549,26 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
befs_off_t * node_off)
{
- befs_debug(sb, "---> befs_btree_seekleaf()");
+ befs_debug(sb, "---> %s", __func__);
if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
- befs_error(sb, "befs_btree_seekleaf() failed to read "
- "node at %Lu", *node_off);
+ befs_error(sb, "%s failed to read "
+ "node at %llu", __func__, *node_off);
goto error;
}
- befs_debug(sb, "Seekleaf to root node %Lu", *node_off);
+ befs_debug(sb, "Seekleaf to root node %llu", *node_off);
if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) {
- befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY");
+ befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
return BEFS_BT_EMPTY;
}
while (!befs_leafnode(this_node)) {
if (this_node->head.all_key_count == 0) {
- befs_debug(sb, "befs_btree_seekleaf() encountered "
- "an empty interior node: %Lu. Using Overflow "
- "node: %Lu", *node_off,
+ befs_debug(sb, "%s encountered "
+ "an empty interior node: %llu. Using Overflow "
+ "node: %llu", __func__, *node_off,
this_node->head.overflow);
*node_off = this_node->head.overflow;
} else {
@@ -575,19 +576,19 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
*node_off = fs64_to_cpu(sb, valarray[0]);
}
if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
- befs_error(sb, "befs_btree_seekleaf() failed to read "
- "node at %Lu", *node_off);
+ befs_error(sb, "%s failed to read "
+ "node at %llu", __func__, *node_off);
goto error;
}
- befs_debug(sb, "Seekleaf to child node %Lu", *node_off);
+ befs_debug(sb, "Seekleaf to child node %llu", *node_off);
}
- befs_debug(sb, "Node %Lu is a leaf node", *node_off);
+ befs_debug(sb, "Node %llu is a leaf node", *node_off);
return BEFS_OK;
error:
- befs_debug(sb, "<--- befs_btree_seekleaf() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -705,7 +706,7 @@ befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
* @key1: pointer to the first key to be compared
* @keylen1: length in bytes of key1
* @key2: pointer to the second key to be compared
- * @kelen2: length in bytes of key2
+ * @keylen2: length in bytes of key2
*
* Returns 0 if @key1 and @key2 are equal.
* Returns >0 if @key1 is greater.
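Two cleanups recur throughout the btree.c hunks: debug strings print __func__ instead of a hard-coded (and occasionally stale) function name, and 64-bit values use the standard %llu conversion with an explicit cast rather than the non-portable %Lu. A reduced sketch of the pattern, with demo_trace and demo_off_t standing in for the BeFS names:

#include <linux/kernel.h>
#include <linux/printk.h>

typedef u64 demo_off_t;		/* stands in for befs_off_t */

static void demo_trace(demo_off_t node_off)
{
	/* __func__ stays correct across renames; the explicit cast plus
	 * %llu prints a 64-bit value portably on 32- and 64-bit hosts. */
	pr_debug("---> %s node at %llu\n", __func__,
		 (unsigned long long)node_off);
}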
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index 59096b5e0fc..1e8e0b8d883 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -52,26 +52,25 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds,
befs_block_run run;
	befs_blocknr_t block;	/* block corresponding to pos */
- befs_debug(sb, "---> befs_read_datastream() %Lu", pos);
+ befs_debug(sb, "---> %s %llu", __func__, pos);
block = pos >> BEFS_SB(sb)->block_shift;
if (off)
*off = pos - (block << BEFS_SB(sb)->block_shift);
if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) {
befs_error(sb, "BeFS: Error finding disk addr of block %lu",
- block);
- befs_debug(sb, "<--- befs_read_datastream() ERROR");
+ (unsigned long)block);
+ befs_debug(sb, "<--- %s ERROR", __func__);
return NULL;
}
bh = befs_bread_iaddr(sb, run);
if (!bh) {
befs_error(sb, "BeFS: Error reading block %lu from datastream",
- block);
+ (unsigned long)block);
return NULL;
}
- befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu",
- pos);
+ befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos);
return bh;
}
@@ -106,7 +105,8 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
} else {
befs_error(sb,
"befs_fblock2brun() was asked to find block %lu, "
- "which is not mapped by the datastream\n", fblock);
+ "which is not mapped by the datastream\n",
+ (unsigned long)fblock);
err = BEFS_ERR;
}
return err;
@@ -116,7 +116,7 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
* befs_read_lsmylink - read long symlink from datastream.
* @sb: Filesystem superblock
 * @ds: Datastream to read from
- * @buf: Buffer in which to place long symlink data
+ * @buff: Buffer in which to place long symlink data
* @len: Length of the long symlink in bytes
*
* Returns the number of bytes read
@@ -128,14 +128,14 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
	befs_off_t bytes_read = 0;	/* bytes read */
u16 plen;
struct buffer_head *bh = NULL;
- befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len);
+ befs_debug(sb, "---> %s length: %llu", __func__, len);
while (bytes_read < len) {
bh = befs_read_datastream(sb, ds, bytes_read, NULL);
if (!bh) {
befs_error(sb, "BeFS: Error reading datastream block "
- "starting from %Lu", bytes_read);
- befs_debug(sb, "<--- befs_read_lsymlink() ERROR");
+ "starting from %llu", bytes_read);
+ befs_debug(sb, "<--- %s ERROR", __func__);
return bytes_read;
}
@@ -146,7 +146,8 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
bytes_read += plen;
}
- befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read);
+ befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int)
+ bytes_read);
return bytes_read;
}
@@ -169,7 +170,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
befs_blocknr_t metablocks; /* FS metadata blocks */
befs_sb_info *befs_sb = BEFS_SB(sb);
- befs_debug(sb, "---> befs_count_blocks()");
+ befs_debug(sb, "---> %s", __func__);
datablocks = ds->size >> befs_sb->block_shift;
if (ds->size & (befs_sb->block_size - 1))
@@ -206,7 +207,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
}
blocks = datablocks + metablocks;
- befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks);
+ befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks);
return blocks;
}
@@ -251,11 +252,11 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
befs_blocknr_t max_block =
data->max_direct_range >> BEFS_SB(sb)->block_shift;
- befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno);
+ befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
if (blockno > max_block) {
- befs_error(sb, "befs_find_brun_direct() passed block outside of"
- "direct region");
+ befs_error(sb, "%s passed block outside of direct region",
+ __func__);
return BEFS_ERR;
}
@@ -267,13 +268,14 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
run->start = array[i].start + offset;
run->len = array[i].len - offset;
- befs_debug(sb, "---> befs_find_brun_direct(), "
- "found %lu at direct[%d]", blockno, i);
+ befs_debug(sb, "---> %s, "
+ "found %lu at direct[%d]", __func__,
+ (unsigned long)blockno, i);
return BEFS_OK;
}
}
- befs_debug(sb, "---> befs_find_brun_direct() ERROR");
+ befs_debug(sb, "---> %s ERROR", __func__);
return BEFS_ERR;
}
@@ -316,7 +318,7 @@ befs_find_brun_indirect(struct super_block *sb,
befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
int arraylen = befs_iaddrs_per_block(sb);
- befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno);
+ befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift;
search_blk = blockno - indir_start_blk;
@@ -325,10 +327,9 @@ befs_find_brun_indirect(struct super_block *sb,
for (i = 0; i < indirect.len; i++) {
indirblock = befs_bread(sb, indirblockno + i);
if (indirblock == NULL) {
- befs_debug(sb,
- "---> befs_find_brun_indirect() failed to "
- "read disk block %lu from the indirect brun",
- indirblockno + i);
+ befs_debug(sb, "---> %s failed to read "
+ "disk block %lu from the indirect brun",
+ __func__, (unsigned long)indirblockno + i);
return BEFS_ERR;
}
@@ -348,9 +349,10 @@ befs_find_brun_indirect(struct super_block *sb,
brelse(indirblock);
befs_debug(sb,
- "<--- befs_find_brun_indirect() found "
- "file block %lu at indirect[%d]",
- blockno, j + (i * arraylen));
+ "<--- %s found file block "
+ "%lu at indirect[%d]", __func__,
+ (unsigned long)blockno,
+ j + (i * arraylen));
return BEFS_OK;
}
sum += len;
@@ -360,10 +362,10 @@ befs_find_brun_indirect(struct super_block *sb,
}
/* Only fallthrough is an error */
- befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find "
- "file block %lu", blockno);
+ befs_error(sb, "BeFS: %s failed to find "
+ "file block %lu", __func__, (unsigned long)blockno);
- befs_debug(sb, "<--- befs_find_brun_indirect() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -444,7 +446,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
size_t diblklen = iblklen * befs_iaddrs_per_block(sb)
* BEFS_DBLINDIR_BRUN_LEN;
- befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno);
+ befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno);
/* First, discover which of the double_indir->indir blocks
* contains pos. Then figure out how much of pos that
@@ -460,8 +462,9 @@ befs_find_brun_dblindirect(struct super_block *sb,
dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb);
if (dbl_which_block > data->double_indirect.len) {
befs_error(sb, "The double-indirect index calculated by "
- "befs_read_brun_dblindirect(), %d, is outside the range "
- "of the double-indirect block", dblindir_indx);
+ "%s, %d, is outside the range "
+ "of the double-indirect block", __func__,
+ dblindir_indx);
return BEFS_ERR;
}
@@ -469,10 +472,10 @@ befs_find_brun_dblindirect(struct super_block *sb,
befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) +
dbl_which_block);
if (dbl_indir_block == NULL) {
- befs_error(sb, "befs_read_brun_dblindirect() couldn't read the "
- "double-indirect block at blockno %lu",
- iaddr2blockno(sb,
- &data->double_indirect) +
+ befs_error(sb, "%s couldn't read the "
+ "double-indirect block at blockno %lu", __func__,
+ (unsigned long)
+ iaddr2blockno(sb, &data->double_indirect) +
dbl_which_block);
brelse(dbl_indir_block);
return BEFS_ERR;
@@ -489,16 +492,16 @@ befs_find_brun_dblindirect(struct super_block *sb,
which_block = indir_indx / befs_iaddrs_per_block(sb);
if (which_block > indir_run.len) {
befs_error(sb, "The indirect index calculated by "
- "befs_read_brun_dblindirect(), %d, is outside the range "
- "of the indirect block", indir_indx);
+ "%s, %d, is outside the range "
+ "of the indirect block", __func__, indir_indx);
return BEFS_ERR;
}
indir_block =
befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block);
if (indir_block == NULL) {
- befs_error(sb, "befs_read_brun_dblindirect() couldn't read the "
- "indirect block at blockno %lu",
+ befs_error(sb, "%s couldn't read the indirect block "
+ "at blockno %lu", __func__, (unsigned long)
iaddr2blockno(sb, &indir_run) + which_block);
brelse(indir_block);
return BEFS_ERR;
@@ -519,7 +522,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
run->len -= offset;
befs_debug(sb, "Found file block %lu in double_indirect[%d][%d],"
- " double_indirect_leftover = %lu",
+ " double_indirect_leftover = %lu", (unsigned long)
blockno, dblindir_indx, indir_indx, dblindir_leftover);
return BEFS_OK;
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 622e73775c8..4de7cffcd66 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -10,6 +10,7 @@
* debug functions
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#ifdef __KERNEL__
#include <stdarg.h>
@@ -23,43 +24,30 @@
#include "befs.h"
-#define ERRBUFSIZE 1024
-
void
befs_error(const struct super_block *sb, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
- if (err_buf == NULL) {
- printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
- return;
- }
va_start(args, fmt);
- vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_err("(%s): %pV\n", sb->s_id, &vaf);
va_end(args);
-
- printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf);
- kfree(err_buf);
}
void
befs_warning(const struct super_block *sb, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
- if (err_buf == NULL) {
- printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
- return;
- }
va_start(args, fmt);
- vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_warn("(%s): %pV\n", sb->s_id, &vaf);
va_end(args);
-
- printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf);
-
- kfree(err_buf);
}
void
@@ -67,25 +55,13 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
{
#ifdef CONFIG_BEFS_DEBUG
+ struct va_format vaf;
va_list args;
- char *err_buf = NULL;
-
- if (BEFS_SB(sb)->mount_opts.debug) {
- err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
- if (err_buf == NULL) {
- printk(KERN_ERR "could not allocate %d bytes\n",
- ERRBUFSIZE);
- return;
- }
-
- va_start(args, fmt);
- vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
- va_end(args);
-
- printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf);
-
- kfree(err_buf);
- }
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_debug("(%s): %pV\n", sb->s_id, &vaf);
+ va_end(args);
#endif //CONFIG_BEFS_DEBUG
}
@@ -109,9 +85,9 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid));
befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode));
befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags));
- befs_debug(sb, " create_time %Lu",
+ befs_debug(sb, " create_time %llu",
fs64_to_cpu(sb, inode->create_time));
- befs_debug(sb, " last_modified_time %Lu",
+ befs_debug(sb, " last_modified_time %llu",
fs64_to_cpu(sb, inode->last_modified_time));
tmp_run = fsrun_to_cpu(sb, inode->parent);
@@ -137,7 +113,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
tmp_run.allocation_group, tmp_run.start,
tmp_run.len);
}
- befs_debug(sb, " max_direct_range %Lu",
+ befs_debug(sb, " max_direct_range %llu",
fs64_to_cpu(sb,
inode->data.datastream.
max_direct_range));
@@ -147,7 +123,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
tmp_run.allocation_group,
tmp_run.start, tmp_run.len);
- befs_debug(sb, " max_indirect_range %Lu",
+ befs_debug(sb, " max_indirect_range %llu",
fs64_to_cpu(sb,
inode->data.datastream.
max_indirect_range));
@@ -158,12 +134,12 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
tmp_run.allocation_group, tmp_run.start,
tmp_run.len);
- befs_debug(sb, " max_double_indirect_range %Lu",
+ befs_debug(sb, " max_double_indirect_range %llu",
fs64_to_cpu(sb,
inode->data.datastream.
max_double_indirect_range));
- befs_debug(sb, " size %Lu",
+ befs_debug(sb, " size %llu",
fs64_to_cpu(sb, inode->data.datastream.size));
}
@@ -191,8 +167,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size));
befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift));
- befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks));
- befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks));
+ befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks));
+ befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks));
befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2));
befs_debug(sb, " blocks_per_ag %u",
@@ -206,8 +182,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
befs_debug(sb, " log_blocks %u, %hu, %hu",
tmp_run.allocation_group, tmp_run.start, tmp_run.len);
- befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start));
- befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end));
+ befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start));
+ befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end));
befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3));
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index 94c17f9a957..fa4b718de59 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -25,7 +25,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
/* check magic header. */
if (magic1 != BEFS_INODE_MAGIC1) {
befs_error(sb,
- "Inode has a bad magic header - inode = %lu", inode);
+ "Inode has a bad magic header - inode = %lu",
+ (unsigned long)inode);
return BEFS_BAD_INODE;
}
@@ -34,8 +35,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
*/
if (inode != iaddr2blockno(sb, &ino_num)) {
befs_error(sb, "inode blocknr field disagrees with vfs "
- "VFS: %lu, Inode %lu",
- inode, iaddr2blockno(sb, &ino_num));
+ "VFS: %lu, Inode %lu", (unsigned long)
+ inode, (unsigned long)iaddr2blockno(sb, &ino_num));
return BEFS_BAD_INODE;
}
@@ -44,7 +45,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
*/
if (!(flags & BEFS_INODE_IN_USE)) {
- befs_error(sb, "inode is not used - inode = %lu", inode);
+ befs_error(sb, "inode is not used - inode = %lu",
+ (unsigned long)inode);
return BEFS_BAD_INODE;
}
diff --git a/fs/befs/io.c b/fs/befs/io.c
index ddef98aa255..0408a3d601d 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -30,9 +30,9 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
befs_blocknr_t block = 0;
befs_sb_info *befs_sb = BEFS_SB(sb);
- befs_debug(sb, "---> Enter befs_read_iaddr() "
- "[%u, %hu, %hu]",
- iaddr.allocation_group, iaddr.start, iaddr.len);
+ befs_debug(sb, "---> Enter %s "
+ "[%u, %hu, %hu]", __func__, iaddr.allocation_group,
+ iaddr.start, iaddr.len);
if (iaddr.allocation_group > befs_sb->num_ags) {
befs_error(sb, "BEFS: Invalid allocation group %u, max is %u",
@@ -42,20 +42,21 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
block = iaddr2blockno(sb, &iaddr);
- befs_debug(sb, "befs_read_iaddr: offset = %lu", block);
+ befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block);
bh = sb_bread(sb, block);
if (bh == NULL) {
- befs_error(sb, "Failed to read block %lu", block);
+ befs_error(sb, "Failed to read block %lu",
+ (unsigned long)block);
goto error;
}
- befs_debug(sb, "<--- befs_read_iaddr()");
+ befs_debug(sb, "<--- %s", __func__);
return bh;
error:
- befs_debug(sb, "<--- befs_read_iaddr() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return NULL;
}
@@ -64,20 +65,21 @@ befs_bread(struct super_block *sb, befs_blocknr_t block)
{
struct buffer_head *bh = NULL;
- befs_debug(sb, "---> Enter befs_read() %Lu", block);
+ befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
bh = sb_bread(sb, block);
if (bh == NULL) {
- befs_error(sb, "Failed to read block %lu", block);
+ befs_error(sb, "Failed to read block %lu",
+ (unsigned long)block);
goto error;
}
- befs_debug(sb, "<--- befs_read()");
+ befs_debug(sb, "<--- %s", __func__);
return bh;
error:
- befs_debug(sb, "<--- befs_read() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return NULL;
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b1d0c794747..a16fbd4e824 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -5,6 +5,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
@@ -15,6 +17,7 @@
#include <linux/vfs.h>
#include <linux/parser.h>
#include <linux/namei.h>
+#include <linux/sched.h>
#include "befs.h"
#include "btree.h"
@@ -30,18 +33,17 @@ MODULE_LICENSE("GPL");
/* The units the vfs expects inode->i_blocks to be in */
#define VFS_BLOCK_SIZE 512
-static int befs_readdir(struct file *, void *, filldir_t);
+static int befs_readdir(struct file *, struct dir_context *);
static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
static int befs_readpage(struct file *file, struct page *page);
static sector_t befs_bmap(struct address_space *mapping, sector_t block);
-static struct dentry *befs_lookup(struct inode *, struct dentry *, struct nameidata *);
+static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int);
static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_destroy_inode(struct inode *inode);
-static int befs_init_inodecache(void);
static void befs_destroy_inodecache(void);
static void *befs_follow_link(struct dentry *, struct nameidata *);
-static void befs_put_link(struct dentry *, struct nameidata *, void *);
+static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -65,7 +67,7 @@ static struct kmem_cache *befs_inode_cachep;
static const struct file_operations befs_dir_operations = {
.read = generic_read_dir,
- .readdir = befs_readdir,
+ .iterate = befs_readdir,
.llseek = generic_file_llseek,
};
@@ -75,14 +77,18 @@ static const struct inode_operations befs_dir_inode_operations = {
static const struct address_space_operations befs_aops = {
.readpage = befs_readpage,
- .sync_page = block_sync_page,
.bmap = befs_bmap,
};
+static const struct inode_operations befs_fast_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = befs_fast_follow_link,
+};
+
static const struct inode_operations befs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = befs_follow_link,
- .put_link = befs_put_link,
+ .put_link = kfree_put_link,
};
/*
@@ -126,26 +132,20 @@ befs_get_block(struct inode *inode, sector_t block,
ulong disk_off;
befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld",
- inode->i_ino, block);
-
- if (block < 0) {
- befs_error(sb, "befs_get_block() was asked for a block "
- "number less than zero: block %ld in inode %lu",
- block, inode->i_ino);
- return -EIO;
- }
-
+ (unsigned long)inode->i_ino, (long)block);
if (create) {
befs_error(sb, "befs_get_block() was asked to write to "
- "block %ld in inode %lu", block, inode->i_ino);
+ "block %ld in inode %lu", (long)block,
+ (unsigned long)inode->i_ino);
return -EPERM;
}
res = befs_fblock2brun(sb, ds, block, &run);
if (res != BEFS_OK) {
befs_error(sb,
- "<--- befs_get_block() for inode %lu, block "
- "%ld ERROR", inode->i_ino, block);
+ "<--- %s for inode %lu, block %ld ERROR",
+ __func__, (unsigned long)inode->i_ino,
+ (long)block);
return -EFBIG;
}
@@ -153,14 +153,15 @@ befs_get_block(struct inode *inode, sector_t block,
map_bh(bh_result, inode->i_sb, disk_off);
- befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, "
- "disk address %lu", inode->i_ino, block, disk_off);
+ befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu",
+ __func__, (unsigned long)inode->i_ino, (long)block,
+ (unsigned long)disk_off);
return 0;
}
static struct dentry *
-befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct inode *inode = NULL;
struct super_block *sb = dir->i_sb;
@@ -171,15 +172,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
char *utfname;
const char *name = dentry->d_name.name;
- befs_debug(sb, "---> befs_lookup() "
- "name %s inode %ld", dentry->d_name.name, dir->i_ino);
+ befs_debug(sb, "---> %s name %s inode %ld", __func__,
+ dentry->d_name.name, dir->i_ino);
/* Convert to UTF-8 */
if (BEFS_SB(sb)->nls) {
ret =
befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen);
if (ret < 0) {
- befs_debug(sb, "<--- befs_lookup() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return ERR_PTR(ret);
}
ret = befs_btree_find(sb, ds, utfname, &offset);
@@ -190,12 +191,12 @@ befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
}
if (ret == BEFS_BT_NOT_FOUND) {
- befs_debug(sb, "<--- befs_lookup() %s not found",
+ befs_debug(sb, "<--- %s %s not found", __func__,
dentry->d_name.name);
return ERR_PTR(-ENOENT);
} else if (ret != BEFS_OK || offset == 0) {
- befs_warning(sb, "<--- befs_lookup() Error");
+ befs_warning(sb, "<--- %s Error", __func__);
return ERR_PTR(-ENODATA);
}
@@ -205,15 +206,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
d_add(dentry, inode);
- befs_debug(sb, "<--- befs_lookup()");
+ befs_debug(sb, "<--- %s", __func__);
return NULL;
}
static int
-befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+befs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
befs_off_t value;
@@ -221,29 +222,27 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
size_t keysize;
unsigned char d_type;
char keybuf[BEFS_NAME_LEN + 1];
- char *nlsname;
- int nlsnamelen;
- const char *dirname = filp->f_path.dentry->d_name.name;
+ const char *dirname = file->f_path.dentry->d_name.name;
- befs_debug(sb, "---> befs_readdir() "
- "name %s, inode %ld, filp->f_pos %Ld",
- dirname, inode->i_ino, filp->f_pos);
+ befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld",
+ __func__, dirname, inode->i_ino, ctx->pos);
- result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1,
+more:
+ result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
keybuf, &keysize, &value);
if (result == BEFS_ERR) {
- befs_debug(sb, "<--- befs_readdir() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
befs_error(sb, "IO error reading %s (inode %lu)",
dirname, inode->i_ino);
return -EIO;
} else if (result == BEFS_BT_END) {
- befs_debug(sb, "<--- befs_readdir() END");
+ befs_debug(sb, "<--- %s END", __func__);
return 0;
} else if (result == BEFS_BT_EMPTY) {
- befs_debug(sb, "<--- befs_readdir() Empty directory");
+ befs_debug(sb, "<--- %s Empty directory", __func__);
return 0;
}
@@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* Convert to NLS */
if (BEFS_SB(sb)->nls) {
+ char *nlsname;
+ int nlsnamelen;
result =
befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
if (result < 0) {
- befs_debug(sb, "<--- befs_readdir() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return result;
}
- result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos,
- (ino_t) value, d_type);
+ if (!dir_emit(ctx, nlsname, nlsnamelen,
+ (ino_t) value, d_type)) {
+ kfree(nlsname);
+ return 0;
+ }
kfree(nlsname);
-
} else {
- result = filldir(dirent, keybuf, keysize, filp->f_pos,
- (ino_t) value, d_type);
+ if (!dir_emit(ctx, keybuf, keysize,
+ (ino_t) value, d_type))
+ return 0;
}
+ ctx->pos++;
+ goto more;
- filp->f_pos++;
-
- befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos);
+ befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos);
return 0;
}
@@ -287,7 +291,6 @@ befs_alloc_inode(struct super_block *sb)
static void befs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
}
@@ -313,11 +316,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
struct inode *inode;
long ret = -EIO;
- befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino);
+ befs_debug(sb, "---> %s inode = %lu", __func__, ino);
inode = iget_locked(sb, ino);
- if (IS_ERR(inode))
- return inode;
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
@@ -354,11 +357,13 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
*/
inode->i_uid = befs_sb->mount_opts.use_uid ?
- befs_sb->mount_opts.uid : (uid_t) fs32_to_cpu(sb, raw_inode->uid);
+ befs_sb->mount_opts.uid :
+ make_kuid(&init_user_ns, fs32_to_cpu(sb, raw_inode->uid));
inode->i_gid = befs_sb->mount_opts.use_gid ?
- befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid);
+ befs_sb->mount_opts.gid :
+ make_kgid(&init_user_ns, fs32_to_cpu(sb, raw_inode->gid));
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
/*
* BEFS's time is 64 bits, but current VFS is 32 bits...
@@ -383,9 +388,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){
inode->i_size = 0;
inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
- strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
- BEFS_SYMLINK_LEN - 1);
- befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
+ strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
+ BEFS_SYMLINK_LEN);
} else {
int num_blks;
@@ -406,7 +410,10 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &befs_dir_inode_operations;
inode->i_fop = &befs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = &befs_symlink_inode_operations;
+ if (befs_ino->i_flags & BEFS_LONG_SYMLINK)
+ inode->i_op = &befs_symlink_inode_operations;
+ else
+ inode->i_op = &befs_fast_symlink_inode_operations;
} else {
befs_error(sb, "Inode %lu is not a regular file, "
"directory or symlink. THAT IS WRONG! BeFS has no "
@@ -415,7 +422,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
}
brelse(bh);
- befs_debug(sb, "<--- befs_read_inode()");
+ befs_debug(sb, "<--- %s", __func__);
unlock_new_inode(inode);
return inode;
@@ -424,7 +431,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
unacquire_none:
iget_failed(inode);
- befs_debug(sb, "<--- befs_read_inode() - Bad inode");
+ befs_debug(sb, "<--- %s - Bad inode", __func__);
return ERR_PTR(ret);
}
@@ -432,7 +439,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
*
* Taken from NFS implementation by Al Viro.
*/
-static int
+static int __init
befs_init_inodecache(void)
{
befs_inode_cachep = kmem_cache_create("befs_inode_cache",
@@ -441,11 +448,9 @@ befs_init_inodecache(void)
SLAB_MEM_SPREAD),
init_once);
if (befs_inode_cachep == NULL) {
- printk(KERN_ERR "befs_init_inodecache: "
- "Couldn't initialize inode slabcache\n");
+ pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
return -ENOMEM;
}
-
return 0;
}
@@ -456,6 +461,11 @@ befs_init_inodecache(void)
static void
befs_destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(befs_inode_cachep);
}
@@ -467,14 +477,16 @@ befs_destroy_inodecache(void)
static void *
befs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
+ struct super_block *sb = dentry->d_sb;
befs_inode_info *befs_ino = BEFS_I(dentry->d_inode);
+ befs_data_stream *data = &befs_ino->i_data.ds;
+ befs_off_t len = data->size;
char *link;
- if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
- struct super_block *sb = dentry->d_sb;
- befs_data_stream *data = &befs_ino->i_data.ds;
- befs_off_t len = data->size;
-
+ if (len == 0) {
+ befs_error(sb, "Long symlink with illegal length");
+ link = ERR_PTR(-EIO);
+ } else {
befs_debug(sb, "Follow long symlink");
link = kmalloc(len, GFP_NOFS);
@@ -487,22 +499,18 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
} else {
link[len - 1] = '\0';
}
- } else {
- link = befs_ino->i_data.symlink;
}
-
nd_set_link(nd, link);
return NULL;
}
-static void befs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+
+static void *
+befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd)
{
befs_inode_info *befs_ino = BEFS_I(dentry->d_inode);
- if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
- char *link = nd_get_link(nd);
- if (!IS_ERR(link))
- kfree(link);
- }
+ nd_set_link(nd, befs_ino->i_data.symlink);
+ return NULL;
}
/*
@@ -528,16 +536,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
*/
int maxlen = in_len + 1;
- befs_debug(sb, "---> utf2nls()");
+ befs_debug(sb, "---> %s", __func__);
if (!nls) {
- befs_error(sb, "befs_utf2nls called with no NLS table loaded");
+ befs_error(sb, "%s called with no NLS table loaded", __func__);
return -EINVAL;
}
*out = result = kmalloc(maxlen, GFP_NOFS);
if (!*out) {
- befs_error(sb, "befs_utf2nls() cannot allocate memory");
+ befs_error(sb, "%s cannot allocate memory", __func__);
*out_len = 0;
return -ENOMEM;
}
@@ -559,14 +567,14 @@ befs_utf2nls(struct super_block *sb, const char *in,
result[o] = '\0';
*out_len = o;
- befs_debug(sb, "<--- utf2nls()");
+ befs_debug(sb, "<--- %s", __func__);
return o;
conv_err:
befs_error(sb, "Name using character set %s contains a character that "
"cannot be converted to unicode.", nls->charset);
- befs_debug(sb, "<--- utf2nls()");
+ befs_debug(sb, "<--- %s", __func__);
kfree(result);
return -EILSEQ;
}
@@ -574,21 +582,21 @@ befs_utf2nls(struct super_block *sb, const char *in,
/**
 * befs_nls2utf - Convert NLS string to utf8 encoding
* @sb: Superblock
- * @src: Input string buffer in NLS format
- * @srclen: Length of input string in bytes
- * @dest: The output string in UTF-8 format
- * @destlen: Length of the output buffer
+ * @in: Input string buffer in NLS format
+ * @in_len: Length of input string in bytes
+ * @out: The output string in UTF-8 format
+ * @out_len: Length of the output buffer
*
- * Converts input string @src, which is in the format of the loaded NLS map,
+ * Converts input string @in, which is in the format of the loaded NLS map,
* into a utf8 string.
*
- * The destination string @dest is allocated by this function and the caller is
+ * The destination string @out is allocated by this function and the caller is
* responsible for freeing it with kfree()
*
- * On return, *@destlen is the length of @dest in bytes.
+ * On return, *@out_len is the length of @out in bytes.
*
* On success, the return value is the number of utf8 characters written to
- * the output buffer @dest.
+ * the output buffer @out.
*
 * On failure, a negative number corresponding to the error code is returned.
*/
@@ -607,16 +615,17 @@ befs_nls2utf(struct super_block *sb, const char *in,
* in special cases */
int maxlen = (3 * in_len) + 1;
- befs_debug(sb, "---> nls2utf()\n");
+ befs_debug(sb, "---> %s\n", __func__);
if (!nls) {
- befs_error(sb, "befs_nls2utf called with no NLS table loaded.");
+ befs_error(sb, "%s called with no NLS table loaded.",
+ __func__);
return -EINVAL;
}
*out = result = kmalloc(maxlen, GFP_NOFS);
if (!*out) {
- befs_error(sb, "befs_nls2utf() cannot allocate memory");
+ befs_error(sb, "%s cannot allocate memory", __func__);
*out_len = 0;
return -ENOMEM;
}
@@ -637,14 +646,14 @@ befs_nls2utf(struct super_block *sb, const char *in,
result[o] = '\0';
*out_len = o;
- befs_debug(sb, "<--- nls2utf()");
+ befs_debug(sb, "<--- %s", __func__);
return i;
conv_err:
befs_error(sb, "Name using charecter set %s contains a charecter that "
"cannot be converted to unicode.", nls->charset);
- befs_debug(sb, "<--- nls2utf()");
+ befs_debug(sb, "<--- %s", __func__);
kfree(result);
return -EILSEQ;
}
@@ -671,10 +680,12 @@ parse_options(char *options, befs_mount_options * opts)
char *p;
substring_t args[MAX_OPT_ARGS];
int option;
+ kuid_t uid;
+ kgid_t gid;
/* Initialize options */
- opts->uid = 0;
- opts->gid = 0;
+ opts->uid = GLOBAL_ROOT_UID;
+ opts->gid = GLOBAL_ROOT_GID;
opts->use_uid = 0;
opts->use_gid = 0;
opts->iocharset = NULL;
@@ -693,31 +704,37 @@ parse_options(char *options, befs_mount_options * opts)
case Opt_uid:
if (match_int(&args[0], &option))
return 0;
- if (option < 0) {
- printk(KERN_ERR "BeFS: Invalid uid %d, "
- "using default\n", option);
+ uid = INVALID_UID;
+ if (option >= 0)
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid)) {
+ pr_err("Invalid uid %d, "
+ "using default\n", option);
break;
}
- opts->uid = option;
+ opts->uid = uid;
opts->use_uid = 1;
break;
case Opt_gid:
if (match_int(&args[0], &option))
return 0;
- if (option < 0) {
- printk(KERN_ERR "BeFS: Invalid gid %d, "
- "using default\n", option);
+ gid = INVALID_GID;
+ if (option >= 0)
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid)) {
+ pr_err("Invalid gid %d, "
+ "using default\n", option);
break;
}
- opts->gid = option;
+ opts->gid = gid;
opts->use_gid = 1;
break;
case Opt_charset:
kfree(opts->iocharset);
opts->iocharset = match_strdup(&args[0]);
if (!opts->iocharset) {
- printk(KERN_ERR "BeFS: allocation failure for "
- "iocharset string\n");
+ pr_err("allocation failure for "
+ "iocharset string\n");
return 0;
}
break;
@@ -725,8 +742,8 @@ parse_options(char *options, befs_mount_options * opts)
opts->debug = 1;
break;
default:
- printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ pr_err("Unrecognized mount option \"%s\" "
+ "or missing value\n", p);
return 0;
}
}
@@ -735,7 +752,7 @@ parse_options(char *options, befs_mount_options * opts)
/* This function has the responsibility of getting the
* filesystem ready for unmounting.
- * Basicly, we free everything that we allocated in
+ * Basically, we free everything that we allocated in
* befs_read_inode
*/
static void
@@ -767,22 +784,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
save_mount_options(sb, data);
- sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL);
+ sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
if (sb->s_fs_info == NULL) {
- printk(KERN_ERR
- "BeFS(%s): Unable to allocate memory for private "
+ pr_err("(%s): Unable to allocate memory for private "
"portion of superblock. Bailing.\n", sb->s_id);
goto unacquire_none;
}
befs_sb = BEFS_SB(sb);
- memset(befs_sb, 0, sizeof(befs_sb_info));
if (!parse_options((char *) data, &befs_sb->mount_opts)) {
befs_error(sb, "cannot parse mount options");
goto unacquire_priv_sbp;
}
- befs_debug(sb, "---> befs_fill_super()");
+ befs_debug(sb, "---> %s", __func__);
#ifndef CONFIG_BEFS_RW
if (!(sb->s_flags & MS_RDONLY)) {
@@ -830,7 +845,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
goto unacquire_priv_sbp;
if( befs_sb->num_blocks > ~((sector_t)0) ) {
- befs_error(sb, "blocks count: %Lu "
+ befs_error(sb, "blocks count: %llu "
"is larger than the host can use",
befs_sb->num_blocks);
goto unacquire_priv_sbp;
@@ -849,9 +864,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
ret = PTR_ERR(root);
goto unacquire_priv_sbp;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
- iput(root);
befs_error(sb, "get root inode failed");
goto unacquire_priv_sbp;
}
@@ -890,6 +904,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
static int
befs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
if (!(*flags & MS_RDONLY))
return -EINVAL;
return 0;
@@ -901,7 +916,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
- befs_debug(sb, "---> befs_statfs()");
+ befs_debug(sb, "---> %s", __func__);
buf->f_type = BEFS_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
@@ -914,7 +929,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = BEFS_NAME_LEN;
- befs_debug(sb, "<--- befs_statfs()");
+ befs_debug(sb, "<--- %s", __func__);
return 0;
}
@@ -933,13 +948,14 @@ static struct file_system_type befs_fs_type = {
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("befs");
static int __init
init_befs_fs(void)
{
int err;
- printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION);
+ pr_info("version: %s\n", BEFS_VERSION);
err = befs_init_inodecache();
if (err)
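parse_options() above now keeps kuid_t/kgid_t values and validates each numeric option with make_kuid()/uid_valid() against the mounting user's namespace, falling back to the default when the id does not map. A condensed sketch of just that conversion step; the surrounding match_int()/switch plumbing is assumed, and demo_parse_uid is a made-up name:

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/uidgid.h>

/* Convert a numeric uid= mount option (as returned by match_int())
 * into a kuid_t, rejecting values that do not map into the mounting
 * user's namespace. */
static int demo_parse_uid(int option, kuid_t *out)
{
	kuid_t uid = INVALID_UID;

	if (option >= 0)
		uid = make_kuid(current_user_ns(), option);
	if (!uid_valid(uid))
		return -EINVAL;		/* caller keeps the default */

	*out = uid;
	return 0;
}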
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index c2336c62024..3728a6479c6 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -1,6 +1,6 @@
config BFS_FS
- tristate "BFS file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "BFS file system support"
+ depends on BLOCK
help
Boot File System (BFS) is a file system used under SCO UnixWare to
allow the bootloader access to the kernel image and other important
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd904..a399e6d9dc7 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -26,66 +26,59 @@ static struct buffer_head *bfs_find_entry(struct inode *dir,
const unsigned char *name, int namelen,
struct bfs_dirent **res_dir);
-static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, struct dir_context *ctx)
{
- struct inode *dir = f->f_path.dentry->d_inode;
+ struct inode *dir = file_inode(f);
struct buffer_head *bh;
struct bfs_dirent *de;
- struct bfs_sb_info *info = BFS_SB(dir->i_sb);
unsigned int offset;
int block;
- mutex_lock(&info->bfs_lock);
-
- if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
+ if (ctx->pos & (BFS_DIRENT_SIZE - 1)) {
printf("Bad f_pos=%08lx for %s:%08lx\n",
- (unsigned long)f->f_pos,
+ (unsigned long)ctx->pos,
dir->i_sb->s_id, dir->i_ino);
- mutex_unlock(&info->bfs_lock);
- return -EBADF;
+ return -EINVAL;
}
- while (f->f_pos < dir->i_size) {
- offset = f->f_pos & (BFS_BSIZE - 1);
- block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
+ while (ctx->pos < dir->i_size) {
+ offset = ctx->pos & (BFS_BSIZE - 1);
+ block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS);
bh = sb_bread(dir->i_sb, block);
if (!bh) {
- f->f_pos += BFS_BSIZE - offset;
+ ctx->pos += BFS_BSIZE - offset;
continue;
}
do {
de = (struct bfs_dirent *)(bh->b_data + offset);
if (de->ino) {
int size = strnlen(de->name, BFS_NAMELEN);
- if (filldir(dirent, de->name, size, f->f_pos,
+ if (!dir_emit(ctx, de->name, size,
le16_to_cpu(de->ino),
- DT_UNKNOWN) < 0) {
+ DT_UNKNOWN)) {
brelse(bh);
- mutex_unlock(&info->bfs_lock);
return 0;
}
}
offset += BFS_DIRENT_SIZE;
- f->f_pos += BFS_DIRENT_SIZE;
- } while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
+ ctx->pos += BFS_DIRENT_SIZE;
+ } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size));
brelse(bh);
}
-
- mutex_unlock(&info->bfs_lock);
- return 0;
+ return 0;
}
const struct file_operations bfs_dir_operations = {
.read = generic_read_dir,
- .readdir = bfs_readdir,
+ .iterate = bfs_readdir,
.fsync = generic_file_fsync,
.llseek = generic_file_llseek,
};
extern void dump_imap(const char *, struct super_block *);
-static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
+static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
int err;
struct inode *inode;
@@ -133,7 +126,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
}
static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct inode *inode = NULL;
struct buffer_head *bh;
@@ -199,7 +192,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
printf("unlinking non-existent file %s:%lu (nlink=%d)\n",
inode->i_sb->s_id, inode->i_ino,
inode->i_nlink);
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
}
de->ino = 0;
mark_buffer_dirty_inode(bh, dir);
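
The bfs_readdir() hunk above is the conversion from the old filldir callback to the dir_context based ->iterate interface: the directory position now lives in ctx->pos and dir_emit() returns false once the caller's buffer is full. A minimal sketch of the pattern for a hypothetical filesystem (not bfs), with a single placeholder entry:

static int example_iterate(struct file *file, struct dir_context *ctx)
{
        struct inode *dir = file_inode(file);

        while (ctx->pos < dir->i_size) {
                /* look up the on-disk entry at ctx->pos here ... */
                if (!dir_emit(ctx, "entry", 5, 2 /* ino */, DT_UNKNOWN))
                        return 0;       /* buffer full, resume later */
                ctx->pos += 1;          /* advance by one entry */
        }
        return 0;
}
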
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index eb67edd0f8e..e7f88ace1a2 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -23,10 +23,10 @@
const struct file_operations bfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
};
@@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, bfs_get_block);
}
+static void bfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, inode->i_size);
+}
+
static int bfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
bfs_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ bfs_write_failed(mapping, pos + len);
return ret;
}
@@ -186,7 +191,6 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations bfs_aops = {
.readpage = bfs_readpage,
.writepage = bfs_writepage,
- .sync_page = block_sync_page,
.write_begin = bfs_write_begin,
.write_end = generic_write_end,
.bmap = bfs_bmap,
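
The bfs_write_failed() helper added above trims pagecache instantiated past the old i_size when block_write_begin() fails, replacing the removed vmtruncate() call. A minimal sketch of the same pattern with hypothetical names (example_get_block stands in for the filesystem's get_block callback):

static int example_get_block(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);

static int example_write_begin(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len, unsigned flags,
                               struct page **pagep, void **fsdata)
{
        int ret;

        ret = block_write_begin(mapping, pos, len, flags, pagep,
                                example_get_block);
        if (unlikely(ret)) {
                struct inode *inode = mapping->host;

                /* drop pages beyond EOF left behind by the failed write */
                if (pos + len > inode->i_size)
                        truncate_pagecache(inode, inode->i_size);
        }
        return ret;
}
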
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index a8e37f81d09..7041ac35ace 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -40,7 +40,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
int block, off;
inode = iget_locked(sb, ino);
- if (IS_ERR(inode))
+ if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
@@ -76,9 +76,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock);
BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock);
BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino);
- inode->i_uid = le32_to_cpu(di->i_uid);
- inode->i_gid = le32_to_cpu(di->i_gid);
- inode->i_nlink = le32_to_cpu(di->i_nlink);
+ i_uid_write(inode, le32_to_cpu(di->i_uid));
+ i_gid_write(inode, le32_to_cpu(di->i_gid));
+ set_nlink(inode, le32_to_cpu(di->i_nlink));
inode->i_size = BFS_FILESIZE(di);
inode->i_blocks = BFS_FILEBLOCKS(di);
inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
@@ -139,8 +139,8 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_ino = cpu_to_le16(ino);
di->i_mode = cpu_to_le32(inode->i_mode);
- di->i_uid = cpu_to_le32(inode->i_uid);
- di->i_gid = cpu_to_le32(inode->i_gid);
+ di->i_uid = cpu_to_le32(i_uid_read(inode));
+ di->i_gid = cpu_to_le32(i_gid_read(inode));
di->i_nlink = cpu_to_le32(inode->i_nlink);
di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -172,9 +172,9 @@ static void bfs_evict_inode(struct inode *inode)
dprintf("ino=%08lx\n", ino);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode);
- end_writeback(inode);
+ clear_inode(inode);
if (inode->i_nlink)
return;
@@ -251,7 +251,6 @@ static struct inode *bfs_alloc_inode(struct super_block *sb)
static void bfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
}
@@ -267,7 +266,7 @@ static void init_once(void *foo)
inode_init_once(&bi->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
sizeof(struct bfs_inode_info),
@@ -281,6 +280,11 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(bfs_inode_cachep);
}
@@ -368,9 +372,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
ret = PTR_ERR(inode);
goto out2;
}
- s->s_root = d_alloc_root(inode);
+ s->s_root = d_make_root(inode);
if (!s->s_root) {
- iput(inode);
ret = -ENOMEM;
goto out2;
}
@@ -470,6 +473,7 @@ static struct file_system_type bfs_fs_type = {
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("bfs");
static int __init init_bfs_fs(void)
{
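
The bfs_iget()/bfs_write_inode() hunks above are part of the user-namespace conversion: raw on-disk ids are loaded through i_uid_write()/i_gid_write() and stored back via i_uid_read()/i_gid_read(), and the link count is set with set_nlink() rather than assigning i_nlink directly. A minimal sketch with a hypothetical on-disk inode layout (not the bfs format):

struct example_dinode {                 /* hypothetical on-disk layout */
        __le32 d_uid;
        __le32 d_gid;
        __le32 d_nlink;
};

static void example_load_ids(struct inode *inode,
                             const struct example_dinode *di)
{
        i_uid_write(inode, le32_to_cpu(di->d_uid));
        i_gid_write(inode, le32_to_cpu(di->d_gid));
        set_nlink(inode, le32_to_cpu(di->d_nlink));
}
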
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index a6395bdb26a..ca0ba15a730 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -26,40 +26,14 @@
#include <linux/coredump.h>
#include <linux/slab.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/a.out-core.h>
-static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file*);
-static int aout_core_dump(struct coredump_params *cprm);
-
-static struct linux_binfmt aout_format = {
- .module = THIS_MODULE,
- .load_binary = load_aout_binary,
- .load_shlib = load_aout_library,
- .core_dump = aout_core_dump,
- .min_coredump = PAGE_SIZE
-};
-
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
-
-static int set_brk(unsigned long start, unsigned long end)
-{
- start = PAGE_ALIGN(start);
- end = PAGE_ALIGN(end);
- if (end > start) {
- unsigned long addr;
- down_write(&current->mm->mmap_sem);
- addr = do_brk(start, end - start);
- up_write(&current->mm->mmap_sem);
- if (BAD_ADDR(addr))
- return addr;
- }
- return 0;
-}
+#ifdef CONFIG_COREDUMP
/*
* Routine writes a core dump image in the current directory.
* Currently only a stub-function.
@@ -69,10 +43,8 @@ static int set_brk(unsigned long start, unsigned long end)
* field, which also makes sure the core-dumps won't be recursive if the
* dumping of the process results in another error..
*/
-
static int aout_core_dump(struct coredump_params *cprm)
{
- struct file *file = cprm->file;
mm_segment_t fs;
int has_dumped = 0;
void __user *dump_start;
@@ -89,10 +61,9 @@ static int aout_core_dump(struct coredump_params *cprm)
fs = get_fs();
set_fs(KERNEL_DS);
has_dumped = 1;
- current->flags |= PF_DUMPCORE;
strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
dump.u_ar0 = offsetof(struct user, regs);
- dump.signal = cprm->signr;
+ dump.signal = cprm->siginfo->si_signo;
aout_dump_thread(cprm->regs, &dump);
/* If the size of the dump file exceeds the rlimit, then see what would happen
@@ -113,10 +84,10 @@ static int aout_core_dump(struct coredump_params *cprm)
set_fs(KERNEL_DS);
/* struct user */
- if (!dump_write(file, &dump, sizeof(dump)))
+ if (!dump_emit(cprm, &dump, sizeof(dump)))
goto end_coredump;
/* Now dump all of the user data. Include malloced stuff as well */
- if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
+ if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
goto end_coredump;
/* now we start writing out the user space info */
set_fs(USER_DS);
@@ -124,20 +95,46 @@ static int aout_core_dump(struct coredump_params *cprm)
if (dump.u_dsize != 0) {
dump_start = START_DATA(dump);
dump_size = dump.u_dsize << PAGE_SHIFT;
- if (!dump_write(file, dump_start, dump_size))
+ if (!dump_emit(cprm, dump_start, dump_size))
goto end_coredump;
}
/* Now prepare to dump the stack area */
if (dump.u_ssize != 0) {
dump_start = START_STACK(dump);
dump_size = dump.u_ssize << PAGE_SHIFT;
- if (!dump_write(file, dump_start, dump_size))
+ if (!dump_emit(cprm, dump_start, dump_size))
goto end_coredump;
}
end_coredump:
set_fs(fs);
return has_dumped;
}
+#else
+#define aout_core_dump NULL
+#endif
+
+static struct linux_binfmt aout_format = {
+ .module = THIS_MODULE,
+ .load_binary = load_aout_binary,
+ .load_shlib = load_aout_library,
+ .core_dump = aout_core_dump,
+ .min_coredump = PAGE_SIZE
+};
+
+#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+
+static int set_brk(unsigned long start, unsigned long end)
+{
+ start = PAGE_ALIGN(start);
+ end = PAGE_ALIGN(end);
+ if (end > start) {
+ unsigned long addr;
+ addr = vm_brk(start, end - start);
+ if (BAD_ADDR(addr))
+ return addr;
+ }
+ return 0;
+}
/*
* create_aout_tables() parses the env- and arg-strings in new user
@@ -202,8 +199,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
* libraries. There is no binary dependent code anywhere else.
*/
-static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_aout_binary(struct linux_binprm * bprm)
{
+ struct pt_regs *regs = current_pt_regs();
struct exec ex;
unsigned long error;
unsigned long fd_offset;
@@ -214,7 +212,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
N_TRSIZE(ex) || N_DRSIZE(ex) ||
- i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+ i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
return -ENOEXEC;
}
@@ -222,7 +220,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
* Requires a mmap handler. This prevents people from using a.out
* as part of an exploit attack against /proc-related vulnerabilities.
*/
- if (!bprm->file->f_op || !bprm->file->f_op->mmap)
+ if (!bprm->file->f_op->mmap)
return -ENOEXEC;
fd_offset = N_TXTOFF(ex);
@@ -256,11 +254,15 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
(current->mm->start_data = N_DATADDR(ex));
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
+
+ retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
+ if (retval < 0) {
+ /* Someone check-me: is this error path enough? */
+ send_sig(SIGKILL, current, 0);
+ return retval;
+ }
install_exec_creds(bprm);
- current->flags &= ~PF_FORKNOEXEC;
if (N_MAGIC(ex) == OMAGIC) {
unsigned long text_addr, map_size;
@@ -275,23 +277,18 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
pos = 32;
map_size = ex.a_text+ex.a_data;
#endif
- down_write(&current->mm->mmap_sem);
- error = do_brk(text_addr & PAGE_MASK, map_size);
- up_write(&current->mm->mmap_sem);
+ error = vm_brk(text_addr & PAGE_MASK, map_size);
if (error != (text_addr & PAGE_MASK)) {
send_sig(SIGKILL, current, 0);
return error;
}
- error = bprm->file->f_op->read(bprm->file,
- (char __user *)text_addr,
- ex.a_text+ex.a_data, &pos);
+ error = read_code(bprm->file, text_addr, pos,
+ ex.a_text+ex.a_data);
if ((signed long)error < 0) {
send_sig(SIGKILL, current, 0);
return error;
}
-
- flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
} else {
if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
(N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
@@ -307,37 +304,26 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
}
if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
- loff_t pos = fd_offset;
- down_write(&current->mm->mmap_sem);
- do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
- up_write(&current->mm->mmap_sem);
- bprm->file->f_op->read(bprm->file,
- (char __user *)N_TXTADDR(ex),
- ex.a_text+ex.a_data, &pos);
- flush_icache_range((unsigned long) N_TXTADDR(ex),
- (unsigned long) N_TXTADDR(ex) +
- ex.a_text+ex.a_data);
+ vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+ read_code(bprm->file, N_TXTADDR(ex), fd_offset,
+ ex.a_text + ex.a_data);
goto beyond_if;
}
- down_write(&current->mm->mmap_sem);
- error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
+ error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
fd_offset);
- up_write(&current->mm->mmap_sem);
if (error != N_TXTADDR(ex)) {
send_sig(SIGKILL, current, 0);
return error;
}
- down_write(&current->mm->mmap_sem);
- error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
+ error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
fd_offset + ex.a_text);
- up_write(&current->mm->mmap_sem);
if (error != N_DATADDR(ex)) {
send_sig(SIGKILL, current, 0);
return error;
@@ -352,13 +338,6 @@ beyond_if:
return retval;
}
- retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0) {
- /* Someone check-me: is this error path enough? */
- send_sig(SIGKILL, current, 0);
- return retval;
- }
-
current->mm->start_stack =
(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
#ifdef __alpha__
@@ -376,7 +355,7 @@ static int load_aout_library(struct file *file)
int retval;
struct exec ex;
- inode = file->f_path.dentry->d_inode;
+ inode = file_inode(file);
retval = -ENOEXEC;
error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
@@ -394,7 +373,7 @@ static int load_aout_library(struct file *file)
* Requires a mmap handler. This prevents people from using a.out
* as part of an exploit attack against /proc-related vulnerabilities.
*/
- if (!file->f_op || !file->f_op->mmap)
+ if (!file->f_op->mmap)
goto out;
if (N_FLAGS(ex))
@@ -406,33 +385,24 @@ static int load_aout_library(struct file *file)
start_addr = ex.a_entry & 0xfffff000;
if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
- loff_t pos = N_TXTOFF(ex);
-
if (printk_ratelimit())
{
printk(KERN_WARNING
"N_TXTOFF is not page aligned. Please convert library: %s\n",
file->f_path.dentry->d_name.name);
}
- down_write(&current->mm->mmap_sem);
- do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
- up_write(&current->mm->mmap_sem);
+ vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
- file->f_op->read(file, (char __user *)start_addr,
- ex.a_text + ex.a_data, &pos);
- flush_icache_range((unsigned long) start_addr,
- (unsigned long) start_addr + ex.a_text + ex.a_data);
-
+ read_code(file, start_addr, N_TXTOFF(ex),
+ ex.a_text + ex.a_data);
retval = 0;
goto out;
}
/* Now use mmap to map the library into memory. */
- down_write(&current->mm->mmap_sem);
- error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
+ error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
N_TXTOFF(ex));
- up_write(&current->mm->mmap_sem);
retval = error;
if (error != start_addr)
goto out;
@@ -440,9 +410,7 @@ static int load_aout_library(struct file *file)
len = PAGE_ALIGN(ex.a_text + ex.a_data);
bss = ex.a_text + ex.a_data + ex.a_bss;
if (bss > len) {
- down_write(&current->mm->mmap_sem);
- error = do_brk(start_addr + len, bss - len);
- up_write(&current->mm->mmap_sem);
+ error = vm_brk(start_addr + len, bss - len);
retval = error;
if (error != start_addr + len)
goto out;
@@ -454,7 +422,8 @@ out:
static int __init init_aout_binfmt(void)
{
- return register_binfmt(&aout_format);
+ register_binfmt(&aout_format);
+ return 0;
}
static void __exit exit_aout_binfmt(void)
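
Most of the a.out loader churn above is the switch from do_mmap()/do_brk() wrapped in down_write(&current->mm->mmap_sem) to vm_mmap()/vm_brk(), which take mmap_sem internally, plus read_code() replacing the open-coded f_op->read() and flush_icache_range() pair. A minimal sketch of the mapping call with placeholder arguments:

static unsigned long example_map_text(struct file *file, unsigned long addr,
                                      unsigned long len, unsigned long off)
{
        /* vm_mmap() handles the mmap_sem locking itself */
        return vm_mmap(file, addr, len,
                       PROT_READ | PROT_EXEC,
                       MAP_FIXED | MAP_PRIVATE, off);
}
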
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f34078d702d..3892c1a2324 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -27,20 +27,34 @@
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/utsname.h>
#include <linux/coredump.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
-static int load_elf_library(struct file *);
+#ifndef user_long_t
+#define user_long_t long
+#endif
+#ifndef user_siginfo_t
+#define user_siginfo_t siginfo_t
+#endif
+
+static int load_elf_binary(struct linux_binprm *bprm);
static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
int, int, unsigned long);
+#ifdef CONFIG_USELIB
+static int load_elf_library(struct file *);
+#else
+#define load_elf_library NULL
+#endif
+
/*
* If we don't support core dumping, then supply a NULL so we
* don't even try.
@@ -81,9 +95,7 @@ static int set_brk(unsigned long start, unsigned long end)
end = ELF_PAGEALIGN(end);
if (end > start) {
unsigned long addr;
- down_write(&current->mm->mmap_sem);
- addr = do_brk(start, end - start);
- up_write(&current->mm->mmap_sem);
+ addr = vm_brk(start, end - start);
if (BAD_ADDR(addr))
return addr;
}
@@ -227,12 +239,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
NEW_AUX_ENT(AT_BASE, interp_load_addr);
NEW_AUX_ENT(AT_FLAGS, 0);
NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
- NEW_AUX_ENT(AT_UID, cred->uid);
- NEW_AUX_ENT(AT_EUID, cred->euid);
- NEW_AUX_ENT(AT_GID, cred->gid);
- NEW_AUX_ENT(AT_EGID, cred->egid);
+ NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid));
+ NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
+ NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid));
+ NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid));
NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
+#ifdef ELF_HWCAP2
+ NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+#endif
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
NEW_AUX_ENT(AT_PLATFORM,
@@ -315,6 +330,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return 0;
}
+#ifndef elf_map
+
static unsigned long elf_map(struct file *filep, unsigned long addr,
struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
@@ -330,7 +347,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
if (!size)
return addr;
- down_write(&current->mm->mmap_sem);
/*
* total_size is the size of the ELF (interpreter) image.
* The _first_ mmap needs to know the full size, otherwise
@@ -341,16 +357,17 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
*/
if (total_size) {
total_size = ELF_PAGEALIGN(total_size);
- map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+ map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
if (!BAD_ADDR(map_addr))
- do_munmap(current->mm, map_addr+size, total_size-size);
+ vm_munmap(map_addr+size, total_size-size);
} else
- map_addr = do_mmap(filep, addr, size, prot, type, off);
+ map_addr = vm_mmap(filep, addr, size, prot, type, off);
- up_write(&current->mm->mmap_sem);
return(map_addr);
}
+#endif /* !elf_map */
+
static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -394,7 +411,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
goto out;
if (!elf_check_arch(interp_elf_ex))
goto out;
- if (!interpreter->f_op || !interpreter->f_op->mmap)
+ if (!interpreter->f_op->mmap)
goto out;
/*
@@ -513,9 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
/* Map the last of the bss segment */
- down_write(&current->mm->mmap_sem);
- error = do_brk(elf_bss, last_bss - elf_bss);
- up_write(&current->mm->mmap_sem);
+ error = vm_brk(elf_bss, last_bss - elf_bss);
if (BAD_ADDR(error))
goto out_close;
}
@@ -533,9 +548,6 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
-#define INTERPRETER_NONE 0
-#define INTERPRETER_ELF 2
-
#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
#endif
@@ -556,7 +568,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
#endif
}
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
unsigned long load_addr = 0, load_bias = 0;
@@ -572,7 +584,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
unsigned long start_code, end_code, start_data, end_data;
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
- unsigned long def_flags = 0;
+ struct pt_regs *regs = current_pt_regs();
struct {
struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
@@ -596,7 +608,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
goto out;
if (!elf_check_arch(&loc->elf_ex))
goto out;
- if (!bprm->file->f_op || !bprm->file->f_op->mmap)
+ if (!bprm->file->f_op->mmap)
goto out;
/* Now read in all of the header information */
@@ -668,8 +680,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
* mm->dumpable = 0 regardless of the interpreter's
* permissions.
*/
- if (file_permission(interpreter, MAY_READ) < 0)
- bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+ would_dump(bprm, interpreter);
retval = kernel_read(interpreter, 0, bprm->buf,
BINPRM_BUF_SIZE);
@@ -712,10 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (retval)
goto out_free_dentry;
- /* OK, This is the point of no return */
- current->flags &= ~PF_FORKNOEXEC;
- current->mm->def_flags = def_flags;
-
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality. */
SET_PERSONALITY(loc->elf_ex);
@@ -729,8 +736,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- current->mm->free_area_cache = current->mm->mmap_base;
- current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval < 0) {
@@ -795,8 +800,18 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
* default mmap base, as well as whatever program they
* might try to exec. This is because the brk will
* follow the loader, and is not movable. */
-#if defined(CONFIG_X86) || defined(CONFIG_ARM)
- load_bias = 0;
+#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
+ /* Memory randomization might have been switched off
+ * in runtime via sysctl or explicit setting of
+ * personality flags.
+ * If that is the case, retain the original non-zero
+ * load_bias value in order to establish proper
+ * non-randomized mappings.
+ */
+ if (current->flags & PF_RANDOMIZE)
+ load_bias = 0;
+ else
+ load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#else
load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#endif
@@ -879,7 +894,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
}
if (elf_interpreter) {
- unsigned long uninitialized_var(interp_map_addr);
+ unsigned long interp_map_addr = 0;
elf_entry = load_elf_interp(&loc->interp_elf_ex,
interpreter,
@@ -926,7 +941,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
install_exec_creds(bprm);
- current->flags &= ~PF_FORKNOEXEC;
retval = create_elf_tables(bprm, &loc->elf_ex,
load_addr, interp_load_addr);
if (retval < 0) {
@@ -941,9 +955,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
current->mm->start_stack = bprm->p;
#ifdef arch_randomize_brk
- if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1))
+ if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
current->mm->brk = current->mm->start_brk =
arch_randomize_brk(current->mm);
+#ifdef CONFIG_COMPAT_BRK
+ current->brk_randomized = 1;
+#endif
+ }
#endif
if (current->personality & MMAP_PAGE_ZERO) {
@@ -951,10 +969,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
and some applications "depend" upon this behavior.
Since we do not have the power to recompile these, we
emulate the SVr4 behavior. Sigh. */
- down_write(&current->mm->mmap_sem);
- error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
+ error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE, 0);
- up_write(&current->mm->mmap_sem);
}
#ifdef ELF_PLAT_INIT
@@ -990,6 +1006,7 @@ out_free_ph:
goto out;
}
+#ifdef CONFIG_USELIB
/* This is really simpleminded and specialized - we are loading an
a.out library that is given an ELF header. */
static int load_elf_library(struct file *file)
@@ -1010,7 +1027,7 @@ static int load_elf_library(struct file *file)
/* First of all, some simple consistency checks */
if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
- !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
+ !elf_check_arch(&elf_ex) || !file->f_op->mmap)
goto out;
/* Now read in all of the header information */
@@ -1039,8 +1056,7 @@ static int load_elf_library(struct file *file)
eppnt++;
/* Now use mmap to map the library into memory. */
- down_write(&current->mm->mmap_sem);
- error = do_mmap(file,
+ error = vm_mmap(file,
ELF_PAGESTART(eppnt->p_vaddr),
(eppnt->p_filesz +
ELF_PAGEOFFSET(eppnt->p_vaddr)),
@@ -1048,7 +1064,6 @@ static int load_elf_library(struct file *file)
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
(eppnt->p_offset -
ELF_PAGEOFFSET(eppnt->p_vaddr)));
- up_write(&current->mm->mmap_sem);
if (error != ELF_PAGESTART(eppnt->p_vaddr))
goto out_free_ph;
@@ -1061,11 +1076,8 @@ static int load_elf_library(struct file *file)
len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
ELF_MIN_ALIGN - 1);
bss = eppnt->p_memsz + eppnt->p_vaddr;
- if (bss > len) {
- down_write(&current->mm->mmap_sem);
- do_brk(len, bss - len);
- up_write(&current->mm->mmap_sem);
- }
+ if (bss > len)
+ vm_brk(len, bss - len);
error = 0;
out_free_ph:
@@ -1073,6 +1085,7 @@ out_free_ph:
out:
return error;
}
+#endif /* #ifdef CONFIG_USELIB */
#ifdef CONFIG_ELF_CORE
/*
@@ -1083,6 +1096,37 @@ out:
*/
/*
+ * The purpose of always_dump_vma() is to make sure that special kernel mappings
+ * that are useful for post-mortem analysis are included in every core dump.
+ * In that way we ensure that the core dump is fully interpretable later
+ * without matching up the same kernel and hardware config to see what PC values
+ * meant. These special mappings include - vDSO, vsyscall, and other
+ * architecture specific mappings
+ */
+static bool always_dump_vma(struct vm_area_struct *vma)
+{
+ /* Any vsyscall mappings? */
+ if (vma == get_gate_vma(vma->vm_mm))
+ return true;
+
+ /*
+ * Assume that all vmas with a .name op should always be dumped.
+ * If this changes, a new vm_ops field can easily be added.
+ */
+ if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
+ return true;
+
+ /*
+ * arch_vma_name() returns non-NULL for special architecture mappings,
+ * such as vDSO sections.
+ */
+ if (arch_vma_name(vma))
+ return true;
+
+ return false;
+}
+
+/*
* Decide what to dump of a segment, part, all or none.
*/
static unsigned long vma_dump_size(struct vm_area_struct *vma,
@@ -1090,25 +1134,29 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
{
#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
- /* The vma can be set up to tell us the answer directly. */
- if (vma->vm_flags & VM_ALWAYSDUMP)
+ /* always dump the vdso and vsyscall sections */
+ if (always_dump_vma(vma))
goto whole;
+ if (vma->vm_flags & VM_DONTDUMP)
+ return 0;
+
/* Hugetlb memory check */
if (vma->vm_flags & VM_HUGETLB) {
if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
goto whole;
if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
goto whole;
+ return 0;
}
/* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & (VM_IO | VM_RESERVED))
+ if (vma->vm_flags & VM_IO)
return 0;
/* By default, dump shared memory if mapped from an anonymous file. */
if (vma->vm_flags & VM_SHARED) {
- if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
+ if (file_inode(vma->vm_file)->i_nlink == 0 ?
FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
goto whole;
return 0;
@@ -1185,38 +1233,20 @@ static int notesize(struct memelfnote *en)
return sz;
}
-#define DUMP_WRITE(addr, nr, foffset) \
- do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
-
-static int alignfile(struct file *file, loff_t *foffset)
-{
- static const char buf[4] = { 0, };
- DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
- return 1;
-}
-
-static int writenote(struct memelfnote *men, struct file *file,
- loff_t *foffset)
+static int writenote(struct memelfnote *men, struct coredump_params *cprm)
{
struct elf_note en;
en.n_namesz = strlen(men->name) + 1;
en.n_descsz = men->datasz;
en.n_type = men->type;
- DUMP_WRITE(&en, sizeof(en), foffset);
- DUMP_WRITE(men->name, en.n_namesz, foffset);
- if (!alignfile(file, foffset))
- return 0;
- DUMP_WRITE(men->data, men->datasz, foffset);
- if (!alignfile(file, foffset))
- return 0;
-
- return 1;
+ return dump_emit(cprm, &en, sizeof(en)) &&
+ dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) &&
+ dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
}
-#undef DUMP_WRITE
static void fill_elf_header(struct elfhdr *elf, int segs,
- u16 machine, u32 flags, u8 osabi)
+ u16 machine, u32 flags)
{
memset(elf, 0, sizeof(*elf));
@@ -1288,8 +1318,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
- cputime_to_timeval(p->utime, &prstatus->pr_utime);
- cputime_to_timeval(p->stime, &prstatus->pr_stime);
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+ cputime_to_timeval(utime, &prstatus->pr_utime);
+ cputime_to_timeval(stime, &prstatus->pr_stime);
}
cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
@@ -1330,8 +1363,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
psinfo->pr_flag = p->flags;
rcu_read_lock();
cred = __task_cred(p);
- SET_UID(psinfo->pr_uid, cred->uid);
- SET_GID(psinfo->pr_gid, cred->gid);
+ SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+ SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
@@ -1348,6 +1381,103 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
}
+static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
+ const siginfo_t *siginfo)
+{
+ mm_segment_t old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo);
+ set_fs(old_fs);
+ fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
+}
+
+#define MAX_FILE_NOTE_SIZE (4*1024*1024)
+/*
+ * Format of NT_FILE note:
+ *
+ * long count -- how many files are mapped
+ * long page_size -- units for file_ofs
+ * array of [COUNT] elements of
+ * long start
+ * long end
+ * long file_ofs
+ * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
+ */
+static int fill_files_note(struct memelfnote *note)
+{
+ struct vm_area_struct *vma;
+ unsigned count, size, names_ofs, remaining, n;
+ user_long_t *data;
+ user_long_t *start_end_ofs;
+ char *name_base, *name_curpos;
+
+ /* *Estimated* file count and total data size needed */
+ count = current->mm->map_count;
+ size = count * 64;
+
+ names_ofs = (2 + 3 * count) * sizeof(data[0]);
+ alloc:
+ if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
+ return -EINVAL;
+ size = round_up(size, PAGE_SIZE);
+ data = vmalloc(size);
+ if (!data)
+ return -ENOMEM;
+
+ start_end_ofs = data + 2;
+ name_base = name_curpos = ((char *)data) + names_ofs;
+ remaining = size - names_ofs;
+ count = 0;
+ for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
+ struct file *file;
+ const char *filename;
+
+ file = vma->vm_file;
+ if (!file)
+ continue;
+ filename = d_path(&file->f_path, name_curpos, remaining);
+ if (IS_ERR(filename)) {
+ if (PTR_ERR(filename) == -ENAMETOOLONG) {
+ vfree(data);
+ size = size * 5 / 4;
+ goto alloc;
+ }
+ continue;
+ }
+
+ /* d_path() fills at the end, move name down */
+ /* n = strlen(filename) + 1: */
+ n = (name_curpos + remaining) - filename;
+ remaining = filename - name_curpos;
+ memmove(name_curpos, filename, n);
+ name_curpos += n;
+
+ *start_end_ofs++ = vma->vm_start;
+ *start_end_ofs++ = vma->vm_end;
+ *start_end_ofs++ = vma->vm_pgoff;
+ count++;
+ }
+
+ /* Now we know exact count of files, can store it */
+ data[0] = count;
+ data[1] = PAGE_SIZE;
+ /*
+ * Count usually is less than current->mm->map_count,
+ * we need to move filenames down.
+ */
+ n = current->mm->map_count - count;
+ if (n != 0) {
+ unsigned shift_bytes = n * 3 * sizeof(data[0]);
+ memmove(name_base - shift_bytes, name_base,
+ name_curpos - name_base);
+ name_curpos -= shift_bytes;
+ }
+
+ size = name_curpos - (char *)data;
+ fill_note(note, "CORE", NT_FILE, size, data);
+ return 0;
+}
+
#ifdef CORE_DUMP_USE_REGSET
#include <linux/regset.h>
@@ -1361,7 +1491,10 @@ struct elf_thread_core_info {
struct elf_note_info {
struct elf_thread_core_info *thread;
struct memelfnote psinfo;
+ struct memelfnote signote;
struct memelfnote auxv;
+ struct memelfnote files;
+ user_siginfo_t csigdata;
size_t size;
int thread_notes;
};
@@ -1378,6 +1511,22 @@ static void do_thread_regset_writeback(struct task_struct *task,
regset->writeback(task, regset, 1);
}
+#ifndef PR_REG_SIZE
+#define PR_REG_SIZE(S) sizeof(S)
+#endif
+
+#ifndef PRSTATUS_SIZE
+#define PRSTATUS_SIZE(S) sizeof(S)
+#endif
+
+#ifndef PR_REG_PTR
+#define PR_REG_PTR(S) (&((S)->pr_reg))
+#endif
+
+#ifndef SET_PR_FPVALID
+#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V))
+#endif
+
static int fill_thread_core_info(struct elf_thread_core_info *t,
const struct user_regset_view *view,
long signr, size_t *total)
@@ -1392,11 +1541,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
*/
fill_prstatus(&t->prstatus, t->task, signr);
(void) view->regsets[0].get(t->task, &view->regsets[0],
- 0, sizeof(t->prstatus.pr_reg),
- &t->prstatus.pr_reg, NULL);
+ 0, PR_REG_SIZE(t->prstatus.pr_reg),
+ PR_REG_PTR(&t->prstatus), NULL);
fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
- sizeof(t->prstatus), &t->prstatus);
+ PRSTATUS_SIZE(t->prstatus), &t->prstatus);
*total += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1409,7 +1558,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
for (i = 1; i < view->n; ++i) {
const struct user_regset *regset = &view->regsets[i];
do_thread_regset_writeback(t->task, regset);
- if (regset->core_note_type &&
+ if (regset->core_note_type && regset->get &&
(!regset->active || regset->active(t->task, regset))) {
int ret;
size_t size = regset->n * regset->size;
@@ -1426,7 +1575,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
regset->core_note_type,
size, data);
else {
- t->prstatus.pr_fpvalid = 1;
+ SET_PR_FPVALID(&t->prstatus, 1);
fill_note(&t->notes[i], "CORE",
NT_PRFPREG, size, data);
}
@@ -1440,7 +1589,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- long signr, struct pt_regs *regs)
+ const siginfo_t *siginfo, struct pt_regs *regs)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1453,8 +1602,10 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
info->thread = NULL;
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
- if (psinfo == NULL)
+ if (psinfo == NULL) {
+ info->psinfo.data = NULL; /* So we don't free this wrongly */
return 0;
+ }
fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
@@ -1480,7 +1631,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Initialize the ELF file header.
*/
fill_elf_header(elf, phdrs,
- view->e_machine, view->e_flags, view->ei_osabi);
+ view->e_machine, view->e_flags);
/*
* Allocate a structure for each thread.
@@ -1510,7 +1661,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Now fill in each thread's information.
*/
for (t = info->thread; t != NULL; t = t->next)
- if (!fill_thread_core_info(t, view, signr, &info->size))
+ if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
return 0;
/*
@@ -1519,9 +1670,15 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
info->size += notesize(&info->psinfo);
+ fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ info->size += notesize(&info->signote);
+
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
+ if (fill_files_note(&info->files) == 0)
+ info->size += notesize(&info->files);
+
return 1;
}
@@ -1535,28 +1692,33 @@ static size_t get_note_info_size(struct elf_note_info *info)
* process-wide notes are interleaved after the first thread-specific note.
*/
static int write_note_info(struct elf_note_info *info,
- struct file *file, loff_t *foffset)
+ struct coredump_params *cprm)
{
- bool first = 1;
+ bool first = true;
struct elf_thread_core_info *t = info->thread;
do {
int i;
- if (!writenote(&t->notes[0], file, foffset))
+ if (!writenote(&t->notes[0], cprm))
return 0;
- if (first && !writenote(&info->psinfo, file, foffset))
+ if (first && !writenote(&info->psinfo, cprm))
+ return 0;
+ if (first && !writenote(&info->signote, cprm))
return 0;
- if (first && !writenote(&info->auxv, file, foffset))
+ if (first && !writenote(&info->auxv, cprm))
+ return 0;
+ if (first && info->files.data &&
+ !writenote(&info->files, cprm))
return 0;
for (i = 1; i < info->thread_notes; ++i)
if (t->notes[i].data &&
- !writenote(&t->notes[i], file, foffset))
+ !writenote(&t->notes[i], cprm))
return 0;
- first = 0;
+ first = false;
t = t->next;
} while (t);
@@ -1576,6 +1738,7 @@ static void free_note_info(struct elf_note_info *info)
kfree(t);
}
kfree(info->psinfo.data);
+ vfree(info->files.data);
}
#else
@@ -1634,6 +1797,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
struct elf_note_info {
struct memelfnote *notes;
+ struct memelfnote *notes_files;
struct elf_prstatus *prstatus; /* NT_PRSTATUS */
struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */
struct list_head thread_list;
@@ -1641,6 +1805,7 @@ struct elf_note_info {
#ifdef ELF_CORE_COPY_XFPREGS
elf_fpxregset_t *xfpu;
#endif
+ user_siginfo_t csigdata;
int thread_status_size;
int numnote;
};
@@ -1650,76 +1815,62 @@ static int elf_note_info_init(struct elf_note_info *info)
memset(info, 0, sizeof(*info));
INIT_LIST_HEAD(&info->thread_list);
- /* Allocate space for six ELF notes */
- info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
+ /* Allocate space for ELF notes */
+ info->notes = kmalloc(8 * sizeof(struct memelfnote), GFP_KERNEL);
if (!info->notes)
return 0;
info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
if (!info->psinfo)
- goto notes_free;
+ return 0;
info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
if (!info->prstatus)
- goto psinfo_free;
+ return 0;
info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
if (!info->fpu)
- goto prstatus_free;
+ return 0;
#ifdef ELF_CORE_COPY_XFPREGS
info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
if (!info->xfpu)
- goto fpu_free;
+ return 0;
#endif
return 1;
-#ifdef ELF_CORE_COPY_XFPREGS
- fpu_free:
- kfree(info->fpu);
-#endif
- prstatus_free:
- kfree(info->prstatus);
- psinfo_free:
- kfree(info->psinfo);
- notes_free:
- kfree(info->notes);
- return 0;
}
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- long signr, struct pt_regs *regs)
+ const siginfo_t *siginfo, struct pt_regs *regs)
{
struct list_head *t;
+ struct core_thread *ct;
+ struct elf_thread_status *ets;
if (!elf_note_info_init(info))
return 0;
- if (signr) {
- struct core_thread *ct;
- struct elf_thread_status *ets;
-
- for (ct = current->mm->core_state->dumper.next;
- ct; ct = ct->next) {
- ets = kzalloc(sizeof(*ets), GFP_KERNEL);
- if (!ets)
- return 0;
+ for (ct = current->mm->core_state->dumper.next;
+ ct; ct = ct->next) {
+ ets = kzalloc(sizeof(*ets), GFP_KERNEL);
+ if (!ets)
+ return 0;
- ets->thread = ct->task;
- list_add(&ets->list, &info->thread_list);
- }
+ ets->thread = ct->task;
+ list_add(&ets->list, &info->thread_list);
+ }
- list_for_each(t, &info->thread_list) {
- int sz;
+ list_for_each(t, &info->thread_list) {
+ int sz;
- ets = list_entry(t, struct elf_thread_status, list);
- sz = elf_dump_thread_status(signr, ets);
- info->thread_status_size += sz;
- }
+ ets = list_entry(t, struct elf_thread_status, list);
+ sz = elf_dump_thread_status(siginfo->si_signo, ets);
+ info->thread_status_size += sz;
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(info->prstatus, current, signr);
+ fill_prstatus(info->prstatus, current, siginfo->si_signo);
elf_core_copy_regs(&info->prstatus->pr_reg, regs);
/* Set up header */
- fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
+ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
/*
* Set up the notes in similar form to SVR4 core dumps made
@@ -1732,9 +1883,14 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
sizeof(*info->psinfo), info->psinfo);
- info->numnote = 2;
+ fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ fill_auxv_note(info->notes + 3, current->mm);
+ info->numnote = 4;
- fill_auxv_note(&info->notes[info->numnote++], current->mm);
+ if (fill_files_note(info->notes + info->numnote) == 0) {
+ info->notes_files = info->notes + info->numnote;
+ info->numnote++;
+ }
/* Try to dump the FPU. */
info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
@@ -1766,13 +1922,13 @@ static size_t get_note_info_size(struct elf_note_info *info)
}
static int write_note_info(struct elf_note_info *info,
- struct file *file, loff_t *foffset)
+ struct coredump_params *cprm)
{
int i;
struct list_head *t;
for (i = 0; i < info->numnote; i++)
- if (!writenote(info->notes + i, file, foffset))
+ if (!writenote(info->notes + i, cprm))
return 0;
/* write out the thread status notes section */
@@ -1781,7 +1937,7 @@ static int write_note_info(struct elf_note_info *info,
list_entry(t, struct elf_thread_status, list);
for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], file, foffset))
+ if (!writenote(&tmp->notes[i], cprm))
return 0;
}
@@ -1796,6 +1952,10 @@ static void free_note_info(struct elf_note_info *info)
kfree(list_entry(tmp, struct elf_thread_status, list));
}
+ /* Free data possibly allocated by fill_files_note(): */
+ if (info->notes_files)
+ vfree(info->notes_files->data);
+
kfree(info->prstatus);
kfree(info->psinfo);
kfree(info->notes);
@@ -1873,11 +2033,10 @@ static int elf_core_dump(struct coredump_params *cprm)
int has_dumped = 0;
mm_segment_t fs;
int segs;
- size_t size = 0;
struct vm_area_struct *vma, *gate_vma;
struct elfhdr *elf = NULL;
- loff_t offset = 0, dataoff, foffset;
- struct elf_note_info info;
+ loff_t offset = 0, dataoff;
+ struct elf_note_info info = { };
struct elf_phdr *phdr4note = NULL;
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
@@ -1922,18 +2081,16 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs))
+ if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs))
goto cleanup;
has_dumped = 1;
- current->flags |= PF_DUMPCORE;
-
+
fs = get_fs();
set_fs(KERNEL_DS);
offset += sizeof(*elf); /* Elf header */
offset += segs * sizeof(struct elf_phdr); /* Program headers */
- foffset = offset;
/* Write notes phdr entry */
{
@@ -1964,13 +2121,10 @@ static int elf_core_dump(struct coredump_params *cprm)
offset = dataoff;
- size += sizeof(*elf);
- if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+ if (!dump_emit(cprm, elf, sizeof(*elf)))
goto end_coredump;
- size += sizeof(*phdr4note);
- if (size > cprm->limit
- || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+ if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
goto end_coredump;
/* Write program headers for segments dump */
@@ -1992,24 +2146,22 @@ static int elf_core_dump(struct coredump_params *cprm)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
- size += sizeof(phdr);
- if (size > cprm->limit
- || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+ if (!dump_emit(cprm, &phdr, sizeof(phdr)))
goto end_coredump;
}
- if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+ if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
/* write out the notes section */
- if (!write_note_info(&info, cprm->file, &foffset))
+ if (!write_note_info(&info, cprm))
goto end_coredump;
- if (elf_coredump_extra_notes_write(cprm->file, &foffset))
+ if (elf_coredump_extra_notes_write(cprm))
goto end_coredump;
/* Align to page */
- if (!dump_seek(cprm->file, dataoff - foffset))
+ if (!dump_skip(cprm, dataoff - cprm->written))
goto end_coredump;
for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -2026,26 +2178,21 @@ static int elf_core_dump(struct coredump_params *cprm)
page = get_dump_page(addr);
if (page) {
void *kaddr = kmap(page);
- stop = ((size += PAGE_SIZE) > cprm->limit) ||
- !dump_write(cprm->file, kaddr,
- PAGE_SIZE);
+ stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
kunmap(page);
page_cache_release(page);
} else
- stop = !dump_seek(cprm->file, PAGE_SIZE);
+ stop = !dump_skip(cprm, PAGE_SIZE);
if (stop)
goto end_coredump;
}
}
- if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+ if (!elf_core_write_extra_data(cprm))
goto end_coredump;
if (e_phnum == PN_XNUM) {
- size += sizeof(*shdr4extnum);
- if (size > cprm->limit
- || !dump_write(cprm->file, shdr4extnum,
- sizeof(*shdr4extnum)))
+ if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
goto end_coredump;
}
@@ -2065,7 +2212,8 @@ out:
static int __init init_elf_binfmt(void)
{
- return register_binfmt(&elf_format);
+ register_binfmt(&elf_format);
+ return 0;
}
static void __exit exit_elf_binfmt(void)
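
The core-dump rework above replaces dump_write()/dump_seek() and the manual "size += ...; if (size > cprm->limit)" accounting with dump_emit(), dump_skip() and dump_align(), which check the limit and track the file offset inside struct coredump_params. A minimal sketch of writing one ELF note that way (the helper name and note payload are placeholders):

static int example_write_note(struct coredump_params *cprm, const char *name,
                              int type, const void *data, unsigned int datasz)
{
        struct elf_note en = {
                .n_namesz = strlen(name) + 1,
                .n_descsz = datasz,
                .n_type   = type,
        };

        return dump_emit(cprm, &en, sizeof(en)) &&
               dump_emit(cprm, name, en.n_namesz) && dump_align(cprm, 4) &&
               dump_emit(cprm, data, datasz) && dump_align(cprm, 4);
}
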
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 63039ed9576..fe2a643ee00 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -56,7 +56,7 @@ typedef char *elf_caddr_t;
MODULE_LICENSE("GPL");
-static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *);
+static int load_elf_fdpic_binary(struct linux_binprm *);
static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
struct mm_struct *, const char *);
@@ -91,7 +91,8 @@ static struct linux_binfmt elf_fdpic_format = {
static int __init init_elf_fdpic_binfmt(void)
{
- return register_binfmt(&elf_fdpic_format);
+ register_binfmt(&elf_fdpic_format);
+ return 0;
}
static void __exit exit_elf_fdpic_binfmt(void)
@@ -110,7 +111,7 @@ static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
return 0;
if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr))
return 0;
- if (!file->f_op || !file->f_op->mmap)
+ if (!file->f_op->mmap)
return 0;
return 1;
}
@@ -163,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
/*
* load an fdpic binary into various bits of memory
*/
-static int load_elf_fdpic_binary(struct linux_binprm *bprm,
- struct pt_regs *regs)
+static int load_elf_fdpic_binary(struct linux_binprm *bprm)
{
struct elf_fdpic_params exec_params, interp_params;
+ struct pt_regs *regs = current_pt_regs();
struct elf_phdr *phdr;
unsigned long stack_size, entryaddr;
#ifdef ELF_FDPIC_PLAT_INIT
@@ -245,8 +246,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
* mm->dumpable = 0 regardless of the interpreter's
* permissions.
*/
- if (file_permission(interpreter, MAY_READ) < 0)
- bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+ would_dump(bprm, interpreter);
retval = kernel_read(interpreter, 0, bprm->buf,
BINPRM_BUF_SIZE);
@@ -335,8 +335,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
current->mm->context.exec_fdpic_loadmap = 0;
current->mm->context.interp_fdpic_loadmap = 0;
- current->flags &= ~PF_FORKNOEXEC;
-
#ifdef CONFIG_MMU
elf_fdpic_arch_lay_out_mm(&exec_params,
&interp_params,
@@ -391,21 +389,17 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
(executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
stack_prot |= PROT_EXEC;
- down_write(&current->mm->mmap_sem);
- current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
+ current->mm->start_brk = vm_mmap(NULL, 0, stack_size, stack_prot,
MAP_PRIVATE | MAP_ANONYMOUS |
MAP_UNINITIALIZED | MAP_GROWSDOWN,
0);
if (IS_ERR_VALUE(current->mm->start_brk)) {
- up_write(&current->mm->mmap_sem);
retval = current->mm->start_brk;
current->mm->start_brk = 0;
goto error_kill;
}
- up_write(&current->mm->mmap_sem);
-
current->mm->brk = current->mm->start_brk;
current->mm->context.end_brk = current->mm->start_brk;
current->mm->context.end_brk +=
@@ -414,7 +408,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
#endif
install_exec_creds(bprm);
- current->flags &= ~PF_FORKNOEXEC;
if (create_elf_fdpic_tables(bprm, current->mm,
&exec_params, &interp_params) < 0)
goto error_kill;
@@ -490,7 +483,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
size_t platform_len = 0, len;
char *k_platform, *k_base_platform;
char __user *u_platform, *u_base_platform, *p;
- long hwcap;
int loop;
int nr; /* reset for each csp adjustment */
@@ -509,8 +501,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
return -EFAULT;
#endif
- hwcap = ELF_HWCAP;
-
/*
* If this architecture has a platform capability string, copy it
* to userspace. In some cases (Sparc), this info is impossible
@@ -624,7 +614,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
nr = 0;
csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
- NEW_AUX_ENT(AT_HWCAP, hwcap);
+ NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
+#ifdef ELF_HWCAP2
+ NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+#endif
NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr);
@@ -633,10 +626,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr);
NEW_AUX_ENT(AT_FLAGS, 0);
NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr);
- NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid);
- NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid);
- NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid);
- NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid);
+ NEW_AUX_ENT(AT_UID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid));
+ NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid));
+ NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid));
+ NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid));
NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
@@ -916,7 +909,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
dynamic_error:
printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n",
- what, file->f_path.dentry->d_inode->i_ino);
+ what, file_inode(file)->i_ino);
return -ELIBBAD;
}
@@ -933,7 +926,6 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
struct elf32_fdpic_loadseg *seg;
struct elf32_phdr *phdr;
unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags;
- loff_t fpos;
int loop, ret;
load_addr = params->load_addr;
@@ -957,10 +949,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
mflags |= MAP_EXECUTABLE;
- down_write(&mm->mmap_sem);
- maddr = do_mmap(NULL, load_addr, top - base,
+ maddr = vm_mmap(NULL, load_addr, top - base,
PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0);
- up_write(&mm->mmap_sem);
if (IS_ERR_VALUE(maddr))
return (int) maddr;
@@ -973,14 +963,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
if (params->phdrs[loop].p_type != PT_LOAD)
continue;
- fpos = phdr->p_offset;
-
seg->addr = maddr + (phdr->p_vaddr - base);
seg->p_vaddr = phdr->p_vaddr;
seg->p_memsz = phdr->p_memsz;
- ret = file->f_op->read(file, (void *) seg->addr,
- phdr->p_filesz, &fpos);
+ ret = read_code(file, seg->addr, phdr->p_offset,
+ phdr->p_filesz);
if (ret < 0)
return ret;
@@ -1098,10 +1086,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
/* create the mapping */
disp = phdr->p_vaddr & ~PAGE_MASK;
- down_write(&mm->mmap_sem);
- maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
+ maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
phdr->p_offset - disp);
- up_write(&mm->mmap_sem);
kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
loop, phdr->p_memsz + disp, prot, flags,
@@ -1145,10 +1131,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
unsigned long xmaddr;
flags |= MAP_FIXED | MAP_ANONYMOUS;
- down_write(&mm->mmap_sem);
- xmaddr = do_mmap(NULL, xaddr, excess - excess1,
+ xmaddr = vm_mmap(NULL, xaddr, excess - excess1,
prot, flags, 0);
- up_write(&mm->mmap_sem);
kdebug("mmap[%d] <anon>"
" ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx",
@@ -1217,7 +1201,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
int dump_ok;
/* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & (VM_IO | VM_RESERVED)) {
+ if (vma->vm_flags & VM_IO) {
kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
return 0;
}
@@ -1232,7 +1216,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
/* By default, dump shared memory if mapped from an anonymous file. */
if (vma->vm_flags & VM_SHARED) {
- if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) {
+ if (file_inode(vma->vm_file)->i_nlink == 0) {
dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags);
kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
vma->vm_flags, dump_ok ? "yes" : "no");
@@ -1283,35 +1267,17 @@ static int notesize(struct memelfnote *en)
/* #define DEBUG */
-#define DUMP_WRITE(addr, nr, foffset) \
- do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
-
-static int alignfile(struct file *file, loff_t *foffset)
-{
- static const char buf[4] = { 0, };
- DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
- return 1;
-}
-
-static int writenote(struct memelfnote *men, struct file *file,
- loff_t *foffset)
+static int writenote(struct memelfnote *men, struct coredump_params *cprm)
{
struct elf_note en;
en.n_namesz = strlen(men->name) + 1;
en.n_descsz = men->datasz;
en.n_type = men->type;
- DUMP_WRITE(&en, sizeof(en), foffset);
- DUMP_WRITE(men->name, en.n_namesz, foffset);
- if (!alignfile(file, foffset))
- return 0;
- DUMP_WRITE(men->data, men->datasz, foffset);
- if (!alignfile(file, foffset))
- return 0;
-
- return 1;
+ return dump_emit(cprm, &en, sizeof(en)) &&
+ dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) &&
+ dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
}
-#undef DUMP_WRITE
static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
{
@@ -1388,8 +1354,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
- cputime_to_timeval(p->utime, &prstatus->pr_utime);
- cputime_to_timeval(p->stime, &prstatus->pr_stime);
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+ cputime_to_timeval(utime, &prstatus->pr_utime);
+ cputime_to_timeval(stime, &prstatus->pr_stime);
}
cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
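For context, the hunk above stops reading p->utime/p->stime directly and goes through task_cputime(), the accessor that lets the cputime code hide how those fields are actually maintained. A minimal sketch of the accessor pattern, using the same helpers as the hunk (p stands for the task_struct being dumped; the timeval locals are just illustration):

	/* Sketch: fetch a task's CPU time through the accessor instead of
	 * touching the utime/stime fields directly. */
	cputime_t utime, stime;
	struct timeval tv_utime, tv_stime;

	task_cputime(p, &utime, &stime);
	cputime_to_timeval(utime, &tv_utime);	/* convert for the ELF note */
	cputime_to_timeval(stime, &tv_stime);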
@@ -1433,8 +1402,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
psinfo->pr_flag = p->flags;
rcu_read_lock();
cred = __task_cred(p);
- SET_UID(psinfo->pr_uid, cred->uid);
- SET_GID(psinfo->pr_gid, cred->gid);
+ SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+ SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
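The psinfo hunk reflects the kuid_t/kgid_t conversion: cred->uid and cred->gid are kernel-internal values now and have to be translated back into plain uid_t/gid_t before being stored in the note. A minimal sketch of the translation the patch uses (cred is the dumped task's credentials, as in the hunk):

	/* Sketch: map a kernel-internal kuid_t/kgid_t into the IDs visible in
	 * cred->user_ns; the _munged variants substitute the overflow IDs for
	 * values that have no mapping there. */
	uid_t uid = from_kuid_munged(cred->user_ns, cred->uid);
	gid_t gid = from_kgid_munged(cred->user_ns, cred->gid);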
@@ -1513,66 +1482,40 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
/*
* dump the segments for an MMU process
*/
-#ifdef CONFIG_MMU
-static int elf_fdpic_dump_segments(struct file *file, size_t *size,
- unsigned long *limit, unsigned long mm_flags)
+static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
{
struct vm_area_struct *vma;
- int err = 0;
for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
unsigned long addr;
- if (!maydump(vma, mm_flags))
+ if (!maydump(vma, cprm->mm_flags))
continue;
+#ifdef CONFIG_MMU
for (addr = vma->vm_start; addr < vma->vm_end;
addr += PAGE_SIZE) {
+ bool res;
struct page *page = get_dump_page(addr);
if (page) {
void *kaddr = kmap(page);
- *size += PAGE_SIZE;
- if (*size > *limit)
- err = -EFBIG;
- else if (!dump_write(file, kaddr, PAGE_SIZE))
- err = -EIO;
+ res = dump_emit(cprm, kaddr, PAGE_SIZE);
kunmap(page);
page_cache_release(page);
- } else if (!dump_seek(file, PAGE_SIZE))
- err = -EFBIG;
- if (err)
- goto out;
+ } else {
+ res = dump_skip(cprm, PAGE_SIZE);
+ }
+ if (!res)
+ return false;
}
- }
-out:
- return err;
-}
-#endif
-
-/*
- * dump the segments for a NOMMU process
- */
-#ifndef CONFIG_MMU
-static int elf_fdpic_dump_segments(struct file *file, size_t *size,
- unsigned long *limit, unsigned long mm_flags)
-{
- struct vm_area_struct *vma;
-
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
- if (!maydump(vma, mm_flags))
- continue;
-
- if ((*size += PAGE_SIZE) > *limit)
- return -EFBIG;
-
- if (!dump_write(file, (void *) vma->vm_start,
+#else
+ if (!dump_emit(cprm, (void *) vma->vm_start,
vma->vm_end - vma->vm_start))
- return -EIO;
+ return false;
+#endif
}
-
- return 0;
+ return true;
}
-#endif
static size_t elf_core_vma_data_size(unsigned long mm_flags)
{
@@ -1598,11 +1541,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
int has_dumped = 0;
mm_segment_t fs;
int segs;
- size_t size = 0;
int i;
struct vm_area_struct *vma;
struct elfhdr *elf = NULL;
- loff_t offset = 0, dataoff, foffset;
+ loff_t offset = 0, dataoff;
int numnote;
struct memelfnote *notes = NULL;
struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
@@ -1619,6 +1561,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
+ struct core_thread *ct;
+ struct elf_thread_status *tmp;
/*
* We no longer stop all VM operations.
@@ -1654,32 +1598,27 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto cleanup;
#endif
- if (cprm->signr) {
- struct core_thread *ct;
- struct elf_thread_status *tmp;
-
- for (ct = current->mm->core_state->dumper.next;
- ct; ct = ct->next) {
- tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
- if (!tmp)
- goto cleanup;
+ for (ct = current->mm->core_state->dumper.next;
+ ct; ct = ct->next) {
+ tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+ if (!tmp)
+ goto cleanup;
- tmp->thread = ct->task;
- list_add(&tmp->list, &thread_list);
- }
+ tmp->thread = ct->task;
+ list_add(&tmp->list, &thread_list);
+ }
- list_for_each(t, &thread_list) {
- struct elf_thread_status *tmp;
- int sz;
+ list_for_each(t, &thread_list) {
+ struct elf_thread_status *tmp;
+ int sz;
- tmp = list_entry(t, struct elf_thread_status, list);
- sz = elf_dump_thread_status(cprm->signr, tmp);
- thread_status_size += sz;
- }
+ tmp = list_entry(t, struct elf_thread_status, list);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp);
+ thread_status_size += sz;
}
/* now collect the dump for the current */
- fill_prstatus(prstatus, current, cprm->signr);
+ fill_prstatus(prstatus, current, cprm->siginfo->si_signo);
elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
segs = current->mm->map_count;
@@ -1697,8 +1636,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
fill_elf_fdpic_header(elf, e_phnum);
has_dumped = 1;
- current->flags |= PF_DUMPCORE;
-
/*
* Set up the notes in similar form to SVR4 core dumps made
* with info from their /proc.
@@ -1735,7 +1672,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
offset += sizeof(*elf); /* Elf header */
offset += segs * sizeof(struct elf_phdr); /* Program headers */
- foffset = offset;
/* Write notes phdr entry */
{
@@ -1770,13 +1706,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
offset = dataoff;
- size += sizeof(*elf);
- if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+ if (!dump_emit(cprm, elf, sizeof(*elf)))
goto end_coredump;
- size += sizeof(*phdr4note);
- if (size > cprm->limit
- || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+ if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
goto end_coredump;
/* write program headers for segments dump */
@@ -1800,18 +1733,16 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
- size += sizeof(phdr);
- if (size > cprm->limit
- || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+ if (!dump_emit(cprm, &phdr, sizeof(phdr)))
goto end_coredump;
}
- if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+ if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
/* write out the notes section */
for (i = 0; i < numnote; i++)
- if (!writenote(notes + i, cprm->file, &foffset))
+ if (!writenote(notes + i, cprm))
goto end_coredump;
/* write out the thread status notes section */
@@ -1820,25 +1751,21 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
list_entry(t, struct elf_thread_status, list);
for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], cprm->file, &foffset))
+ if (!writenote(&tmp->notes[i], cprm))
goto end_coredump;
}
- if (!dump_seek(cprm->file, dataoff - foffset))
+ if (!dump_skip(cprm, dataoff - cprm->written))
goto end_coredump;
- if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
- cprm->mm_flags) < 0)
+ if (!elf_fdpic_dump_segments(cprm))
goto end_coredump;
- if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+ if (!elf_core_write_extra_data(cprm))
goto end_coredump;
if (e_phnum == PN_XNUM) {
- size += sizeof(*shdr4extnum);
- if (size > cprm->limit
- || !dump_write(cprm->file, shdr4extnum,
- sizeof(*shdr4extnum)))
+ if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
goto end_coredump;
}
@@ -1864,6 +1791,7 @@ cleanup:
kfree(psinfo);
kfree(notes);
kfree(fpu);
+ kfree(shdr4extnum);
#ifdef ELF_CORE_COPY_XFPREGS
kfree(xfpu);
#endif
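For reference, the conversion above replaces the old dump_write()/dump_seek() calls plus the hand-maintained size/limit/foffset counters with dump_emit(), dump_skip() and dump_align(), which account against cprm->written and cprm->limit internally and return nonzero on success. A minimal sketch of the new calling pattern (emit_blob and its blob/len parameters are hypothetical; the dump_* helpers are the ones used in the patch):

#include <linux/coredump.h>

/* Sketch: write one opaque blob into the core file, padded to 4 bytes.
 * Returns nonzero on success, 0 once the dump hit its limit or failed. */
static int emit_blob(struct coredump_params *cprm, const void *blob, int len)
{
	/* dump_emit() advances cprm->written and checks cprm->limit itself,
	 * so no manual "size += len; if (size > cprm->limit) ..." bookkeeping. */
	return dump_emit(cprm, blob, len) && dump_align(cprm, 4);
}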
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index b8e8b0acf9b..f37b08cea1f 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -22,7 +22,7 @@
#define EM86_INTERP "/usr/bin/em86"
#define EM86_I_NAME "em86"
-static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_em86(struct linux_binprm *bprm)
{
char *interp, *i_name, *i_arg;
struct file * file;
@@ -38,11 +38,10 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
/* First of all, some simple consistency checks */
if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) ||
(!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) ||
- (!bprm->file->f_op || !bprm->file->f_op->mmap)) {
+ !bprm->file->f_op->mmap) {
return -ENOEXEC;
}
- bprm->recursion_depth++; /* Well, the bang-shell is implicit... */
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
@@ -90,7 +89,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
if (retval < 0)
return retval;
- return search_binary_handler(bprm, regs);
+ return search_binary_handler(bprm);
}
static struct linux_binfmt em86_format = {
@@ -100,7 +99,8 @@ static struct linux_binfmt em86_format = {
static int __init init_em86_binfmt(void)
{
- return register_binfmt(&em86_format);
+ register_binfmt(&em86_format);
+ return 0;
}
static void __exit exit_em86_binfmt(void)
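The em86 changes follow the binfmt API cleanup visible throughout this series: load_binary() loses its pt_regs argument (a loader that needs the register state fetches it with current_pt_regs() instead), and register_binfmt() no longer reports an error, so the module init stubs simply return 0. A minimal registration skeleton under the new API (load_foo, foo_format and the init/exit names are hypothetical):

#include <linux/binfmts.h>
#include <linux/module.h>

static int load_foo(struct linux_binprm *bprm)
{
	/* A loader that needs the registers now calls current_pt_regs()
	 * itself instead of taking a pt_regs argument. */

	/* ... inspect bprm->buf, set up the new mm, start the thread ... */
	return -ENOEXEC;	/* not our format */
}

static struct linux_binfmt foo_format = {
	.module		= THIS_MODULE,
	.load_binary	= load_foo,
};

static int __init init_foo_binfmt(void)
{
	register_binfmt(&foo_format);	/* no error to propagate here */
	return 0;
}

static void __exit exit_foo_binfmt(void)
{
	unregister_binfmt(&foo_format);
}

module_init(init_foo_binfmt);
module_exit(exit_foo_binfmt);
MODULE_LICENSE("GPL");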
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 811384bec8d..f723cd3a455 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -15,7 +15,7 @@
* JAN/99 -- coded full program relocation (gerg@snapgear.com)
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -37,7 +37,6 @@
#include <linux/syscalls.h>
#include <asm/byteorder.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <asm/cacheflush.h>
@@ -89,7 +88,7 @@ struct lib_info {
static int load_flat_shared_library(int id, struct lib_info *p);
#endif
-static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_flat_binary(struct linux_binprm *);
static int flat_core_dump(struct coredump_params *cprm);
static struct linux_binfmt flat_format = {
@@ -108,7 +107,7 @@ static struct linux_binfmt flat_format = {
static int flat_core_dump(struct coredump_params *cprm)
{
printk("Process %s:%d received signr %d and should have core dumped\n",
- current->comm, current->pid, (int) cprm->signr);
+ current->comm, current->pid, (int) cprm->siginfo->si_signo);
return(1);
}
@@ -208,11 +207,12 @@ static int decompress_exec(
/* Read in first chunk of data and parse gzip header. */
fpos = offset;
- ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
+ ret = kernel_read(bprm->file, offset, buf, LBUFSIZE);
strm.next_in = buf;
strm.avail_in = ret;
strm.total_in = 0;
+ fpos += ret;
retval = -ENOEXEC;
@@ -278,7 +278,7 @@ static int decompress_exec(
}
while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) {
- ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
+ ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE);
if (ret <= 0)
break;
len -= ret;
@@ -286,6 +286,7 @@ static int decompress_exec(
strm.next_in = buf;
strm.avail_in = ret;
strm.total_in = 0;
+ fpos += ret;
}
if (ret < 0) {
@@ -379,7 +380,7 @@ failed:
/****************************************************************************/
-void old_reloc(unsigned long rl)
+static void old_reloc(unsigned long rl)
{
#ifdef DEBUG
char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
@@ -429,6 +430,7 @@ static int load_flat_file(struct linux_binprm * bprm,
unsigned long textpos = 0, datapos = 0, result;
unsigned long realdatastart = 0;
unsigned long text_len, data_len, bss_len, stack_len, flags;
+ unsigned long full_data;
unsigned long len, memp = 0;
unsigned long memp_size, extra, rlim;
unsigned long *reloc = 0, *rp;
@@ -439,7 +441,7 @@ static int load_flat_file(struct linux_binprm * bprm,
int ret;
hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */
- inode = bprm->file->f_path.dentry->d_inode;
+ inode = file_inode(bprm->file);
text_len = ntohl(hdr->data_start);
data_len = ntohl(hdr->data_end) - ntohl(hdr->data_start);
@@ -452,6 +454,7 @@ static int load_flat_file(struct linux_binprm * bprm,
relocs = ntohl(hdr->reloc_count);
flags = ntohl(hdr->flags);
rev = ntohl(hdr->rev);
+ full_data = data_len + relocs * sizeof(unsigned long);
if (strncmp(hdr->magic, "bFLT", 4)) {
/*
@@ -543,10 +546,8 @@ static int load_flat_file(struct linux_binprm * bprm,
*/
DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n");
- down_write(&current->mm->mmap_sem);
- textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
+ textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
MAP_PRIVATE|MAP_EXECUTABLE, 0);
- up_write(&current->mm->mmap_sem);
if (!textpos || IS_ERR_VALUE(textpos)) {
if (!textpos)
textpos = (unsigned long) -ENOMEM;
@@ -557,17 +558,15 @@ static int load_flat_file(struct linux_binprm * bprm,
len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
- down_write(&current->mm->mmap_sem);
- realdatastart = do_mmap(0, 0, len,
+ realdatastart = vm_mmap(0, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
- up_write(&current->mm->mmap_sem);
if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
if (!realdatastart)
realdatastart = (unsigned long) -ENOMEM;
printk("Unable to allocate RAM for process data, errno %d\n",
(int)-realdatastart);
- do_munmap(current->mm, textpos, text_len);
+ vm_munmap(textpos, text_len);
ret = realdatastart;
goto err;
}
@@ -582,17 +581,17 @@ static int load_flat_file(struct linux_binprm * bprm,
#ifdef CONFIG_BINFMT_ZFLAT
if (flags & FLAT_FLAG_GZDATA) {
result = decompress_exec(bprm, fpos, (char *) datapos,
- data_len + (relocs * sizeof(unsigned long)), 0);
+ full_data, 0);
} else
#endif
{
- result = bprm->file->f_op->read(bprm->file, (char *) datapos,
- data_len + (relocs * sizeof(unsigned long)), &fpos);
+ result = read_code(bprm->file, datapos, fpos,
+ full_data);
}
if (IS_ERR_VALUE(result)) {
printk("Unable to read data+bss, errno %d\n", (int)-result);
- do_munmap(current->mm, textpos, text_len);
- do_munmap(current->mm, realdatastart, len);
+ vm_munmap(textpos, text_len);
+ vm_munmap(realdatastart, len);
ret = result;
goto err;
}
@@ -604,10 +603,8 @@ static int load_flat_file(struct linux_binprm * bprm,
len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
- down_write(&current->mm->mmap_sem);
- textpos = do_mmap(0, 0, len,
+ textpos = vm_mmap(0, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
- up_write(&current->mm->mmap_sem);
if (!textpos || IS_ERR_VALUE(textpos)) {
if (!textpos)
@@ -634,34 +631,29 @@ static int load_flat_file(struct linux_binprm * bprm,
if (flags & FLAT_FLAG_GZIP) {
result = decompress_exec(bprm, sizeof (struct flat_hdr),
(((char *) textpos) + sizeof (struct flat_hdr)),
- (text_len + data_len + (relocs * sizeof(unsigned long))
+ (text_len + full_data
- sizeof (struct flat_hdr)),
0);
memmove((void *) datapos, (void *) realdatastart,
- data_len + (relocs * sizeof(unsigned long)));
+ full_data);
} else if (flags & FLAT_FLAG_GZDATA) {
- fpos = 0;
- result = bprm->file->f_op->read(bprm->file,
- (char *) textpos, text_len, &fpos);
+ result = read_code(bprm->file, textpos, 0, text_len);
if (!IS_ERR_VALUE(result))
result = decompress_exec(bprm, text_len, (char *) datapos,
- data_len + (relocs * sizeof(unsigned long)), 0);
+ full_data, 0);
}
else
#endif
{
- fpos = 0;
- result = bprm->file->f_op->read(bprm->file,
- (char *) textpos, text_len, &fpos);
- if (!IS_ERR_VALUE(result)) {
- fpos = ntohl(hdr->data_start);
- result = bprm->file->f_op->read(bprm->file, (char *) datapos,
- data_len + (relocs * sizeof(unsigned long)), &fpos);
- }
+ result = read_code(bprm->file, textpos, 0, text_len);
+ if (!IS_ERR_VALUE(result))
+ result = read_code(bprm->file, datapos,
+ ntohl(hdr->data_start),
+ full_data);
}
if (IS_ERR_VALUE(result)) {
printk("Unable to read code+data+bss, errno %d\n",(int)-result);
- do_munmap(current->mm, textpos, text_len + data_len + extra +
+ vm_munmap(textpos, text_len + data_len + extra +
MAX_SHARED_LIBS * sizeof(unsigned long));
ret = result;
goto err;
@@ -717,7 +709,7 @@ static int load_flat_file(struct linux_binprm * bprm,
* help simplify all this mumbo jumbo
*
* We've got two different sections of relocation entries.
- * The first is the GOT which resides at the begining of the data segment
+ * The first is the GOT which resides at the beginning of the data segment
* and is terminated with a -1. This one can be relocated in place.
* The second is the extra relocation entries tacked after the image's
* data segment. These require a little more processing as the entry is
@@ -820,6 +812,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
int res;
char buf[16];
+ memset(&bprm, 0, sizeof(bprm));
+
/* Create the file name */
sprintf(buf, "/lib/lib%d.so", id);
@@ -835,6 +829,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
if (!bprm.cred)
goto out;
+ /* We don't really care about recalculating credentials at this point
+ * as we're past the point of no return and are dealing with shared
+ * libraries.
+ */
+ bprm.cred_prepared = 1;
+
res = prepare_binprm(&bprm);
if (!IS_ERR_VALUE(res))
@@ -857,9 +857,10 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
-static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_flat_binary(struct linux_binprm * bprm)
{
struct lib_info libinfo;
+ struct pt_regs *regs = current_pt_regs();
unsigned long p = bprm->p;
unsigned long stack_len;
unsigned long start_addr;
@@ -894,7 +895,6 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
libinfo.lib_list[j].start_data:UNLOADED_LIB;
install_exec_creds(bprm);
- current->flags &= ~PF_FORKNOEXEC;
set_binfmt(&flat_format);
@@ -942,7 +942,8 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
static int __init init_flat_binfmt(void)
{
- return register_binfmt(&flat_format);
+ register_binfmt(&flat_format);
+ return 0;
}
/****************************************************************************/
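Two I/O conversions recur in the flat loader above: read_code() replaces the open-coded file->f_op->read() calls that copy text/data into the freshly mapped region, and the ZFLAT decompressor switches to kernel_read(), which takes an explicit offset, so the patch has to advance fpos by hand after each chunk. A minimal sketch of that chunked-read loop (buf, LBUFSIZE and consume() are hypothetical; kernel_read() is used with the signature this kernel provides):

	/* Sketch: read a file in LBUFSIZE chunks with kernel_read(),
	 * tracking the file position manually since there is no *ppos. */
	loff_t fpos = offset;
	int ret;

	for (;;) {
		ret = kernel_read(file, fpos, buf, LBUFSIZE);
		if (ret <= 0)
			break;			/* EOF or error */
		fpos += ret;			/* kernel_read() keeps no cursor */
		consume(buf, ret);		/* hypothetical: feed the decompressor */
	}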
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1befe2ec818..b60500300dd 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -19,9 +19,11 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/magic.h>
#include <linux/binfmts.h>
#include <linux/slab.h>
#include <linux/ctype.h>
+#include <linux/string_helpers.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
@@ -103,7 +105,7 @@ static Node *check_file(struct linux_binprm *bprm)
/*
* the loader itself
*/
-static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_misc_binary(struct linux_binprm *bprm)
{
Node *fmt;
struct file * interp_file = NULL;
@@ -116,10 +118,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (!enabled)
goto _ret;
- retval = -ENOEXEC;
- if (bprm->recursion_depth > BINPRM_MAX_RECURSION)
- goto _ret;
-
/* to keep locking time low, we copy the interpreter string */
read_lock(&entries_lock);
fmt = check_file(bprm);
@@ -149,8 +147,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
/* if the binary is not readable than enforce mm->dumpable=0
regardless of the interpreter's permissions */
- if (file_permission(bprm->file, MAY_READ))
- bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+ would_dump(bprm, bprm->file);
allow_write_access(bprm->file);
bprm->file = NULL;
@@ -176,7 +173,10 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
goto _error;
bprm->argc ++;
- bprm->interp = iname; /* for binfmt_script */
+ /* Update interp in case binfmt_script needs it. */
+ retval = bprm_change_interp(iname, bprm);
+ if (retval < 0)
+ goto _error;
interp_file = open_exec (iname);
retval = PTR_ERR (interp_file);
@@ -197,9 +197,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (retval < 0)
goto _error;
- bprm->recursion_depth++;
-
- retval = search_binary_handler (bprm, regs);
+ retval = search_binary_handler(bprm);
if (retval < 0)
goto _error;
@@ -237,24 +235,6 @@ static char *scanarg(char *s, char del)
return s;
}
-static int unquote(char *from)
-{
- char c = 0, *s = from, *p = from;
-
- while ((c = *s++) != '\0') {
- if (c == '\\' && *s == 'x') {
- s++;
- c = toupper(*s++);
- *p = (c - (isdigit(c) ? '0' : 'A' - 10)) << 4;
- c = toupper(*s++);
- *p++ |= c - (isdigit(c) ? '0' : 'A' - 10);
- continue;
- }
- *p++ = c;
- }
- return p - from;
-}
-
static char * check_special_flags (char * sfs, Node * e)
{
char * p = sfs;
@@ -357,8 +337,9 @@ static Node *create_entry(const char __user *buffer, size_t count)
p[-1] = '\0';
if (!e->mask[0])
e->mask = NULL;
- e->size = unquote(e->magic);
- if (e->mask && unquote(e->mask) != e->size)
+ e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX);
+ if (e->mask &&
+ string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size)
goto Einval;
if (e->size + e->offset > BINPRM_BUF_SIZE)
goto Einval;
@@ -505,7 +486,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
static void bm_evict_inode(struct inode *inode)
{
- end_writeback(inode);
+ clear_inode(inode);
kfree(inode->i_private);
}
@@ -522,7 +503,7 @@ static void kill_node(Node *e)
write_unlock(&entries_lock);
if (dentry) {
- dentry->d_inode->i_nlink--;
+ drop_nlink(dentry->d_inode);
d_drop(dentry);
dput(dentry);
simple_release_fs(&bm_mnt, &entry_count);
@@ -534,7 +515,7 @@ static void kill_node(Node *e)
static ssize_t
bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
{
- Node *e = file->f_path.dentry->d_inode->i_private;
+ Node *e = file_inode(file)->i_private;
ssize_t res;
char *page;
@@ -553,7 +534,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
struct dentry *root;
- Node *e = file->f_path.dentry->d_inode->i_private;
+ Node *e = file_inode(file)->i_private;
int res = parse_command(buffer, count);
switch (res) {
@@ -561,7 +542,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
break;
case 2: set_bit(Enabled, &e->flags);
break;
- case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
+ case 3: root = dget(file->f_path.dentry->d_sb->s_root);
mutex_lock(&root->d_inode->i_mutex);
kill_node(e);
@@ -588,7 +569,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
Node *e;
struct inode *inode;
struct dentry *root, *dentry;
- struct super_block *sb = file->f_path.mnt->mnt_sb;
+ struct super_block *sb = file->f_path.dentry->d_sb;
int err = 0;
e = create_entry(buffer, count);
@@ -667,7 +648,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
switch (res) {
case 1: enabled = 0; break;
case 2: enabled = 1; break;
- case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
+ case 3: root = dget(file->f_path.dentry->d_sb->s_root);
mutex_lock(&root->d_inode->i_mutex);
while (!list_empty(&entries))
@@ -675,6 +656,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
mutex_unlock(&root->d_inode->i_mutex);
dput(root);
+ break;
default: return res;
}
return count;
@@ -700,7 +682,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
[3] = {"register", &bm_register_operations, S_IWUSR},
/* last one */ {""}
};
- int err = simple_fill_super(sb, 0x42494e4d, bm_files);
+ int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
if (!err)
sb->s_op = &s_ops;
return err;
@@ -723,15 +705,13 @@ static struct file_system_type bm_fs_type = {
.mount = bm_mount,
.kill_sb = kill_litter_super,
};
+MODULE_ALIAS_FS("binfmt_misc");
static int __init init_misc_binfmt(void)
{
int err = register_filesystem(&bm_fs_type);
- if (!err) {
- err = insert_binfmt(&misc_format);
- if (err)
- unregister_filesystem(&bm_fs_type);
- }
+ if (!err)
+ insert_binfmt(&misc_format);
return err;
}
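binfmt_misc's private unquote() helper is dropped in favour of the generic string_unescape_inplace() from <linux/string_helpers.h>; with UNESCAPE_HEX it decodes the \xNN escapes used in the register string and returns the decoded length, which is what the magic/mask size comparison above relies on. A minimal sketch (the magic[] contents are just an example):

#include <linux/string_helpers.h>

	/* Sketch: decode "\x7fELF" written as a hex-escaped string. */
	char magic[] = "\\x7f\\x45\\x4c\\x46";
	int size;

	size = string_unescape_inplace(magic, UNESCAPE_HEX);
	/* size is now 4 and magic[] holds the raw bytes 7f 45 4c 46; a mask
	 * that unescapes to a different length is rejected by the caller. */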
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 396a9884591..5027a3e1492 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,7 +14,7 @@
#include <linux/err.h>
#include <linux/fs.h>
-static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_script(struct linux_binprm *bprm)
{
const char *i_arg, *i_name;
char *cp;
@@ -22,15 +22,13 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
char interp[BINPRM_BUF_SIZE];
int retval;
- if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') ||
- (bprm->recursion_depth > BINPRM_MAX_RECURSION))
+ if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
return -ENOEXEC;
/*
* This section does the #! interpretation.
* Sorta complicated, but hopefully it will work. -TYT
*/
- bprm->recursion_depth++;
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
@@ -82,7 +80,9 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
retval = copy_strings_kernel(1, &i_name, bprm);
if (retval) return retval;
bprm->argc++;
- bprm->interp = interp;
+ retval = bprm_change_interp(interp, bprm);
+ if (retval < 0)
+ return retval;
/*
* OK, now restart the process with the interpreter's dentry.
@@ -95,7 +95,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
retval = prepare_binprm(bprm);
if (retval < 0)
return retval;
- return search_binary_handler(bprm,regs);
+ return search_binary_handler(bprm);
}
static struct linux_binfmt script_format = {
@@ -105,7 +105,8 @@ static struct linux_binfmt script_format = {
static int __init init_script_binfmt(void)
{
- return register_binfmt(&script_format);
+ register_binfmt(&script_format);
+ return 0;
}
static void __exit exit_script_binfmt(void)
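Both binfmt_script and binfmt_misc stop assigning bprm->interp directly and go through bprm_change_interp(), which can fail (it makes its own copy), hence the new error checks; the per-loader recursion_depth bookkeeping also disappears from these loaders, the limit presumably being enforced centrally in search_binary_handler() in this series. A minimal sketch of the interpreter hand-off as these loaders now perform it (interp_path is hypothetical):

	/* Sketch: point the binprm at an interpreter and re-dispatch. */
	struct file *file;
	int retval;

	retval = bprm_change_interp(interp_path, bprm);	/* may return -ENOMEM */
	if (retval < 0)
		return retval;

	file = open_exec(interp_path);
	if (IS_ERR(file))
		return PTR_ERR(file);

	bprm->file = file;
	retval = prepare_binprm(bprm);		/* re-reads the first page */
	if (retval < 0)
		return retval;

	return search_binary_handler(bprm);	/* no pt_regs argument any more */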
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index cc8560f6c9b..4e00ed68d4a 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -35,7 +35,7 @@
#include <linux/elf.h>
-static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs);
+static int load_som_binary(struct linux_binprm * bprm);
static int load_som_library(struct file *);
/*
@@ -147,10 +147,8 @@ static int map_som_binary(struct file *file,
code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize);
current->mm->start_code = code_start;
current->mm->end_code = code_start + code_size;
- down_write(&current->mm->mmap_sem);
- retval = do_mmap(file, code_start, code_size, prot,
+ retval = vm_mmap(file, code_start, code_size, prot,
flags, SOM_PAGESTART(hpuxhdr->exec_tfile));
- up_write(&current->mm->mmap_sem);
if (retval < 0 && retval > -1024)
goto out;
@@ -158,20 +156,16 @@ static int map_som_binary(struct file *file,
data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize);
current->mm->start_data = data_start;
current->mm->end_data = bss_start = data_start + data_size;
- down_write(&current->mm->mmap_sem);
- retval = do_mmap(file, data_start, data_size,
+ retval = vm_mmap(file, data_start, data_size,
prot | PROT_WRITE, flags,
SOM_PAGESTART(hpuxhdr->exec_dfile));
- up_write(&current->mm->mmap_sem);
if (retval < 0 && retval > -1024)
goto out;
som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize);
current->mm->start_brk = current->mm->brk = som_brk;
- down_write(&current->mm->mmap_sem);
- retval = do_mmap(NULL, bss_start, som_brk - bss_start,
+ retval = vm_mmap(NULL, bss_start, som_brk - bss_start,
prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0);
- up_write(&current->mm->mmap_sem);
if (retval > 0 || retval < -1024)
retval = 0;
out:
@@ -186,13 +180,14 @@ out:
*/
static int
-load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+load_som_binary(struct linux_binprm * bprm)
{
int retval;
unsigned int size;
unsigned long som_entry;
struct som_hdr *som_ex;
struct som_exec_auxhdr *hpuxhdr;
+ struct pt_regs *regs = current_pt_regs();
/* Get the exec-header */
som_ex = (struct som_hdr *) bprm->buf;
@@ -225,7 +220,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
goto out_free;
/* OK, This is the point of no return */
- current->flags &= ~PF_FORKNOEXEC;
current->personality = PER_HPUX;
setup_new_exec(bprm);
@@ -289,7 +283,8 @@ static int load_som_library(struct file *f)
static int __init init_som_binfmt(void)
{
- return register_binfmt(&som_format);
+ register_binfmt(&som_format);
+ return 0;
}
static void __exit exit_som_binfmt(void)
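The som loader gets the same locking cleanup applied to binfmt_elf_fdpic and binfmt_flat above: every open-coded down_write(&mm->mmap_sem) / do_mmap() / up_write() sequence becomes a single vm_mmap() call (and do_munmap() under the lock becomes vm_munmap()), the helpers taking mmap_sem themselves. A minimal before/after sketch (file, len and addr are placeholders):

	/* Old pattern: the caller juggles mmap_sem around do_mmap()/do_munmap().
	 *
	 *	down_write(&current->mm->mmap_sem);
	 *	addr = do_mmap(file, 0, len, PROT_READ | PROT_EXEC,
	 *		       MAP_PRIVATE | MAP_EXECUTABLE, 0);
	 *	up_write(&current->mm->mmap_sem);
	 */

	/* New pattern: vm_mmap()/vm_munmap() take the lock internally. */
	addr = vm_mmap(file, 0, len, PROT_READ | PROT_EXEC,
		       MAP_PRIVATE | MAP_EXECUTABLE, 0);
	if (IS_ERR_VALUE(addr))
		return (int)addr;

	/* ... use the mapping ... */

	vm_munmap(addr, len);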
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
deleted file mode 100644
index e49cce234c6..00000000000
--- a/fs/bio-integrity.c
+++ /dev/null
@@ -1,804 +0,0 @@
-/*
- * bio-integrity.c - bio data integrity extensions
- *
- * Copyright (C) 2007, 2008, 2009 Oracle Corporation
- * Written by: Martin K. Petersen <martin.petersen@oracle.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
- */
-
-#include <linux/blkdev.h>
-#include <linux/mempool.h>
-#include <linux/bio.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-
-struct integrity_slab {
- struct kmem_cache *slab;
- unsigned short nr_vecs;
- char name[8];
-};
-
-#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
-struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
- IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
-};
-#undef IS
-
-static struct workqueue_struct *kintegrityd_wq;
-
-static inline unsigned int vecs_to_idx(unsigned int nr)
-{
- switch (nr) {
- case 1:
- return 0;
- case 2 ... 4:
- return 1;
- case 5 ... 16:
- return 2;
- case 17 ... 64:
- return 3;
- case 65 ... 128:
- return 4;
- case 129 ... BIO_MAX_PAGES:
- return 5;
- default:
- BUG();
- }
-}
-
-static inline int use_bip_pool(unsigned int idx)
-{
- if (idx == BIOVEC_MAX_IDX)
- return 1;
-
- return 0;
-}
-
-/**
- * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
- * @bio: bio to attach integrity metadata to
- * @gfp_mask: Memory allocation mask
- * @nr_vecs: Number of integrity metadata scatter-gather elements
- * @bs: bio_set to allocate from
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata. nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
- gfp_t gfp_mask,
- unsigned int nr_vecs,
- struct bio_set *bs)
-{
- struct bio_integrity_payload *bip;
- unsigned int idx = vecs_to_idx(nr_vecs);
-
- BUG_ON(bio == NULL);
- bip = NULL;
-
- /* Lower order allocations come straight from slab */
- if (!use_bip_pool(idx))
- bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
-
- /* Use mempool if lower order alloc failed or max vecs were requested */
- if (bip == NULL) {
- idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */
- bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
-
- if (unlikely(bip == NULL)) {
- printk(KERN_ERR "%s: could not alloc bip\n", __func__);
- return NULL;
- }
- }
-
- memset(bip, 0, sizeof(*bip));
-
- bip->bip_slab = idx;
- bip->bip_bio = bio;
- bio->bi_integrity = bip;
-
- return bip;
-}
-EXPORT_SYMBOL(bio_integrity_alloc_bioset);
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio: bio to attach integrity metadata to
- * @gfp_mask: Memory allocation mask
- * @nr_vecs: Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata. nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
- gfp_t gfp_mask,
- unsigned int nr_vecs)
-{
- return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
-}
-EXPORT_SYMBOL(bio_integrity_alloc);
-
-/**
- * bio_integrity_free - Free bio integrity payload
- * @bio: bio containing bip to be freed
- * @bs: bio_set this bio was allocated from
- *
- * Description: Used to free the integrity portion of a bio. Usually
- * called from bio_free().
- */
-void bio_integrity_free(struct bio *bio, struct bio_set *bs)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
-
- BUG_ON(bip == NULL);
-
- /* A cloned bio doesn't own the integrity metadata */
- if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
- && bip->bip_buf != NULL)
- kfree(bip->bip_buf);
-
- if (use_bip_pool(bip->bip_slab))
- mempool_free(bip, bs->bio_integrity_pool);
- else
- kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
-
- bio->bi_integrity = NULL;
-}
-EXPORT_SYMBOL(bio_integrity_free);
-
-/**
- * bio_integrity_add_page - Attach integrity metadata
- * @bio: bio to update
- * @page: page containing integrity metadata
- * @len: number of bytes of integrity metadata in page
- * @offset: start offset within page
- *
- * Description: Attach a page containing integrity metadata to bio.
- */
-int bio_integrity_add_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct bio_vec *iv;
-
- if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
- printk(KERN_ERR "%s: bip_vec full\n", __func__);
- return 0;
- }
-
- iv = bip_vec_idx(bip, bip->bip_vcnt);
- BUG_ON(iv == NULL);
-
- iv->bv_page = page;
- iv->bv_len = len;
- iv->bv_offset = offset;
- bip->bip_vcnt++;
-
- return len;
-}
-EXPORT_SYMBOL(bio_integrity_add_page);
-
-static int bdev_integrity_enabled(struct block_device *bdev, int rw)
-{
- struct blk_integrity *bi = bdev_get_integrity(bdev);
-
- if (bi == NULL)
- return 0;
-
- if (rw == READ && bi->verify_fn != NULL &&
- (bi->flags & INTEGRITY_FLAG_READ))
- return 1;
-
- if (rw == WRITE && bi->generate_fn != NULL &&
- (bi->flags & INTEGRITY_FLAG_WRITE))
- return 1;
-
- return 0;
-}
-
-/**
- * bio_integrity_enabled - Check whether integrity can be passed
- * @bio: bio to check
- *
- * Description: Determines whether bio_integrity_prep() can be called
- * on this bio or not. bio data direction and target device must be
- * set prior to calling. The functions honors the write_generate and
- * read_verify flags in sysfs.
- */
-int bio_integrity_enabled(struct bio *bio)
-{
- /* Already protected? */
- if (bio_integrity(bio))
- return 0;
-
- return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
-}
-EXPORT_SYMBOL(bio_integrity_enabled);
-
-/**
- * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
- * @bi: blk_integrity profile for device
- * @sectors: Number of 512 sectors to convert
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the hardware
- * sector size of the storage device. Convert the block layer sectors
- * to physical sectors.
- */
-static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
- unsigned int sectors)
-{
- /* At this point there are only 512b or 4096b DIF/EPP devices */
- if (bi->sector_size == 4096)
- return sectors >>= 3;
-
- return sectors;
-}
-
-/**
- * bio_integrity_tag_size - Retrieve integrity tag space
- * @bio: bio to inspect
- *
- * Description: Returns the maximum number of tag bytes that can be
- * attached to this bio. Filesystems can use this to determine how
- * much metadata to attach to an I/O.
- */
-unsigned int bio_integrity_tag_size(struct bio *bio)
-{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-
- BUG_ON(bio->bi_size == 0);
-
- return bi->tag_size * (bio->bi_size / bi->sector_size);
-}
-EXPORT_SYMBOL(bio_integrity_tag_size);
-
-int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- unsigned int nr_sectors;
-
- BUG_ON(bip->bip_buf == NULL);
-
- if (bi->tag_size == 0)
- return -1;
-
- nr_sectors = bio_integrity_hw_sectors(bi,
- DIV_ROUND_UP(len, bi->tag_size));
-
- if (nr_sectors * bi->tuple_size > bip->bip_size) {
- printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
- __func__, nr_sectors * bi->tuple_size, bip->bip_size);
- return -1;
- }
-
- if (set)
- bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
- else
- bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
-
- return 0;
-}
-
-/**
- * bio_integrity_set_tag - Attach a tag buffer to a bio
- * @bio: bio to attach buffer to
- * @tag_buf: Pointer to a buffer containing tag data
- * @len: Length of the included buffer
- *
- * Description: Use this function to tag a bio by leveraging the extra
- * space provided by devices formatted with integrity protection. The
- * size of the integrity buffer must be <= to the size reported by
- * bio_integrity_tag_size().
- */
-int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
- BUG_ON(bio_data_dir(bio) != WRITE);
-
- return bio_integrity_tag(bio, tag_buf, len, 1);
-}
-EXPORT_SYMBOL(bio_integrity_set_tag);
-
-/**
- * bio_integrity_get_tag - Retrieve a tag buffer from a bio
- * @bio: bio to retrieve buffer from
- * @tag_buf: Pointer to a buffer for the tag data
- * @len: Length of the target buffer
- *
- * Description: Use this function to retrieve the tag buffer from a
- * completed I/O. The size of the integrity buffer must be <= to the
- * size reported by bio_integrity_tag_size().
- */
-int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
- BUG_ON(bio_data_dir(bio) != READ);
-
- return bio_integrity_tag(bio, tag_buf, len, 0);
-}
-EXPORT_SYMBOL(bio_integrity_get_tag);
-
-/**
- * bio_integrity_generate - Generate integrity metadata for a bio
- * @bio: bio to generate integrity metadata for
- *
- * Description: Generates integrity metadata for a bio by calling the
- * block device's generation callback function. The bio must have a
- * bip attached with enough room to accommodate the generated
- * integrity metadata.
- */
-static void bio_integrity_generate(struct bio *bio)
-{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- struct blk_integrity_exchg bix;
- struct bio_vec *bv;
- sector_t sector = bio->bi_sector;
- unsigned int i, sectors, total;
- void *prot_buf = bio->bi_integrity->bip_buf;
-
- total = 0;
- bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
- bix.sector_size = bi->sector_size;
-
- bio_for_each_segment(bv, bio, i) {
- void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
- bix.data_buf = kaddr + bv->bv_offset;
- bix.data_size = bv->bv_len;
- bix.prot_buf = prot_buf;
- bix.sector = sector;
-
- bi->generate_fn(&bix);
-
- sectors = bv->bv_len / bi->sector_size;
- sector += sectors;
- prot_buf += sectors * bi->tuple_size;
- total += sectors * bi->tuple_size;
- BUG_ON(total > bio->bi_integrity->bip_size);
-
- kunmap_atomic(kaddr, KM_USER0);
- }
-}
-
-static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
-{
- if (bi)
- return bi->tuple_size;
-
- return 0;
-}
-
-/**
- * bio_integrity_prep - Prepare bio for integrity I/O
- * @bio: bio to prepare
- *
- * Description: Allocates a buffer for integrity metadata, maps the
- * pages and attaches them to a bio. The bio must have data
- * direction, target device and start sector set prior to calling. In
- * the WRITE case, integrity metadata will be generated using the
- * block device's integrity function. In the READ case, the buffer
- * will be prepared for DMA and a suitable end_io handler set up.
- */
-int bio_integrity_prep(struct bio *bio)
-{
- struct bio_integrity_payload *bip;
- struct blk_integrity *bi;
- struct request_queue *q;
- void *buf;
- unsigned long start, end;
- unsigned int len, nr_pages;
- unsigned int bytes, offset, i;
- unsigned int sectors;
-
- bi = bdev_get_integrity(bio->bi_bdev);
- q = bdev_get_queue(bio->bi_bdev);
- BUG_ON(bi == NULL);
- BUG_ON(bio_integrity(bio));
-
- sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
-
- /* Allocate kernel buffer for protection data */
- len = sectors * blk_integrity_tuple_size(bi);
- buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
- if (unlikely(buf == NULL)) {
- printk(KERN_ERR "could not allocate integrity buffer\n");
- return -ENOMEM;
- }
-
- end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ((unsigned long) buf) >> PAGE_SHIFT;
- nr_pages = end - start;
-
- /* Allocate bio integrity payload and integrity vectors */
- bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
- if (unlikely(bip == NULL)) {
- printk(KERN_ERR "could not allocate data integrity bioset\n");
- kfree(buf);
- return -EIO;
- }
-
- bip->bip_buf = buf;
- bip->bip_size = len;
- bip->bip_sector = bio->bi_sector;
-
- /* Map it */
- offset = offset_in_page(buf);
- for (i = 0 ; i < nr_pages ; i++) {
- int ret;
- bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- ret = bio_integrity_add_page(bio, virt_to_page(buf),
- bytes, offset);
-
- if (ret == 0)
- return 0;
-
- if (ret < bytes)
- break;
-
- buf += bytes;
- len -= bytes;
- offset = 0;
- }
-
- /* Install custom I/O completion handler if read verify is enabled */
- if (bio_data_dir(bio) == READ) {
- bip->bip_end_io = bio->bi_end_io;
- bio->bi_end_io = bio_integrity_endio;
- }
-
- /* Auto-generate integrity metadata if this is a write */
- if (bio_data_dir(bio) == WRITE)
- bio_integrity_generate(bio);
-
- return 0;
-}
-EXPORT_SYMBOL(bio_integrity_prep);
-
-/**
- * bio_integrity_verify - Verify integrity metadata for a bio
- * @bio: bio to verify
- *
- * Description: This function is called to verify the integrity of a
- * bio. The data in the bio io_vec is compared to the integrity
- * metadata returned by the HBA.
- */
-static int bio_integrity_verify(struct bio *bio)
-{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- struct blk_integrity_exchg bix;
- struct bio_vec *bv;
- sector_t sector = bio->bi_integrity->bip_sector;
- unsigned int i, sectors, total, ret;
- void *prot_buf = bio->bi_integrity->bip_buf;
-
- ret = total = 0;
- bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
- bix.sector_size = bi->sector_size;
-
- bio_for_each_segment(bv, bio, i) {
- void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
- bix.data_buf = kaddr + bv->bv_offset;
- bix.data_size = bv->bv_len;
- bix.prot_buf = prot_buf;
- bix.sector = sector;
-
- ret = bi->verify_fn(&bix);
-
- if (ret) {
- kunmap_atomic(kaddr, KM_USER0);
- return ret;
- }
-
- sectors = bv->bv_len / bi->sector_size;
- sector += sectors;
- prot_buf += sectors * bi->tuple_size;
- total += sectors * bi->tuple_size;
- BUG_ON(total > bio->bi_integrity->bip_size);
-
- kunmap_atomic(kaddr, KM_USER0);
- }
-
- return ret;
-}
-
-/**
- * bio_integrity_verify_fn - Integrity I/O completion worker
- * @work: Work struct stored in bio to be verified
- *
- * Description: This workqueue function is called to complete a READ
- * request. The function verifies the transferred integrity metadata
- * and then calls the original bio end_io function.
- */
-static void bio_integrity_verify_fn(struct work_struct *work)
-{
- struct bio_integrity_payload *bip =
- container_of(work, struct bio_integrity_payload, bip_work);
- struct bio *bio = bip->bip_bio;
- int error;
-
- error = bio_integrity_verify(bio);
-
- /* Restore original bio completion handler */
- bio->bi_end_io = bip->bip_end_io;
- bio_endio(bio, error);
-}
-
-/**
- * bio_integrity_endio - Integrity I/O completion function
- * @bio: Protected bio
- * @error: Pointer to errno
- *
- * Description: Completion for integrity I/O
- *
- * Normally I/O completion is done in interrupt context. However,
- * verifying I/O integrity is a time-consuming task which must be run
- * in process context. This function postpones completion
- * accordingly.
- */
-void bio_integrity_endio(struct bio *bio, int error)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
-
- BUG_ON(bip->bip_bio != bio);
-
- /* In case of an I/O error there is no point in verifying the
- * integrity metadata. Restore original bio end_io handler
- * and run it.
- */
- if (error) {
- bio->bi_end_io = bip->bip_end_io;
- bio_endio(bio, error);
-
- return;
- }
-
- INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
- queue_work(kintegrityd_wq, &bip->bip_work);
-}
-EXPORT_SYMBOL(bio_integrity_endio);
-
-/**
- * bio_integrity_mark_head - Advance bip_vec skip bytes
- * @bip: Integrity vector to advance
- * @skip: Number of bytes to advance it
- */
-void bio_integrity_mark_head(struct bio_integrity_payload *bip,
- unsigned int skip)
-{
- struct bio_vec *iv;
- unsigned int i;
-
- bip_for_each_vec(iv, bip, i) {
- if (skip == 0) {
- bip->bip_idx = i;
- return;
- } else if (skip >= iv->bv_len) {
- skip -= iv->bv_len;
- } else { /* skip < iv->bv_len) */
- iv->bv_offset += skip;
- iv->bv_len -= skip;
- bip->bip_idx = i;
- return;
- }
- }
-}
-
-/**
- * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
- * @bip: Integrity vector to truncate
- * @len: New length of integrity vector
- */
-void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
- unsigned int len)
-{
- struct bio_vec *iv;
- unsigned int i;
-
- bip_for_each_vec(iv, bip, i) {
- if (len == 0) {
- bip->bip_vcnt = i;
- return;
- } else if (len >= iv->bv_len) {
- len -= iv->bv_len;
- } else { /* len < iv->bv_len) */
- iv->bv_len = len;
- len = 0;
- }
- }
-}
-
-/**
- * bio_integrity_advance - Advance integrity vector
- * @bio: bio whose integrity vector to update
- * @bytes_done: number of data bytes that have been completed
- *
- * Description: This function calculates how many integrity bytes the
- * number of completed data bytes correspond to and advances the
- * integrity vector accordingly.
- */
-void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- unsigned int nr_sectors;
-
- BUG_ON(bip == NULL);
- BUG_ON(bi == NULL);
-
- nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
- bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
-}
-EXPORT_SYMBOL(bio_integrity_advance);
-
-/**
- * bio_integrity_trim - Trim integrity vector
- * @bio: bio whose integrity vector to update
- * @offset: offset to first data sector
- * @sectors: number of data sectors
- *
- * Description: Used to trim the integrity vector in a cloned bio.
- * The ivec will be advanced corresponding to 'offset' data sectors
- * and the length will be truncated corresponding to 'len' data
- * sectors.
- */
-void bio_integrity_trim(struct bio *bio, unsigned int offset,
- unsigned int sectors)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- unsigned int nr_sectors;
-
- BUG_ON(bip == NULL);
- BUG_ON(bi == NULL);
- BUG_ON(!bio_flagged(bio, BIO_CLONED));
-
- nr_sectors = bio_integrity_hw_sectors(bi, sectors);
- bip->bip_sector = bip->bip_sector + offset;
- bio_integrity_mark_head(bip, offset * bi->tuple_size);
- bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
-}
-EXPORT_SYMBOL(bio_integrity_trim);
-
-/**
- * bio_integrity_split - Split integrity metadata
- * @bio: Protected bio
- * @bp: Resulting bio_pair
- * @sectors: Offset
- *
- * Description: Splits an integrity page into a bio_pair.
- */
-void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
-{
- struct blk_integrity *bi;
- struct bio_integrity_payload *bip = bio->bi_integrity;
- unsigned int nr_sectors;
-
- if (bio_integrity(bio) == 0)
- return;
-
- bi = bdev_get_integrity(bio->bi_bdev);
- BUG_ON(bi == NULL);
- BUG_ON(bip->bip_vcnt != 1);
-
- nr_sectors = bio_integrity_hw_sectors(bi, sectors);
-
- bp->bio1.bi_integrity = &bp->bip1;
- bp->bio2.bi_integrity = &bp->bip2;
-
- bp->iv1 = bip->bip_vec[0];
- bp->iv2 = bip->bip_vec[0];
-
- bp->bip1.bip_vec[0] = bp->iv1;
- bp->bip2.bip_vec[0] = bp->iv2;
-
- bp->iv1.bv_len = sectors * bi->tuple_size;
- bp->iv2.bv_offset += sectors * bi->tuple_size;
- bp->iv2.bv_len -= sectors * bi->tuple_size;
-
- bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
- bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
-
- bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
- bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
-}
-EXPORT_SYMBOL(bio_integrity_split);
-
-/**
- * bio_integrity_clone - Callback for cloning bios with integrity metadata
- * @bio: New bio
- * @bio_src: Original bio
- * @gfp_mask: Memory allocation mask
- * @bs: bio_set to allocate bip from
- *
- * Description: Called to allocate a bip when cloning a bio
- */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
- gfp_t gfp_mask, struct bio_set *bs)
-{
- struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
- struct bio_integrity_payload *bip;
-
- BUG_ON(bip_src == NULL);
-
- bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
-
- if (bip == NULL)
- return -EIO;
-
- memcpy(bip->bip_vec, bip_src->bip_vec,
- bip_src->bip_vcnt * sizeof(struct bio_vec));
-
- bip->bip_sector = bip_src->bip_sector;
- bip->bip_vcnt = bip_src->bip_vcnt;
- bip->bip_idx = bip_src->bip_idx;
-
- return 0;
-}
-EXPORT_SYMBOL(bio_integrity_clone);
-
-int bioset_integrity_create(struct bio_set *bs, int pool_size)
-{
- unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
-
- bs->bio_integrity_pool =
- mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
-
- if (!bs->bio_integrity_pool)
- return -1;
-
- return 0;
-}
-EXPORT_SYMBOL(bioset_integrity_create);
-
-void bioset_integrity_free(struct bio_set *bs)
-{
- if (bs->bio_integrity_pool)
- mempool_destroy(bs->bio_integrity_pool);
-}
-EXPORT_SYMBOL(bioset_integrity_free);
-
-void __init bio_integrity_init(void)
-{
- unsigned int i;
-
- /*
- * kintegrityd won't block much but may burn a lot of CPU cycles.
- * Make it highpri CPU intensive wq with max concurrency of 1.
- */
- kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
- WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
- if (!kintegrityd_wq)
- panic("Failed to create kintegrityd\n");
-
- for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) {
- unsigned int size;
-
- size = sizeof(struct bio_integrity_payload)
- + bip_slab[i].nr_vecs * sizeof(struct bio_vec);
-
- bip_slab[i].slab =
- kmem_cache_create(bip_slab[i].name, size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- }
-}
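The whole of fs/bio-integrity.c is removed from fs/ by this series; the hunk shows only the deletion, so the code is presumably relocated out of fs/ rather than dropped, with the API left intact. Going by the kernel-doc comments above, the submitter-side call pattern is roughly the sketch below (rw and bio stand for an already prepared request; the error handling is illustrative only):

	/* Sketch of the usage described in the comments above: generate or
	 * verify protection information around an ordinary bio. */
	if (bio_integrity_enabled(bio)) {
		/* Allocates the protection buffer; for WRITEs it also runs the
		 * device's generate_fn, for READs it hooks bi_end_io so the
		 * data is verified in process context on completion. */
		if (bio_integrity_prep(bio))
			return -EIO;	/* could not attach the integrity payload */
	}
	submit_bio(rw, bio);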
diff --git a/fs/bio.c b/fs/bio.c
deleted file mode 100644
index 4cf2a52fbc5..00000000000
--- a/fs/bio.c
+++ /dev/null
@@ -1,1694 +0,0 @@
-/*
- * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public Licens
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
- *
- */
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mempool.h>
-#include <linux/workqueue.h>
-#include <scsi/sg.h> /* for struct sg_iovec */
-
-#include <trace/events/block.h>
-
-/*
- * Test patch to inline a certain number of bi_io_vec's inside the bio
- * itself, to shrink a bio data allocation from two mempool calls to one
- */
-#define BIO_INLINE_VECS 4
-
-static mempool_t *bio_split_pool __read_mostly;
-
-/*
- * if you change this list, also change bvec_alloc or things will
- * break badly! cannot be bigger than what you can fit into an
- * unsigned short
- */
-#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
- BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
-};
-#undef BV
-
-/*
- * fs_bio_set is the bio_set containing bio and iovec memory pools used by
- * IO code that does not need private memory pools.
- */
-struct bio_set *fs_bio_set;
-
-/*
- * Our slab pool management
- */
-struct bio_slab {
- struct kmem_cache *slab;
- unsigned int slab_ref;
- unsigned int slab_size;
- char name[8];
-};
-static DEFINE_MUTEX(bio_slab_lock);
-static struct bio_slab *bio_slabs;
-static unsigned int bio_slab_nr, bio_slab_max;
-
-static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
-{
- unsigned int sz = sizeof(struct bio) + extra_size;
- struct kmem_cache *slab = NULL;
- struct bio_slab *bslab;
- unsigned int i, entry = -1;
-
- mutex_lock(&bio_slab_lock);
-
- i = 0;
- while (i < bio_slab_nr) {
- bslab = &bio_slabs[i];
-
- if (!bslab->slab && entry == -1)
- entry = i;
- else if (bslab->slab_size == sz) {
- slab = bslab->slab;
- bslab->slab_ref++;
- break;
- }
- i++;
- }
-
- if (slab)
- goto out_unlock;
-
- if (bio_slab_nr == bio_slab_max && entry == -1) {
- bio_slab_max <<= 1;
- bio_slabs = krealloc(bio_slabs,
- bio_slab_max * sizeof(struct bio_slab),
- GFP_KERNEL);
- if (!bio_slabs)
- goto out_unlock;
- }
- if (entry == -1)
- entry = bio_slab_nr++;
-
- bslab = &bio_slabs[entry];
-
- snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
- slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!slab)
- goto out_unlock;
-
- printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
- bslab->slab = slab;
- bslab->slab_ref = 1;
- bslab->slab_size = sz;
-out_unlock:
- mutex_unlock(&bio_slab_lock);
- return slab;
-}
-
-static void bio_put_slab(struct bio_set *bs)
-{
- struct bio_slab *bslab = NULL;
- unsigned int i;
-
- mutex_lock(&bio_slab_lock);
-
- for (i = 0; i < bio_slab_nr; i++) {
- if (bs->bio_slab == bio_slabs[i].slab) {
- bslab = &bio_slabs[i];
- break;
- }
- }
-
- if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
- goto out;
-
- WARN_ON(!bslab->slab_ref);
-
- if (--bslab->slab_ref)
- goto out;
-
- kmem_cache_destroy(bslab->slab);
- bslab->slab = NULL;
-
-out:
- mutex_unlock(&bio_slab_lock);
-}
-
-unsigned int bvec_nr_vecs(unsigned short idx)
-{
- return bvec_slabs[idx].nr_vecs;
-}
-
-void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
-{
- BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
-
- if (idx == BIOVEC_MAX_IDX)
- mempool_free(bv, bs->bvec_pool);
- else {
- struct biovec_slab *bvs = bvec_slabs + idx;
-
- kmem_cache_free(bvs->slab, bv);
- }
-}
-
-struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
- struct bio_set *bs)
-{
- struct bio_vec *bvl;
-
- /*
- * see comment near bvec_array define!
- */
- switch (nr) {
- case 1:
- *idx = 0;
- break;
- case 2 ... 4:
- *idx = 1;
- break;
- case 5 ... 16:
- *idx = 2;
- break;
- case 17 ... 64:
- *idx = 3;
- break;
- case 65 ... 128:
- *idx = 4;
- break;
- case 129 ... BIO_MAX_PAGES:
- *idx = 5;
- break;
- default:
- return NULL;
- }
-
- /*
- * idx now points to the pool we want to allocate from. only the
- * 1-vec entry pool is mempool backed.
- */
- if (*idx == BIOVEC_MAX_IDX) {
-fallback:
- bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
- } else {
- struct biovec_slab *bvs = bvec_slabs + *idx;
- gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
-
- /*
- * Make this allocation restricted and don't dump info on
- * allocation failures, since we'll fallback to the mempool
- * in case of failure.
- */
- __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
-
- /*
- * Try a slab allocation. If this fails and __GFP_WAIT
- * is set, retry with the 1-entry mempool
- */
- bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
- if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
- *idx = BIOVEC_MAX_IDX;
- goto fallback;
- }
- }
-
- return bvl;
-}
-
-void bio_free(struct bio *bio, struct bio_set *bs)
-{
- void *p;
-
- if (bio_has_allocated_vec(bio))
- bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
-
- if (bio_integrity(bio))
- bio_integrity_free(bio, bs);
-
- /*
- * If we have front padding, adjust the bio pointer before freeing
- */
- p = bio;
- if (bs->front_pad)
- p -= bs->front_pad;
-
- mempool_free(p, bs->bio_pool);
-}
-EXPORT_SYMBOL(bio_free);
-
-void bio_init(struct bio *bio)
-{
- memset(bio, 0, sizeof(*bio));
- bio->bi_flags = 1 << BIO_UPTODATE;
- bio->bi_comp_cpu = -1;
- atomic_set(&bio->bi_cnt, 1);
-}
-EXPORT_SYMBOL(bio_init);
-
-/**
- * bio_alloc_bioset - allocate a bio for I/O
- * @gfp_mask: the GFP_ mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
- * @bs: the bio_set to allocate from.
- *
- * Description:
- * bio_alloc_bioset will try its own mempool to satisfy the allocation.
- * If %__GFP_WAIT is set then we will block on the internal pool waiting
- * for a &struct bio to become free.
- *
- * Note that the caller must set ->bi_destructor on successful return
- * of a bio, to do the appropriate freeing of the bio once the reference
- * count drops to zero.
- **/
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
-{
- unsigned long idx = BIO_POOL_NONE;
- struct bio_vec *bvl = NULL;
- struct bio *bio;
- void *p;
-
- p = mempool_alloc(bs->bio_pool, gfp_mask);
- if (unlikely(!p))
- return NULL;
- bio = p + bs->front_pad;
-
- bio_init(bio);
-
- if (unlikely(!nr_iovecs))
- goto out_set;
-
- if (nr_iovecs <= BIO_INLINE_VECS) {
- bvl = bio->bi_inline_vecs;
- nr_iovecs = BIO_INLINE_VECS;
- } else {
- bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
- if (unlikely(!bvl))
- goto err_free;
-
- nr_iovecs = bvec_nr_vecs(idx);
- }
-out_set:
- bio->bi_flags |= idx << BIO_POOL_OFFSET;
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bvl;
- return bio;
-
-err_free:
- mempool_free(p, bs->bio_pool);
- return NULL;
-}
-EXPORT_SYMBOL(bio_alloc_bioset);
-
-static void bio_fs_destructor(struct bio *bio)
-{
- bio_free(bio, fs_bio_set);
-}
-
-/**
- * bio_alloc - allocate a new bio, memory pool backed
- * @gfp_mask: allocation mask to use
- * @nr_iovecs: number of iovecs
- *
- * bio_alloc will allocate a bio and associated bio_vec array that can hold
- * at least @nr_iovecs entries. Allocations will be done from the
- * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
- *
- * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
- * a bio. This is due to the mempool guarantees. To make this work, callers
- * must never allocate more than 1 bio at a time from this pool. Callers
- * that need to allocate more than 1 bio must always submit the previously
- * allocated bio for IO before attempting to allocate a new one. Failure to
- * do so can cause livelocks under memory pressure.
- *
- * RETURNS:
- * Pointer to new bio on success, NULL on failure.
- */
-struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
-{
- struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
-
- if (bio)
- bio->bi_destructor = bio_fs_destructor;
-
- return bio;
-}
-EXPORT_SYMBOL(bio_alloc);
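
A minimal sketch of the allocation rule spelled out in the comment above: a caller that relies on the fs_bio_set mempool guarantee submits each bio before allocating the next one. This is illustrative only and not part of this patch; the function and completion handler names are hypothetical.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Completion handler for the sketch below: drop the last reference. */
static void example_end_io(struct bio *bio, int error)
{
	bio_put(bio);
}

/* Hypothetical: write @nr single-page extents, one bio in flight per loop. */
static void example_write_pages(struct block_device *bdev, struct page **pages,
				int nr, sector_t sector)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* GFP_NOIO includes __GFP_WAIT, so this cannot fail */
		struct bio *bio = bio_alloc(GFP_NOIO, 1);

		bio->bi_bdev = bdev;
		bio->bi_sector = sector + (i << (PAGE_SHIFT - 9));
		bio->bi_end_io = example_end_io;
		bio_add_page(bio, pages[i], PAGE_SIZE, 0);

		/* submit before the next bio_alloc() to honour the rule above */
		submit_bio(WRITE, bio);
	}
}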
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
- if (bio_integrity(bio))
- bio_integrity_free(bio, fs_bio_set);
- kfree(bio);
-}
-
-/**
- * bio_kmalloc - allocate a bio for I/O using kmalloc()
- * @gfp_mask: the GFP_ mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
- *
- * Description:
- * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
- * %__GFP_WAIT, the allocation is guaranteed to succeed.
- *
- **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
-{
- struct bio *bio;
-
- if (nr_iovecs > UIO_MAXIOV)
- return NULL;
-
- bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
- if (unlikely(!bio))
- return NULL;
-
- bio_init(bio);
- bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bio->bi_inline_vecs;
- bio->bi_destructor = bio_kmalloc_destructor;
-
- return bio;
-}
-EXPORT_SYMBOL(bio_kmalloc);
-
-void zero_fill_bio(struct bio *bio)
-{
- unsigned long flags;
- struct bio_vec *bv;
- int i;
-
- bio_for_each_segment(bv, bio, i) {
- char *data = bvec_kmap_irq(bv, &flags);
- memset(data, 0, bv->bv_len);
- flush_dcache_page(bv->bv_page);
- bvec_kunmap_irq(data, &flags);
- }
-}
-EXPORT_SYMBOL(zero_fill_bio);
-
-/**
- * bio_put - release a reference to a bio
- * @bio: bio to release reference to
- *
- * Description:
- * Put a reference to a &struct bio, either one you have gotten with
- * bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
- **/
-void bio_put(struct bio *bio)
-{
- BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
-
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->bi_cnt)) {
- bio->bi_next = NULL;
- bio->bi_destructor(bio);
- }
-}
-EXPORT_SYMBOL(bio_put);
-
-inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
-{
- if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
- blk_recount_segments(q, bio);
-
- return bio->bi_phys_segments;
-}
-EXPORT_SYMBOL(bio_phys_segments);
-
-/**
- * __bio_clone - clone a bio
- * @bio: destination bio
- * @bio_src: bio to clone
- *
- * Clone a &bio. Caller will own the returned bio, but not
- * the actual data it points to. Reference count of returned
- * bio will be one.
- */
-void __bio_clone(struct bio *bio, struct bio *bio_src)
-{
- memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
- bio_src->bi_max_vecs * sizeof(struct bio_vec));
-
- /*
- * most users will be overriding ->bi_bdev with a new target,
- * so we don't set nor calculate new physical/hw segment counts here
- */
- bio->bi_sector = bio_src->bi_sector;
- bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_flags |= 1 << BIO_CLONED;
- bio->bi_rw = bio_src->bi_rw;
- bio->bi_vcnt = bio_src->bi_vcnt;
- bio->bi_size = bio_src->bi_size;
- bio->bi_idx = bio_src->bi_idx;
-}
-EXPORT_SYMBOL(__bio_clone);
-
-/**
- * bio_clone - clone a bio
- * @bio: bio to clone
- * @gfp_mask: allocation priority
- *
- * Like __bio_clone, only also allocates the returned bio
- */
-struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
-{
- struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
-
- if (!b)
- return NULL;
-
- b->bi_destructor = bio_fs_destructor;
- __bio_clone(b, bio);
-
- if (bio_integrity(bio)) {
- int ret;
-
- ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
-
- if (ret < 0) {
- bio_put(b);
- return NULL;
- }
- }
-
- return b;
-}
-EXPORT_SYMBOL(bio_clone);
-
-/**
- * bio_get_nr_vecs - return approx number of vecs
- * @bdev: I/O target
- *
- * Return the approximate number of pages we can send to this target.
- * There's no guarantee that you will be able to fit this number of pages
- * into a bio, as it does not account for dynamic restrictions that vary
- * with the offset.
- */
-int bio_get_nr_vecs(struct block_device *bdev)
-{
- struct request_queue *q = bdev_get_queue(bdev);
- int nr_pages;
-
- nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (nr_pages > queue_max_segments(q))
- nr_pages = queue_max_segments(q);
-
- return nr_pages;
-}
-EXPORT_SYMBOL(bio_get_nr_vecs);
-
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
- *page, unsigned int len, unsigned int offset,
- unsigned short max_sectors)
-{
- int retried_segments = 0;
- struct bio_vec *bvec;
-
- /*
- * cloned bio must not modify vec list
- */
- if (unlikely(bio_flagged(bio, BIO_CLONED)))
- return 0;
-
- if (((bio->bi_size + len) >> 9) > max_sectors)
- return 0;
-
- /*
- * For filesystems with a blocksize smaller than the pagesize
- * we will often be called with the same page as last time and
- * a consecutive offset. Optimize this special case.
- */
- if (bio->bi_vcnt > 0) {
- struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (page == prev->bv_page &&
- offset == prev->bv_offset + prev->bv_len) {
- unsigned int prev_bv_len = prev->bv_len;
- prev->bv_len += len;
-
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- /* prev_bvec is already charged in
- bi_size, discharge it in order to
- simulate merging updated prev_bvec
- as new bvec. */
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_sector,
- .bi_size = bio->bi_size - prev_bv_len,
- .bi_rw = bio->bi_rw,
- };
-
- if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
- prev->bv_len -= len;
- return 0;
- }
- }
-
- goto done;
- }
- }
-
- if (bio->bi_vcnt >= bio->bi_max_vecs)
- return 0;
-
- /*
- * we might lose a segment or two here, but rather that than
- * make this too complex.
- */
-
- while (bio->bi_phys_segments >= queue_max_segments(q)) {
-
- if (retried_segments)
- return 0;
-
- retried_segments = 1;
- blk_recount_segments(q, bio);
- }
-
- /*
- * setup the new entry, we might clear it again later if we
- * cannot add the page
- */
- bvec = &bio->bi_io_vec[bio->bi_vcnt];
- bvec->bv_page = page;
- bvec->bv_len = len;
- bvec->bv_offset = offset;
-
- /*
- * if queue has other restrictions (eg varying max sector size
- * depending on offset), it can specify a merge_bvec_fn in the
- * queue to get further control
- */
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_sector,
- .bi_size = bio->bi_size,
- .bi_rw = bio->bi_rw,
- };
-
- /*
- * merge_bvec_fn() returns number of bytes it can accept
- * at this offset
- */
- if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
- bvec->bv_page = NULL;
- bvec->bv_len = 0;
- bvec->bv_offset = 0;
- return 0;
- }
- }
-
- /* If we may be able to merge these biovecs, force a recount */
- if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
- bio->bi_flags &= ~(1 << BIO_SEG_VALID);
-
- bio->bi_vcnt++;
- bio->bi_phys_segments++;
- done:
- bio->bi_size += len;
- return len;
-}
-
-/**
- * bio_add_pc_page - attempt to add page to bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block
- * device limitations. The target block device must allow bios
- * smaller than PAGE_SIZE, so it is always possible to add a single
- * page to an empty bio. This should only be used by REQ_PC bios.
- */
-int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- return __bio_add_page(q, bio, page, len, offset,
- queue_max_hw_sectors(q));
-}
-EXPORT_SYMBOL(bio_add_pc_page);
-
-/**
- * bio_add_page - attempt to add page to bio
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block
- * device limitations. The target block device must allow bios
- * smaller than PAGE_SIZE, so it is always possible to add a single
- * page to an empty bio.
- */
-int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
- unsigned int offset)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
-}
-EXPORT_SYMBOL(bio_add_page);
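
A hedged sketch of the usual calling pattern for the two helpers above: size the bio with bio_get_nr_vecs(), then stop adding pages as soon as bio_add_page() accepts less than was asked for and submit what did fit. All identifiers below are hypothetical and not taken from this patch.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical helper: build (but do not submit) a multi-page read bio. */
static struct bio *example_build_bio(struct block_device *bdev, sector_t sector,
				     struct page **pages, int nr_pages)
{
	int nr_vecs = min(nr_pages, bio_get_nr_vecs(bdev));
	struct bio *bio = bio_alloc(GFP_NOIO, nr_vecs);
	int i;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;

	for (i = 0; i < nr_pages; i++) {
		/* a short return means the bio or the queue limits are full */
		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
			break;
	}

	/* the caller sets bi_end_io and submits whatever fitted */
	return bio;
}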
-
-struct bio_map_data {
- struct bio_vec *iovecs;
- struct sg_iovec *sgvecs;
- int nr_sgvecs;
- int is_our_pages;
-};
-
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
- struct sg_iovec *iov, int iov_count,
- int is_our_pages)
-{
- memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
- memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
- bmd->nr_sgvecs = iov_count;
- bmd->is_our_pages = is_our_pages;
- bio->bi_private = bmd;
-}
-
-static void bio_free_map_data(struct bio_map_data *bmd)
-{
- kfree(bmd->iovecs);
- kfree(bmd->sgvecs);
- kfree(bmd);
-}
-
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
- gfp_t gfp_mask)
-{
- struct bio_map_data *bmd;
-
- if (iov_count > UIO_MAXIOV)
- return NULL;
-
- bmd = kmalloc(sizeof(*bmd), gfp_mask);
- if (!bmd)
- return NULL;
-
- bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
- if (!bmd->iovecs) {
- kfree(bmd);
- return NULL;
- }
-
- bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
- if (bmd->sgvecs)
- return bmd;
-
- kfree(bmd->iovecs);
- kfree(bmd);
- return NULL;
-}
-
-static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
- struct sg_iovec *iov, int iov_count,
- int to_user, int from_user, int do_free_page)
-{
- int ret = 0, i;
- struct bio_vec *bvec;
- int iov_idx = 0;
- unsigned int iov_off = 0;
-
- __bio_for_each_segment(bvec, bio, i, 0) {
- char *bv_addr = page_address(bvec->bv_page);
- unsigned int bv_len = iovecs[i].bv_len;
-
- while (bv_len && iov_idx < iov_count) {
- unsigned int bytes;
- char __user *iov_addr;
-
- bytes = min_t(unsigned int,
- iov[iov_idx].iov_len - iov_off, bv_len);
- iov_addr = iov[iov_idx].iov_base + iov_off;
-
- if (!ret) {
- if (to_user)
- ret = copy_to_user(iov_addr, bv_addr,
- bytes);
-
- if (from_user)
- ret = copy_from_user(bv_addr, iov_addr,
- bytes);
-
- if (ret)
- ret = -EFAULT;
- }
-
- bv_len -= bytes;
- bv_addr += bytes;
- iov_addr += bytes;
- iov_off += bytes;
-
- if (iov[iov_idx].iov_len == iov_off) {
- iov_idx++;
- iov_off = 0;
- }
- }
-
- if (do_free_page)
- __free_page(bvec->bv_page);
- }
-
- return ret;
-}
-
-/**
- * bio_uncopy_user - finish previously mapped bio
- * @bio: bio being terminated
- *
- * Free pages allocated from bio_copy_user() and write back data
- * to user space in case of a read.
- */
-int bio_uncopy_user(struct bio *bio)
-{
- struct bio_map_data *bmd = bio->bi_private;
- int ret = 0;
-
- if (!bio_flagged(bio, BIO_NULL_MAPPED))
- ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
- bmd->nr_sgvecs, bio_data_dir(bio) == READ,
- 0, bmd->is_our_pages);
- bio_free_map_data(bmd);
- bio_put(bio);
- return ret;
-}
-EXPORT_SYMBOL(bio_uncopy_user);
-
-/**
- * bio_copy_user_iov - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @iov: the iovec.
- * @iov_count: number of elements in the iovec
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with a
- * call to bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user_iov(struct request_queue *q,
- struct rq_map_data *map_data,
- struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct bio_map_data *bmd;
- struct bio_vec *bvec;
- struct page *page;
- struct bio *bio;
- int i, ret;
- int nr_pages = 0;
- unsigned int len = 0;
- unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
-
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr;
- unsigned long end;
- unsigned long start;
-
- uaddr = (unsigned long)iov[i].iov_base;
- end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = uaddr >> PAGE_SHIFT;
-
- /*
- * Overflow, abort
- */
- if (end < start)
- return ERR_PTR(-EINVAL);
-
- nr_pages += end - start;
- len += iov[i].iov_len;
- }
-
- if (offset)
- nr_pages++;
-
- bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
- if (!bmd)
- return ERR_PTR(-ENOMEM);
-
- ret = -ENOMEM;
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- goto out_bmd;
-
- if (!write_to_vm)
- bio->bi_rw |= REQ_WRITE;
-
- ret = 0;
-
- if (map_data) {
- nr_pages = 1 << map_data->page_order;
- i = map_data->offset / PAGE_SIZE;
- }
- while (len) {
- unsigned int bytes = PAGE_SIZE;
-
- bytes -= offset;
-
- if (bytes > len)
- bytes = len;
-
- if (map_data) {
- if (i == map_data->nr_entries * nr_pages) {
- ret = -ENOMEM;
- break;
- }
-
- page = map_data->pages[i / nr_pages];
- page += (i % nr_pages);
-
- i++;
- } else {
- page = alloc_page(q->bounce_gfp | gfp_mask);
- if (!page) {
- ret = -ENOMEM;
- break;
- }
- }
-
- if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
- break;
-
- len -= bytes;
- offset = 0;
- }
-
- if (ret)
- goto cleanup;
-
- /*
- * success
- */
- if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
- (map_data && map_data->from_user)) {
- ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0);
- if (ret)
- goto cleanup;
- }
-
- bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
- return bio;
-cleanup:
- if (!map_data)
- bio_for_each_segment(bvec, bio, i)
- __free_page(bvec->bv_page);
-
- bio_put(bio);
-out_bmd:
- bio_free_map_data(bmd);
- return ERR_PTR(ret);
-}
-
-/**
- * bio_copy_user - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @uaddr: start of user address
- * @len: length in bytes
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with a
- * call to bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
- unsigned long uaddr, unsigned int len,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct sg_iovec iov;
-
- iov.iov_base = (void __user *)uaddr;
- iov.iov_len = len;
-
- return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_copy_user);
-
-static struct bio *__bio_map_user_iov(struct request_queue *q,
- struct block_device *bdev,
- struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- int i, j;
- int nr_pages = 0;
- struct page **pages;
- struct bio *bio;
- int cur_page = 0;
- int ret, offset;
-
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr = (unsigned long)iov[i].iov_base;
- unsigned long len = iov[i].iov_len;
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
-
- /*
- * Overflow, abort
- */
- if (end < start)
- return ERR_PTR(-EINVAL);
-
- nr_pages += end - start;
- /*
- * buffer must be aligned to at least hardsector size for now
- */
- if (uaddr & queue_dma_alignment(q))
- return ERR_PTR(-EINVAL);
- }
-
- if (!nr_pages)
- return ERR_PTR(-EINVAL);
-
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- ret = -ENOMEM;
- pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
- if (!pages)
- goto out;
-
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr = (unsigned long)iov[i].iov_base;
- unsigned long len = iov[i].iov_len;
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
- const int local_nr_pages = end - start;
- const int page_limit = cur_page + local_nr_pages;
-
- ret = get_user_pages_fast(uaddr, local_nr_pages,
- write_to_vm, &pages[cur_page]);
- if (ret < local_nr_pages) {
- ret = -EFAULT;
- goto out_unmap;
- }
-
- offset = uaddr & ~PAGE_MASK;
- for (j = cur_page; j < page_limit; j++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- /*
- * sorry...
- */
- if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
- bytes)
- break;
-
- len -= bytes;
- offset = 0;
- }
-
- cur_page = j;
- /*
- * release the pages we didn't map into the bio, if any
- */
- while (j < page_limit)
- page_cache_release(pages[j++]);
- }
-
- kfree(pages);
-
- /*
- * set data direction, and check if mapped pages need bouncing
- */
- if (!write_to_vm)
- bio->bi_rw |= REQ_WRITE;
-
- bio->bi_bdev = bdev;
- bio->bi_flags |= (1 << BIO_USER_MAPPED);
- return bio;
-
- out_unmap:
- for (i = 0; i < nr_pages; i++) {
- if(!pages[i])
- break;
- page_cache_release(pages[i]);
- }
- out:
- kfree(pages);
- bio_put(bio);
- return ERR_PTR(ret);
-}
-
-/**
- * bio_map_user - map user address into bio
- * @q: the struct request_queue for the bio
- * @bdev: destination block device
- * @uaddr: start of user address
- * @len: length in bytes
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
- unsigned long uaddr, unsigned int len, int write_to_vm,
- gfp_t gfp_mask)
-{
- struct sg_iovec iov;
-
- iov.iov_base = (void __user *)uaddr;
- iov.iov_len = len;
-
- return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_map_user);
-
-/**
- * bio_map_user_iov - map user sg_iovec table into bio
- * @q: the struct request_queue for the bio
- * @bdev: destination block device
- * @iov: the iovec.
- * @iov_count: number of elements in the iovec
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
- struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct bio *bio;
-
- bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
- gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- /*
- * subtle -- if __bio_map_user() ended up bouncing a bio,
- * it would normally disappear when its bi_end_io is run.
- * however, we need it for the unmap, so grab an extra
- * reference to it
- */
- bio_get(bio);
-
- return bio;
-}
-
-static void __bio_unmap_user(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- /*
- * make sure we dirty pages we wrote to
- */
- __bio_for_each_segment(bvec, bio, i, 0) {
- if (bio_data_dir(bio) == READ)
- set_page_dirty_lock(bvec->bv_page);
-
- page_cache_release(bvec->bv_page);
- }
-
- bio_put(bio);
-}
-
-/**
- * bio_unmap_user - unmap a bio
- * @bio: the bio being unmapped
- *
- * Unmap a bio previously mapped by bio_map_user(). Must be called from
- * process context.
- *
- * bio_unmap_user() may sleep.
- */
-void bio_unmap_user(struct bio *bio)
-{
- __bio_unmap_user(bio);
- bio_put(bio);
-}
-EXPORT_SYMBOL(bio_unmap_user);
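
The pairing described in the kernel-doc above, as a hypothetical sketch: map the user buffer, do the I/O, then bio_unmap_user() from process context. It assumes @uaddr and @len satisfy the queue's alignment requirements; the function names are invented for illustration.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>
#include <linux/err.h>

static void example_sync_end_io(struct bio *bio, int error)
{
	complete(bio->bi_private);
}

/* Hypothetical: read @len bytes from @sector straight into user memory. */
static int example_user_read(struct request_queue *q, struct block_device *bdev,
			     unsigned long uaddr, unsigned int len,
			     sector_t sector)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio *bio;

	bio = bio_map_user(q, bdev, uaddr, len, 1, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_sector = sector;
	bio->bi_private = &done;
	bio->bi_end_io = example_sync_end_io;

	submit_bio(READ, bio);
	wait_for_completion(&done);

	/* releases the pinned pages and the extra reference taken by the map */
	bio_unmap_user(bio);
	return 0;
}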
-
-static void bio_map_kern_endio(struct bio *bio, int err)
-{
- bio_put(bio);
-}
-
-static struct bio *__bio_map_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask)
-{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- const int nr_pages = end - start;
- int offset, i;
- struct bio *bio;
-
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- offset = offset_in_page(kaddr);
- for (i = 0; i < nr_pages; i++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
- offset) < bytes)
- break;
-
- data += bytes;
- len -= bytes;
- offset = 0;
- }
-
- bio->bi_end_io = bio_map_kern_endio;
- return bio;
-}
-
-/**
- * bio_map_kern - map kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to map
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio allocation
- *
- * Map the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
- gfp_t gfp_mask)
-{
- struct bio *bio;
-
- bio = __bio_map_kern(q, data, len, gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- if (bio->bi_size == len)
- return bio;
-
- /*
- * Don't support partial mappings.
- */
- bio_put(bio);
- return ERR_PTR(-EINVAL);
-}
-EXPORT_SYMBOL(bio_map_kern);
-
-static void bio_copy_kern_endio(struct bio *bio, int err)
-{
- struct bio_vec *bvec;
- const int read = bio_data_dir(bio) == READ;
- struct bio_map_data *bmd = bio->bi_private;
- int i;
- char *p = bmd->sgvecs[0].iov_base;
-
- __bio_for_each_segment(bvec, bio, i, 0) {
- char *addr = page_address(bvec->bv_page);
- int len = bmd->iovecs[i].bv_len;
-
- if (read)
- memcpy(p, addr, len);
-
- __free_page(bvec->bv_page);
- p += len;
- }
-
- bio_free_map_data(bmd);
- bio_put(bio);
-}
-
-/**
- * bio_copy_kern - copy kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to copy
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio and page allocation
- * @reading: data direction is READ
- *
- * copy the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
- gfp_t gfp_mask, int reading)
-{
- struct bio *bio;
- struct bio_vec *bvec;
- int i;
-
- bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- if (!reading) {
- void *p = data;
-
- bio_for_each_segment(bvec, bio, i) {
- char *addr = page_address(bvec->bv_page);
-
- memcpy(addr, p, bvec->bv_len);
- p += bvec->bv_len;
- }
- }
-
- bio->bi_end_io = bio_copy_kern_endio;
-
- return bio;
-}
-EXPORT_SYMBOL(bio_copy_kern);
-
-/*
- * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
- * for performing direct-IO in BIOs.
- *
- * The problem is that we cannot run set_page_dirty() from interrupt context
- * because the required locks are not interrupt-safe. So what we can do is to
- * mark the pages dirty _before_ performing IO. And in interrupt context,
- * check that the pages are still dirty. If so, fine. If not, redirty them
- * in process context.
- *
- * We special-case compound pages here: normally this means reads into hugetlb
- * pages. The logic in here doesn't really work right for compound pages
- * because the VM does not uniformly chase down the head page in all cases.
- * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
- * handle them at all. So we skip compound pages here at an early stage.
- *
- * Note that this code is very hard to test under normal circumstances because
- * direct-io pins the pages with get_user_pages(). This makes
- * is_page_cache_freeable return false, and the VM will not clean the pages.
- * But other code (eg, pdflush) could clean the pages if they are mapped
- * pagecache.
- *
- * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
- * deferred bio dirtying paths.
- */
-
-/*
- * bio_set_pages_dirty() will mark all the bio's pages as dirty.
- */
-void bio_set_pages_dirty(struct bio *bio)
-{
- struct bio_vec *bvec = bio->bi_io_vec;
- int i;
-
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct page *page = bvec[i].bv_page;
-
- if (page && !PageCompound(page))
- set_page_dirty_lock(page);
- }
-}
-
-static void bio_release_pages(struct bio *bio)
-{
- struct bio_vec *bvec = bio->bi_io_vec;
- int i;
-
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct page *page = bvec[i].bv_page;
-
- if (page)
- put_page(page);
- }
-}
-
-/*
- * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
- * If they are, then fine. If, however, some pages are clean then they must
- * have been written out during the direct-IO read. So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
- *
- * It is expected that bio_check_pages_dirty() will wholly own the BIO from
- * here on. It will run one page_cache_release() against each page and will
- * run one bio_put() against the BIO.
- */
-
-static void bio_dirty_fn(struct work_struct *work);
-
-static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
-static DEFINE_SPINLOCK(bio_dirty_lock);
-static struct bio *bio_dirty_list;
-
-/*
- * This runs in process context
- */
-static void bio_dirty_fn(struct work_struct *work)
-{
- unsigned long flags;
- struct bio *bio;
-
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio = bio_dirty_list;
- bio_dirty_list = NULL;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
-
- while (bio) {
- struct bio *next = bio->bi_private;
-
- bio_set_pages_dirty(bio);
- bio_release_pages(bio);
- bio_put(bio);
- bio = next;
- }
-}
-
-void bio_check_pages_dirty(struct bio *bio)
-{
- struct bio_vec *bvec = bio->bi_io_vec;
- int nr_clean_pages = 0;
- int i;
-
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct page *page = bvec[i].bv_page;
-
- if (PageDirty(page) || PageCompound(page)) {
- page_cache_release(page);
- bvec[i].bv_page = NULL;
- } else {
- nr_clean_pages++;
- }
- }
-
- if (nr_clean_pages) {
- unsigned long flags;
-
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio->bi_private = bio_dirty_list;
- bio_dirty_list = bio;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
- schedule_work(&bio_dirty_work);
- } else {
- bio_put(bio);
- }
-}
-
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-void bio_flush_dcache_pages(struct bio *bi)
-{
- int i;
- struct bio_vec *bvec;
-
- bio_for_each_segment(bvec, bi, i)
- flush_dcache_page(bvec->bv_page);
-}
-EXPORT_SYMBOL(bio_flush_dcache_pages);
-#endif
-
-/**
- * bio_endio - end I/O on a bio
- * @bio: bio
- * @error: error, if any
- *
- * Description:
- * bio_endio() will end I/O on the whole bio. bio_endio() is the
- * preferred way to end I/O on a bio; it takes care of clearing
- * BIO_UPTODATE on error. @error is 0 on success, and one of the
- * established -Exxxx (-EIO, for instance) error values in case
- * something went wrong. No one should call bi_end_io() directly on a
- * bio unless they own it and thus know that it has an end_io
- * function.
- **/
-void bio_endio(struct bio *bio, int error)
-{
- if (error)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
-
- if (bio->bi_end_io)
- bio->bi_end_io(bio, error);
-}
-EXPORT_SYMBOL(bio_endio);
-
-void bio_pair_release(struct bio_pair *bp)
-{
- if (atomic_dec_and_test(&bp->cnt)) {
- struct bio *master = bp->bio1.bi_private;
-
- bio_endio(master, bp->error);
- mempool_free(bp, bp->bio2.bi_private);
- }
-}
-EXPORT_SYMBOL(bio_pair_release);
-
-static void bio_pair_end_1(struct bio *bi, int err)
-{
- struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
-
- if (err)
- bp->error = err;
-
- bio_pair_release(bp);
-}
-
-static void bio_pair_end_2(struct bio *bi, int err)
-{
- struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
-
- if (err)
- bp->error = err;
-
- bio_pair_release(bp);
-}
-
-/*
- * split a bio - only worry about a bio with a single page in its iovec
- */
-struct bio_pair *bio_split(struct bio *bi, int first_sectors)
-{
- struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
-
- if (!bp)
- return bp;
-
- trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
- bi->bi_sector + first_sectors);
-
- BUG_ON(bi->bi_vcnt != 1);
- BUG_ON(bi->bi_idx != 0);
- atomic_set(&bp->cnt, 3);
- bp->error = 0;
- bp->bio1 = *bi;
- bp->bio2 = *bi;
- bp->bio2.bi_sector += first_sectors;
- bp->bio2.bi_size -= first_sectors << 9;
- bp->bio1.bi_size = first_sectors << 9;
-
- bp->bv1 = bi->bi_io_vec[0];
- bp->bv2 = bi->bi_io_vec[0];
- bp->bv2.bv_offset += first_sectors << 9;
- bp->bv2.bv_len -= first_sectors << 9;
- bp->bv1.bv_len = first_sectors << 9;
-
- bp->bio1.bi_io_vec = &bp->bv1;
- bp->bio2.bi_io_vec = &bp->bv2;
-
- bp->bio1.bi_max_vecs = 1;
- bp->bio2.bi_max_vecs = 1;
-
- bp->bio1.bi_end_io = bio_pair_end_1;
- bp->bio2.bi_end_io = bio_pair_end_2;
-
- bp->bio1.bi_private = bi;
- bp->bio2.bi_private = bio_split_pool;
-
- if (bio_integrity(bi))
- bio_integrity_split(bi, bp, first_sectors);
-
- return bp;
-}
-EXPORT_SYMBOL(bio_split);
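
A brief sketch of how a stacking driver might use bio_split(), in the spirit of the md/raid callers: split a single-page bio at a chunk boundary and submit both halves. boundary_sectors and the function name are hypothetical, not part of this patch.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical: @bio has a single page and straddles a chunk boundary. */
static void example_split_and_submit(struct bio *bio, int boundary_sectors)
{
	struct bio_pair *bp = bio_split(bio, boundary_sectors);

	if (!bp)
		return;

	/* bp->bio1 covers the first boundary_sectors, bp->bio2 the remainder */
	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);

	/* drop the caller's reference; each half drops its own on completion */
	bio_pair_release(bp);
}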
-
-/**
- * bio_sector_offset - Find hardware sector offset in bio
- * @bio: bio to inspect
- * @index: bio_vec index
- * @offset: offset in bv_page
- *
- * Return the number of hardware sectors between beginning of bio
- * and an end point indicated by a bio_vec index and an offset
- * within that vector's page.
- */
-sector_t bio_sector_offset(struct bio *bio, unsigned short index,
- unsigned int offset)
-{
- unsigned int sector_sz;
- struct bio_vec *bv;
- sector_t sectors;
- int i;
-
- sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
- sectors = 0;
-
- if (index >= bio->bi_idx)
- index = bio->bi_vcnt - 1;
-
- __bio_for_each_segment(bv, bio, i, 0) {
- if (i == index) {
- if (offset > bv->bv_offset)
- sectors += (offset - bv->bv_offset) / sector_sz;
- break;
- }
-
- sectors += bv->bv_len / sector_sz;
- }
-
- return sectors;
-}
-EXPORT_SYMBOL(bio_sector_offset);
-
-/*
- * create memory pools for biovec's in a bio_set.
- * use the global biovec slabs created for general use.
- */
-static int biovec_create_pools(struct bio_set *bs, int pool_entries)
-{
- struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
-
- bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
- if (!bs->bvec_pool)
- return -ENOMEM;
-
- return 0;
-}
-
-static void biovec_free_pools(struct bio_set *bs)
-{
- mempool_destroy(bs->bvec_pool);
-}
-
-void bioset_free(struct bio_set *bs)
-{
- if (bs->bio_pool)
- mempool_destroy(bs->bio_pool);
-
- bioset_integrity_free(bs);
- biovec_free_pools(bs);
- bio_put_slab(bs);
-
- kfree(bs);
-}
-EXPORT_SYMBOL(bioset_free);
-
-/**
- * bioset_create - Create a bio_set
- * @pool_size: Number of bio and bio_vecs to cache in the mempool
- * @front_pad: Number of bytes to allocate in front of the returned bio
- *
- * Description:
- * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
- * to ask for a number of bytes to be allocated in front of the bio.
- * Front pad allocation is useful for embedding the bio inside
- * another structure, to avoid allocating extra data to go with the bio.
- * Note that the bio must be embedded at the END of that structure always,
- * or things will break badly.
- */
-struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
-{
- unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
- struct bio_set *bs;
-
- bs = kzalloc(sizeof(*bs), GFP_KERNEL);
- if (!bs)
- return NULL;
-
- bs->front_pad = front_pad;
-
- bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
- if (!bs->bio_slab) {
- kfree(bs);
- return NULL;
- }
-
- bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
- if (!bs->bio_pool)
- goto bad;
-
- if (bioset_integrity_create(bs, pool_size))
- goto bad;
-
- if (!biovec_create_pools(bs, pool_size))
- return bs;
-
-bad:
- bioset_free(bs);
- return NULL;
-}
-EXPORT_SYMBOL(bioset_create);
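
A hypothetical sketch of the front_pad usage described above: embed the bio at the end of a per-I/O structure, create the bio_set with front_pad = offsetof(...), and recover the container with container_of(). As the bio_alloc_bioset() comment earlier notes, the caller still provides a destructor; bio_free() already accounts for the front padding. All names below are invented.

#include <linux/bio.h>
#include <linux/kernel.h>

/* Hypothetical per-I/O context; the bio must stay the last member. */
struct example_io {
	void		*private_data;
	struct bio	bio;
};

static struct bio_set *example_bio_set;

static void example_bio_destructor(struct bio *bio)
{
	/* bio_free() subtracts front_pad, so the whole example_io is freed */
	bio_free(bio, example_bio_set);
}

static int example_init(void)
{
	example_bio_set = bioset_create(BIO_POOL_SIZE,
					offsetof(struct example_io, bio));
	return example_bio_set ? 0 : -ENOMEM;
}

static struct example_io *example_io_alloc(gfp_t gfp, int nr_vecs)
{
	struct bio *bio = bio_alloc_bioset(gfp, nr_vecs, example_bio_set);

	if (!bio)
		return NULL;
	bio->bi_destructor = example_bio_destructor;
	return container_of(bio, struct example_io, bio);
}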
-
-static void __init biovec_init_slabs(void)
-{
- int i;
-
- for (i = 0; i < BIOVEC_NR_POOLS; i++) {
- int size;
- struct biovec_slab *bvs = bvec_slabs + i;
-
-#ifndef CONFIG_BLK_DEV_INTEGRITY
- if (bvs->nr_vecs <= BIO_INLINE_VECS) {
- bvs->slab = NULL;
- continue;
- }
-#endif
-
- size = bvs->nr_vecs * sizeof(struct bio_vec);
- bvs->slab = kmem_cache_create(bvs->name, size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- }
-}
-
-static int __init init_bio(void)
-{
- bio_slab_max = 2;
- bio_slab_nr = 0;
- bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
- if (!bio_slabs)
- panic("bio: can't allocate bios\n");
-
- bio_integrity_init();
- biovec_init_slabs();
-
- fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
- if (!fs_bio_set)
- panic("bio: can't allocate bios\n");
-
- bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
- sizeof(struct bio_pair));
- if (!bio_split_pool)
- panic("bio: can't create split pool\n");
-
- return 0;
-}
-subsys_initcall(init_bio);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2bbc0e62102..6d7274619bf 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -16,7 +16,9 @@
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
+#include <linux/magic.h>
#include <linux/buffer_head.h>
+#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
@@ -24,7 +26,8 @@
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
-#include <linux/kmemleak.h>
+#include <linux/cleancache.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -44,47 +47,67 @@ inline struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}
-
EXPORT_SYMBOL(I_BDEV);
/*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
*/
static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
- spin_lock(&inode_wb_list_lock);
+ struct backing_dev_info *old = inode->i_data.backing_dev_info;
+ bool wakeup_bdi = false;
+
+ if (unlikely(dst == old)) /* deadlock avoidance */
+ return;
+ bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
- if (inode->i_state & I_DIRTY)
+ if (inode->i_state & I_DIRTY) {
+ if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
+ wakeup_bdi = true;
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+ }
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
-}
+ spin_unlock(&old->wb.list_lock);
+ spin_unlock(&dst->wb.list_lock);
-static sector_t max_block(struct block_device *bdev)
-{
- sector_t retval = ~((sector_t)0);
- loff_t sz = i_size_read(bdev->bd_inode);
-
- if (sz) {
- unsigned int size = block_size(bdev);
- unsigned int sizebits = blksize_bits(size);
- retval = (sz >> sizebits);
- }
- return retval;
+ if (wakeup_bdi)
+ bdi_wakeup_thread_delayed(dst);
}
/* Kill _all_ buffers and pagecache, dirty or not. */
-static void kill_bdev(struct block_device *bdev)
+void kill_bdev(struct block_device *bdev)
{
- if (bdev->bd_inode->i_mapping->nrpages == 0)
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping->nrpages == 0 && mapping->nrshadows == 0)
return;
+
invalidate_bh_lrus();
- truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+ truncate_inode_pages(mapping, 0);
}
+EXPORT_SYMBOL(kill_bdev);
+
+/* Invalidate clean unused buffers and pagecache. */
+void invalidate_bdev(struct block_device *bdev)
+{
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping->nrpages == 0)
+ return;
+
+ invalidate_bh_lrus();
+ lru_add_drain_all(); /* make sure all lru add caches are flushed */
+ invalidate_mapping_pages(mapping, 0, -1);
+ /* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, let's be cautious
+ */
+ cleancache_invalidate_inode(mapping);
+}
+EXPORT_SYMBOL(invalidate_bdev);
int set_blocksize(struct block_device *bdev, int size)
{
@@ -135,61 +158,22 @@ static int
blkdev_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
- if (iblock >= max_block(I_BDEV(inode))) {
- if (create)
- return -EIO;
-
- /*
- * for reads, we're just trying to fill a partial page.
- * return a hole, they will have to call get_block again
- * before they can fill it, and they will get -EIO at that
- * time
- */
- return 0;
- }
bh->b_bdev = I_BDEV(inode);
bh->b_blocknr = iblock;
set_buffer_mapped(bh);
return 0;
}
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- sector_t end_block = max_block(I_BDEV(inode));
- unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
- if ((iblock + max_blocks) > end_block) {
- max_blocks = end_block - iblock;
- if ((long)max_blocks <= 0) {
- if (create)
- return -EIO; /* write fully beyond EOF */
- /*
- * It is a read which is fully beyond EOF. We return
- * a !buffer_mapped buffer
- */
- max_blocks = 0;
- }
- }
-
- bh->b_bdev = I_BDEV(inode);
- bh->b_blocknr = iblock;
- bh->b_size = max_blocks << inode->i_blkbits;
- if (max_blocks)
- set_buffer_mapped(bh);
- return 0;
-}
-
static ssize_t
-blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
- nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+ return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
+ offset, blkdev_get_block,
+ NULL, NULL, 0);
}
int __sync_blockdev(struct block_device *bdev, int wait)
@@ -343,59 +327,106 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
/*
* private llseek:
- * for a block special file file->f_path.dentry->d_inode->i_size is zero
+ * for a block special file file_inode(file)->i_size is zero
* so we compute the size by hand (just as in block_read/write above)
*/
-static loff_t block_llseek(struct file *file, loff_t offset, int origin)
+static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *bd_inode = file->f_mapping->host;
- loff_t size;
loff_t retval;
mutex_lock(&bd_inode->i_mutex);
- size = i_size_read(bd_inode);
-
- switch (origin) {
- case 2:
- offset += size;
- break;
- case 1:
- offset += file->f_pos;
- }
- retval = -EINVAL;
- if (offset >= 0 && offset <= size) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- }
- retval = offset;
- }
+ retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
mutex_unlock(&bd_inode->i_mutex);
return retval;
}
-int blkdev_fsync(struct file *filp, int datasync)
+int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *bd_inode = filp->f_mapping->host;
struct block_device *bdev = I_BDEV(bd_inode);
int error;
+
+ error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+ if (error)
+ return error;
/*
* There is no need to serialise calls to blkdev_issue_flush with
* i_mutex and doing so causes performance issues with concurrent
* O_SYNC writers to a block device.
*/
- mutex_unlock(&bd_inode->i_mutex);
-
error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
if (error == -EOPNOTSUPP)
error = 0;
- mutex_lock(&bd_inode->i_mutex);
-
return error;
}
EXPORT_SYMBOL(blkdev_fsync);
+/**
+ * bdev_read_page() - Start reading a page from a block device
+ * @bdev: The device to read the page from
+ * @sector: The offset on the device to read the page to (need not be aligned)
+ * @page: The page to read
+ *
+ * On entry, the page should be locked. It will be unlocked when the page
+ * has been read. If the block driver implements rw_page synchronously,
+ * that will be true on exit from this function, but it need not be.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to read this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_read_page(struct block_device *bdev, sector_t sector,
+ struct page *page)
+{
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+ if (!ops->rw_page)
+ return -EOPNOTSUPP;
+ return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+}
+EXPORT_SYMBOL_GPL(bdev_read_page);
+
+/**
+ * bdev_write_page() - Start writing a page to a block device
+ * @bdev: The device to write the page to
+ * @sector: The offset on the device to write the page to (need not be aligned)
+ * @page: The page to write
+ * @wbc: The writeback_control for the write
+ *
+ * On entry, the page should be locked and not currently under writeback.
+ * On exit, if the write started successfully, the page will be unlocked and
+ * under writeback. If the write failed already (eg the driver failed to
+ * queue the page to the device), the page will still be locked. If the
+ * caller is a ->writepage implementation, it will need to unlock the page.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to write this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_write_page(struct block_device *bdev, sector_t sector,
+ struct page *page, struct writeback_control *wbc)
+{
+ int result;
+ int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+ if (!ops->rw_page)
+ return -EOPNOTSUPP;
+ set_page_writeback(page);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
+ if (result)
+ end_page_writeback(page);
+ else
+ unlock_page(page);
+ return result;
+}
+EXPORT_SYMBOL_GPL(bdev_write_page);
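
A small, hypothetical sketch of the calling convention described in the two kernel-docs above: try the rw_page path first and treat any error as "soft", falling back to an ordinary bio-based read. The same pattern applies to bdev_write_page(); the fallback helper here is invented for illustration.

#include <linux/blkdev.h>

/* fallback_read_page() is hypothetical; it would issue a normal bio READ. */
int fallback_read_page(struct block_device *bdev, sector_t sector,
		       struct page *page);

static int example_read_page(struct block_device *bdev, sector_t sector,
			     struct page *page)
{
	int err = bdev_read_page(bdev, sector, page);

	if (err)	/* -EOPNOTSUPP or a soft error: try another route */
		return fallback_read_page(bdev, sector, page);
	return 0;	/* page is unlocked once the read completes */
}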
+
/*
* pseudo-fs
*/
@@ -416,7 +447,6 @@ static void bdev_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct bdev_inode *bdi = BDEV_I(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(bdev_cachep, bdi);
}
@@ -453,9 +483,9 @@ static void bdev_evict_inode(struct inode *inode)
{
struct block_device *bdev = &BDEV_I(inode)->bdev;
struct list_head *p;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
- end_writeback(inode);
+ clear_inode(inode);
spin_lock(&bdev_lock);
while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
__bd_forget(list_entry(p, struct inode, i_devices));
@@ -475,7 +505,7 @@ static const struct super_operations bdev_sops = {
static struct dentry *bd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
+ return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
}
static struct file_system_type bd_type = {
@@ -484,12 +514,12 @@ static struct file_system_type bd_type = {
.kill_sb = kill_anon_super,
};
-struct super_block *blockdev_superblock __read_mostly;
+static struct super_block *blockdev_superblock __read_mostly;
void __init bdev_cache_init(void)
{
int err;
- struct vfsmount *bd_mnt;
+ static struct vfsmount *bd_mnt;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -501,12 +531,7 @@ void __init bdev_cache_init(void)
bd_mnt = kern_mount(&bd_type);
if (IS_ERR(bd_mnt))
panic("Cannot create bdev pseudo-fs");
- /*
- * This vfsmount structure is only used to obtain the
- * blockdev_superblock, so tell kmemleak not to report it.
- */
- kmemleak_not_leak(bd_mnt);
- blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
+ blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
}
/*
@@ -547,6 +572,7 @@ struct block_device *bdget(dev_t dev)
if (inode->i_state & I_NEW) {
bdev->bd_contains = NULL;
+ bdev->bd_super = NULL;
bdev->bd_inode = inode;
bdev->bd_block_size = (1 << inode->i_blkbits);
bdev->bd_part_count = 0;
@@ -576,6 +602,7 @@ struct block_device *bdgrab(struct block_device *bdev)
ihold(bdev->bd_inode);
return bdev;
}
+EXPORT_SYMBOL(bdgrab);
long nr_blockdev_pages(void)
{
@@ -629,6 +656,11 @@ static struct block_device *bd_acquire(struct inode *inode)
return bdev;
}
+int sb_is_blkdev_sb(struct super_block *sb)
+{
+ return sb == blockdev_superblock;
+}
+
/* Call when you free inode */
void bd_forget(struct inode *inode)
@@ -636,11 +668,9 @@ void bd_forget(struct inode *inode)
struct block_device *bdev = NULL;
spin_lock(&bdev_lock);
- if (inode->i_bdev) {
- if (!sb_is_blkdev_sb(inode->i_sb))
- bdev = inode->i_bdev;
- __bd_forget(inode);
- }
+ if (!sb_is_blkdev_sb(inode->i_sb))
+ bdev = inode->i_bdev;
+ __bd_forget(inode);
spin_unlock(&bdev_lock);
if (bdev)
@@ -653,7 +683,7 @@ void bd_forget(struct inode *inode)
* @whole: whole block device containing @bdev, may equal @bdev
* @holder: holder trying to claim @bdev
*
- * Test whther @bdev can be claimed by @holder.
+ * Test whether @bdev can be claimed by @holder.
*
* CONTEXT:
* spin_lock(&bdev_lock).
@@ -762,7 +792,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
if (!disk)
return ERR_PTR(-ENXIO);
- whole = bdget_disk(disk, 0);
+ /*
+ * Normally, @bdev should equal what's returned from bdget_disk()
+ * if partno is 0; however, some drivers (floppy) use multiple
+ * bdev's for the same physical device and @bdev may be one of the
+ * aliases. Keep @bdev if partno is 0. This means claimer
+ * tracking is broken for those devices but it has always been that
+ * way.
+ */
+ if (partno)
+ whole = bdget_disk(disk, 0);
+ else
+ whole = bdgrab(bdev);
+
module_put(disk->fops->owner);
put_disk(disk);
if (!whole)
@@ -949,7 +991,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
if (!bdev->bd_disk)
return;
- if (disk_partitionable(bdev->bd_disk))
+ if (disk_part_scan_enabled(bdev->bd_disk))
bdev->bd_invalidated = 1;
}
@@ -1002,6 +1044,7 @@ int revalidate_disk(struct gendisk *disk)
mutex_lock(&bdev->bd_mutex);
check_disk_size_change(disk, bdev);
+ bdev->bd_invalidated = 0;
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
return ret;
@@ -1040,7 +1083,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
{
unsigned bsize = bdev_logical_block_size(bdev);
- bdev->bd_inode->i_size = size;
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ i_size_write(bdev->bd_inode, size);
+ mutex_unlock(&bdev->bd_inode->i_mutex);
while (bsize < PAGE_CACHE_SIZE) {
if (size & bsize)
break;
@@ -1051,7 +1096,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
}
EXPORT_SYMBOL(bd_set_size);
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
/*
* bd_mutex locking:
@@ -1063,6 +1108,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
struct gendisk *disk;
+ struct module *owner;
int ret;
int partno;
int perm = 0;
@@ -1088,10 +1134,13 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk = get_gendisk(bdev->bd_dev, &partno);
if (!disk)
goto out;
+ owner = disk->fops->owner;
+ disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
bdev->bd_disk = disk;
+ bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
if (!partno) {
struct backing_dev_info *bdi;
@@ -1101,6 +1150,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (!bdev->bd_part)
goto out_clear;
+ ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
if (ret == -ERESTARTSYS) {
@@ -1110,24 +1160,38 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
*/
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
- module_put(disk->fops->owner);
- put_disk(disk);
bdev->bd_disk = NULL;
+ bdev->bd_queue = NULL;
mutex_unlock(&bdev->bd_mutex);
+ disk_unblock_events(disk);
+ put_disk(disk);
+ module_put(owner);
goto restart;
}
- if (ret)
- goto out_clear;
}
- if (!bdev->bd_openers) {
+
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
bdi = &default_backing_dev_info;
bdev_inode_switch_bdi(bdev->bd_inode, bdi);
}
- if (bdev->bd_invalidated)
- rescan_partitions(disk, bdev);
+
+ /*
+ * If the device is invalidated, rescan partition
+ * if open succeeded or failed with -ENOMEDIUM.
+ * The latter is necessary to prevent ghost
+ * partitions on a removed medium.
+ */
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(disk, bdev);
+ }
+ if (ret)
+ goto out_clear;
} else {
struct block_device *whole;
whole = bdget_disk(disk, 0);
@@ -1150,39 +1214,46 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
} else {
- module_put(disk->fops->owner);
- put_disk(disk);
- disk = NULL;
if (bdev->bd_contains == bdev) {
- if (bdev->bd_disk->fops->open) {
+ ret = 0;
+ if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
- if (ret)
- goto out_unlock_bdev;
+ /* the same as first opener case, read comment there */
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(bdev->bd_disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(bdev->bd_disk, bdev);
}
- if (bdev->bd_invalidated)
- rescan_partitions(bdev->bd_disk, bdev);
+ if (ret)
+ goto out_unlock_bdev;
}
+ /* only one opener holds refs to the module and disk */
+ put_disk(disk);
+ module_put(owner);
}
bdev->bd_openers++;
if (for_part)
bdev->bd_part_count++;
mutex_unlock(&bdev->bd_mutex);
+ disk_unblock_events(disk);
return 0;
out_clear:
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
+ bdev->bd_queue = NULL;
bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
- out:
- if (disk)
- module_put(disk->fops->owner);
+ disk_unblock_events(disk);
put_disk(disk);
+ module_put(owner);
+ out:
bdput(bdev);
return ret;
@@ -1225,6 +1296,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
res = __blkdev_get(bdev, mode, 0);
if (whole) {
+ struct gendisk *disk = whole->bd_disk;
+
/* finish claiming */
mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
@@ -1251,15 +1324,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
spin_unlock(&bdev_lock);
/*
- * Block event polling for write claims. Any write
- * holder makes the write_holder state stick until all
- * are released. This is good enough and tracking
- * individual writeable reference is too fragile given
- * the way @mode is used in blkdev_get/put().
+ * Block event polling for write claims if requested. Any
+ * write holder makes the write_holder state stick until
+ * all are released. This is good enough and tracking
+ * individual writeable reference is too fragile given the
+ * way @mode is used in blkdev_get/put().
*/
- if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+ if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+ (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
bdev->bd_write_holder = true;
- disk_block_events(bdev->bd_disk);
+ disk_block_events(disk);
}
mutex_unlock(&bdev->bd_mutex);
@@ -1377,9 +1451,8 @@ static int blkdev_open(struct inode * inode, struct file * filp)
return blkdev_get(bdev, filp->f_mode, filp);
}
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
{
- int ret = 0;
struct gendisk *disk = bdev->bd_disk;
struct block_device *victim = NULL;
@@ -1391,34 +1464,39 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
+ /* ->release can cause the old bdi to disappear,
+ * so must switch it out first
+ */
+ bdev_inode_switch_bdi(bdev->bd_inode,
+ &default_backing_dev_info);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
- ret = disk->fops->release(disk, mode);
+ disk->fops->release(disk, mode);
}
if (!bdev->bd_openers) {
struct module *owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
- bdev_inode_switch_bdi(bdev->bd_inode,
- &default_backing_dev_info);
if (bdev != bdev->bd_contains)
victim = bdev->bd_contains;
bdev->bd_contains = NULL;
+
+ put_disk(disk);
+ module_put(owner);
}
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
if (victim)
__blkdev_put(victim, mode, 1);
- return ret;
}
-int blkdev_put(struct block_device *bdev, fmode_t mode)
+void blkdev_put(struct block_device *bdev, fmode_t mode)
{
+ mutex_lock(&bdev->bd_mutex);
+
if (mode & FMODE_EXCL) {
bool bdev_free;
@@ -1427,7 +1505,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
* are protected with bdev_lock. bd_mutex is to
* synchronize disk_holder unlinking.
*/
- mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
WARN_ON_ONCE(--bdev->bd_holders < 0);
@@ -1445,27 +1522,30 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
* If this was the last claim, remove holder link and
* unblock evpoll if it was a write holder.
*/
- if (bdev_free) {
- if (bdev->bd_write_holder) {
- disk_unblock_events(bdev->bd_disk);
- bdev->bd_write_holder = false;
- } else
- disk_check_events(bdev->bd_disk);
+ if (bdev_free && bdev->bd_write_holder) {
+ disk_unblock_events(bdev->bd_disk);
+ bdev->bd_write_holder = false;
}
+ }
- mutex_unlock(&bdev->bd_mutex);
- } else
- disk_check_events(bdev->bd_disk);
+ /*
+ * Trigger event checking and tell drivers to flush MEDIA_CHANGE
+ * event. This is to ensure detection of media removal commanded
+ * from userland - e.g. eject(1).
+ */
+ disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
- return __blkdev_put(bdev, mode, 0);
+ mutex_unlock(&bdev->bd_mutex);
+
+ __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-
- return blkdev_put(bdev, filp->f_mode);
+ blkdev_put(bdev, filp->f_mode);
+ return 0;
}
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
@@ -1492,25 +1572,39 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
* Does not take i_mutex for the write and thus is not for general purpose
* use.
*/
-ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
+ struct blk_plug plug;
ssize_t ret;
- BUG_ON(iocb->ki_pos != pos);
-
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ blk_start_plug(&plug);
+ ret = __generic_file_write_iter(iocb, from);
+ if (ret > 0) {
ssize_t err;
-
- err = generic_write_sync(file, pos, ret);
- if (err < 0 && ret > 0)
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
ret = err;
}
+ blk_finish_plug(&plug);
return ret;
}
-EXPORT_SYMBOL_GPL(blkdev_aio_write);
+EXPORT_SYMBOL_GPL(blkdev_write_iter);
+
+static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *bd_inode = file->f_mapping->host;
+ loff_t size = i_size_read(bd_inode);
+ loff_t pos = iocb->ki_pos;
+
+ if (pos >= size)
+ return 0;
+
+ size -= pos;
+ iov_iter_truncate(to, size);
+ return generic_file_read_iter(iocb, to);
+}
/*
* Try to release a page associated with block device when the system
@@ -1529,22 +1623,22 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
.writepage = blkdev_writepage,
- .sync_page = block_sync_page,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.writepages = generic_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
};
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = blkdev_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = blkdev_read_iter,
+ .write_iter = blkdev_write_iter,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
@@ -1552,7 +1646,7 @@ const struct file_operations def_blk_fops = {
.compat_ioctl = compat_blkdev_ioctl,
#endif
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
@@ -1629,3 +1723,39 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
return res;
}
EXPORT_SYMBOL(__invalidate_device);
+
+void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+{
+ struct inode *inode, *old_inode = NULL;
+
+ spin_lock(&inode_sb_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
+ struct address_space *mapping = inode->i_mapping;
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+ mapping->nrpages == 0) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_sb_list_lock);
+ /*
+ * We hold a reference to 'inode' so it couldn't have been
+ * removed from s_inodes list while we dropped the
+ * inode_sb_list_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it under
+ * inode_sb_list_lock. So we keep the reference and iput it
+ * later.
+ */
+ iput(old_inode);
+ old_inode = inode;
+
+ func(I_BDEV(inode), arg);
+
+ spin_lock(&inode_sb_list_lock);
+ }
+ spin_unlock(&inode_sb_list_lock);
+ iput(old_inode);
+}
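iterate_bdevs() lets a caller walk every block-device inode that has pages in its mapping, dropping inode_sb_list_lock around the callback so the callback may sleep. A minimal hedged sketch of a caller (hypothetical names; assumes the callback only needs the bdev) could look like:

	static void flush_one_bdev(struct block_device *bdev, void *arg)
	{
		/* arg unused here; a real caller might pass a wait flag */
		filemap_fdatawrite(bdev->bd_inode->i_mapping);
	}

	static void flush_all_bdevs(void)
	{
		iterate_bdevs(flush_one_bdev, NULL);
	}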
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be14..a66768ebc8d 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,18 +1,26 @@
config BTRFS_FS
- tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
- depends on EXPERIMENTAL
- select LIBCRC32C
+ tristate "Btrfs filesystem support"
+ select CRYPTO
+ select CRYPTO_CRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
select LZO_DECOMPRESS
+ select RAID6_PQ
+ select XOR_BLOCKS
+
help
- Btrfs is a new filesystem with extents, writable snapshotting,
- support for multiple devices and many more features.
+ Btrfs is a general purpose copy-on-write filesystem with extents,
+ writable snapshotting, support for multiple devices and many more
+ features focused on fault tolerance, repair and easy administration.
+
+ The filesystem disk format is no longer unstable, and it's not
+ expected to change unless there are strong reasons to do so. If there
+	  is a format change, file systems with an unchanged format will
+ continue to be mountable and usable by newer kernels.
- Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
- FINALIZED. You should say N here unless you are interested in
- testing Btrfs with non-critical data.
+ For more information, please see the web pages at
+ http://btrfs.wiki.kernel.org.
To compile this file system support as a module, choose M here. The
module will be called btrfs.
@@ -31,3 +39,52 @@ config BTRFS_FS_POSIX_ACL
Linux website <http://acl.bestbits.at/>.
If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+ bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+ depends on BTRFS_FS
+ help
+ Adds code that examines all block write requests (including
+ writes of the super block). The goal is to verify that the
+ state of the filesystem on disk is always consistent, i.e.,
+ after a power-loss or kernel panic event the filesystem is
+ in a consistent state.
+
+ If the integrity check tool is included and activated in
+ the mount options, plenty of kernel memory is used, and
+ plenty of additional CPU cycles are spent. Enabling this
+ functionality is not intended for normal use.
+
+ In most cases, unless you are a btrfs developer who needs
+ to verify the integrity of (super)-block write requests
+	  during the run of a regression test, say N.
+
+config BTRFS_FS_RUN_SANITY_TESTS
+ bool "Btrfs will run sanity tests upon loading"
+ depends on BTRFS_FS
+ help
+ This will run some basic sanity tests on the free space cache
+ code to make sure it is acting as it should. These are mostly
+ regression tests and are only really interesting to btrfs
+ developers.
+
+ If unsure, say N.
+
+config BTRFS_DEBUG
+ bool "Btrfs debugging support"
+ depends on BTRFS_FS
+ help
+ Enable run-time debugging support for the btrfs filesystem. This may
+ enable additional and expensive checks with negative impact on
+ performance, or export extra information via sysfs.
+
+ If unsure, say N.
+
+config BTRFS_ASSERT
+ bool "Btrfs assert support"
+ depends on BTRFS_FS
+ help
+ Enable run-time assertion checking. This will result in panics if
+ any of the assertions trip. This is meant for btrfs developers only.
+
+ If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 31610ea73ae..6d1d0b93b1a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,14 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
- export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
- compression.o delayed-ref.o relocation.o
+ export.o tree-log.o free-space-cache.o zlib.o lzo.o \
+ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+ uuid-tree.o props.o hash.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
+ tests/extent-buffer-tests.o tests/btrfs-tests.o \
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index de34bfad9ec..9a0124a9585 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -28,22 +28,13 @@
#include "btrfs_inode.h"
#include "xattr.h"
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-
-static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
{
int size;
const char *name;
char *value = NULL;
struct posix_acl *acl;
- if (!IS_POSIXACL(inode))
- return NULL;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
@@ -61,74 +52,42 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
if (!value)
return ERR_PTR(-ENOMEM);
size = __btrfs_getxattr(inode, name, value, size);
- if (size > 0) {
- acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl)) {
- kfree(value);
- return acl;
- }
- set_cached_acl(inode, type, acl);
- }
- kfree(value);
+ }
+ if (size > 0) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
} else if (size == -ENOENT || size == -ENODATA || size == 0) {
/* FIXME, who returns -ENOENT? I think nobody */
acl = NULL;
- set_cached_acl(inode, type, acl);
} else {
acl = ERR_PTR(-EIO);
}
+ kfree(value);
- return acl;
-}
-
-static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int type)
-{
- struct posix_acl *acl;
- int ret = 0;
-
- if (!IS_POSIXACL(dentry->d_inode))
- return -EOPNOTSUPP;
-
- acl = btrfs_get_acl(dentry->d_inode, type);
-
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- ret = posix_acl_to_xattr(acl, value, size);
- posix_acl_release(acl);
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
- return ret;
+ return acl;
}
/*
* Needs to be called with fs_mutex held
*/
-static int btrfs_set_acl(struct btrfs_trans_handle *trans,
+static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
char *value = NULL;
- mode_t mode;
-
- if (acl) {
- ret = posix_acl_valid(acl);
- if (ret < 0)
- return ret;
- ret = 0;
- }
switch (type) {
case ACL_TYPE_ACCESS:
- mode = inode->i_mode;
name = POSIX_ACL_XATTR_ACCESS;
if (acl) {
- ret = posix_acl_equiv_mode(acl, &mode);
+ ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
return ret;
- inode->i_mode = mode;
+ if (ret == 0)
+ acl = NULL;
}
ret = 0;
break;
@@ -149,7 +108,7 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
goto out;
}
- ret = posix_acl_to_xattr(acl, value, size);
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
if (ret < 0)
goto out;
}
@@ -164,55 +123,9 @@ out:
return ret;
}
-static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- int ret;
- struct posix_acl *acl = NULL;
-
- if (!inode_owner_or_capable(dentry->d_inode))
- return -EPERM;
-
- if (!IS_POSIXACL(dentry->d_inode))
- return -EOPNOTSUPP;
-
- if (value) {
- acl = posix_acl_from_xattr(value, size);
- if (acl == NULL) {
- value = NULL;
- size = 0;
- } else if (IS_ERR(acl)) {
- return PTR_ERR(acl);
- }
- }
-
- ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
-
- posix_acl_release(acl);
-
- return ret;
-}
-
-int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
-{
- int error = -EAGAIN;
-
- if (flags & IPERM_FLAG_RCU) {
- if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
- error = -ECHILD;
-
- } else {
- struct posix_acl *acl;
- acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- }
- }
-
- return error;
+ return __btrfs_set_acl(NULL, inode, acl, type);
}
/*
@@ -223,111 +136,31 @@ int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl = NULL;
+ struct posix_acl *default_acl, *acl;
int ret = 0;
/* this happens with subvols */
if (!dir)
return 0;
- if (!S_ISLNK(inode->i_mode)) {
- if (IS_POSIXACL(dir)) {
- acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
+ ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (ret)
+ return ret;
- if (!acl)
- inode->i_mode &= ~current_umask();
+ if (default_acl) {
+ ret = __btrfs_set_acl(trans, inode, default_acl,
+ ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
}
- if (IS_POSIXACL(dir) && acl) {
- struct posix_acl *clone;
- mode_t mode;
-
- if (S_ISDIR(inode->i_mode)) {
- ret = btrfs_set_acl(trans, inode, acl,
- ACL_TYPE_DEFAULT);
- if (ret)
- goto failed;
- }
- clone = posix_acl_clone(acl, GFP_NOFS);
- ret = -ENOMEM;
- if (!clone)
- goto failed;
-
- mode = inode->i_mode;
- ret = posix_acl_create_masq(clone, &mode);
- if (ret >= 0) {
- inode->i_mode = mode;
- if (ret > 0) {
- /* we need an acl */
- ret = btrfs_set_acl(trans, inode, clone,
- ACL_TYPE_ACCESS);
- }
- }
- posix_acl_release(clone);
+ if (acl) {
+ if (!ret)
+ ret = __btrfs_set_acl(trans, inode, acl,
+ ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
}
-failed:
- posix_acl_release(acl);
-
- return ret;
-}
-
-int btrfs_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl, *clone;
- int ret = 0;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- if (!IS_POSIXACL(inode))
- return 0;
-
- acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
-
- clone = posix_acl_clone(acl, GFP_KERNEL);
- posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
-
- ret = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!ret)
- ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS);
-
- posix_acl_release(clone);
+ if (!default_acl && !acl)
+ cache_no_acl(inode);
return ret;
}
-
-const struct xattr_handler btrfs_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = btrfs_xattr_acl_get,
- .set = btrfs_xattr_acl_set,
-};
-
-const struct xattr_handler btrfs_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = btrfs_xattr_acl_get,
- .set = btrfs_xattr_acl_set,
-};
-
-#else /* CONFIG_BTRFS_FS_POSIX_ACL */
-
-int btrfs_acl_chmod(struct inode *inode)
-{
- return 0;
-}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
-{
- return 0;
-}
-
-#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
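With the local xattr handlers and the CONFIG_BTRFS_FS_POSIX_ACL #ifdef removed, btrfs_get_acl() and btrfs_set_acl() become plain helpers that the generic POSIX ACL code can drive directly. As a hedged illustration of how such helpers are typically exposed through inode_operations (illustrative only; the actual btrfs wiring is not shown in this hunk):

	static const struct inode_operations example_dir_iops = {
		/* ... lookup, create and friends elided ... */
		.get_acl	= btrfs_get_acl,
		.set_acl	= btrfs_set_acl,
	};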
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef..5a201d81049 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -21,698 +22,315 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
+#include <linux/workqueue.h>
#include "async-thread.h"
+#include "ctree.h"
+
+#define WORK_DONE_BIT 0
+#define WORK_ORDER_DONE_BIT 1
+#define WORK_HIGH_PRIO_BIT 2
+
+#define NO_THRESHOLD (-1)
+#define DFT_THRESHOLD (32)
+
+struct __btrfs_workqueue {
+ struct workqueue_struct *normal_wq;
+ /* List head pointing to ordered work list */
+ struct list_head ordered_list;
+
+ /* Spinlock for ordered_list */
+ spinlock_t list_lock;
+
+	/* Thresholding related variables */
+ atomic_t pending;
+ int max_active;
+ int current_max;
+ int thresh;
+ unsigned int count;
+ spinlock_t thres_lock;
+};
-#define WORK_QUEUED_BIT 0
-#define WORK_DONE_BIT 1
-#define WORK_ORDER_DONE_BIT 2
-#define WORK_HIGH_PRIO_BIT 3
-
-/*
- * container for the kthread task pointer and the list of pending work
- * One of these is allocated per thread.
- */
-struct btrfs_worker_thread {
- /* pool we belong to */
- struct btrfs_workers *workers;
-
- /* list of struct btrfs_work that are waiting for service */
- struct list_head pending;
- struct list_head prio_pending;
-
- /* list of worker threads from struct btrfs_workers */
- struct list_head worker_list;
+struct btrfs_workqueue {
+ struct __btrfs_workqueue *normal;
+ struct __btrfs_workqueue *high;
+};
- /* kthread */
- struct task_struct *task;
+static inline struct __btrfs_workqueue
+*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+ int thresh)
+{
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- /* number of things on the pending list */
- atomic_t num_pending;
+ if (unlikely(!ret))
+ return NULL;
- /* reference counter for this struct */
- atomic_t refs;
+ ret->max_active = max_active;
+ atomic_set(&ret->pending, 0);
+ if (thresh == 0)
+ thresh = DFT_THRESHOLD;
+ /* For low threshold, disabling threshold is a better choice */
+ if (thresh < DFT_THRESHOLD) {
+ ret->current_max = max_active;
+ ret->thresh = NO_THRESHOLD;
+ } else {
+ ret->current_max = 1;
+ ret->thresh = thresh;
+ }
- unsigned long sequence;
+ if (flags & WQ_HIGHPRI)
+ ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
+ ret->max_active,
+ "btrfs", name);
+ else
+ ret->normal_wq = alloc_workqueue("%s-%s", flags,
+ ret->max_active, "btrfs",
+ name);
+ if (unlikely(!ret->normal_wq)) {
+ kfree(ret);
+ return NULL;
+ }
- /* protects the pending list. */
- spinlock_t lock;
+ INIT_LIST_HEAD(&ret->ordered_list);
+ spin_lock_init(&ret->list_lock);
+ spin_lock_init(&ret->thres_lock);
+ trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
+ return ret;
+}
- /* set to non-zero when this thread is already awake and kicking */
- int working;
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
- /* are we currently idle */
- int idle;
-};
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh)
+{
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
-/*
- * btrfs_start_workers uses kthread_run, which can block waiting for memory
- * for a very long time. It will actually throttle on page writeback,
- * and so it may not make progress until after our btrfs worker threads
- * process all of the pending work structs in their queue
- *
- * This means we can't use btrfs_start_workers from inside a btrfs worker
- * thread that is used as part of cleaning dirty memory, which pretty much
- * involves all of the worker threads.
- *
- * Instead we have a helper queue who never has more than one thread
- * where we scheduler thread start operations. This worker_start struct
- * is used to contain the work and hold a pointer to the queue that needs
- * another worker.
- */
-struct worker_start {
- struct btrfs_work work;
- struct btrfs_workers *queue;
-};
+ if (unlikely(!ret))
+ return NULL;
-static void start_new_worker_func(struct btrfs_work *work)
-{
- struct worker_start *start;
- start = container_of(work, struct worker_start, work);
- btrfs_start_workers(start->queue, 1);
- kfree(start);
-}
+ ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+ max_active, thresh);
+ if (unlikely(!ret->normal)) {
+ kfree(ret);
+ return NULL;
+ }
-static int start_new_worker(struct btrfs_workers *queue)
-{
- struct worker_start *start;
- int ret;
-
- start = kzalloc(sizeof(*start), GFP_NOFS);
- if (!start)
- return -ENOMEM;
-
- start->work.func = start_new_worker_func;
- start->queue = queue;
- ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
- if (ret)
- kfree(start);
+ if (flags & WQ_HIGHPRI) {
+ ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+ thresh);
+ if (unlikely(!ret->high)) {
+ __btrfs_destroy_workqueue(ret->normal);
+ kfree(ret);
+ return NULL;
+ }
+ }
return ret;
}
/*
- * helper function to move a thread onto the idle list after it
- * has finished some requests.
+ * Hook for threshold which will be called in btrfs_queue_work.
+ * This hook WILL be called in IRQ handler context,
+ * so workqueue_set_max_active MUST NOT be called in this hook
*/
-static void check_idle_worker(struct btrfs_worker_thread *worker)
+static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
{
- if (!worker->idle && atomic_read(&worker->num_pending) <
- worker->workers->idle_thresh / 2) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 1;
-
- /* the list may be empty if the worker is just starting */
- if (!list_empty(&worker->worker_list)) {
- list_move(&worker->worker_list,
- &worker->workers->idle_list);
- }
- spin_unlock_irqrestore(&worker->workers->lock, flags);
- }
+ if (wq->thresh == NO_THRESHOLD)
+ return;
+ atomic_inc(&wq->pending);
}
/*
- * helper function to move a thread off the idle list after new
- * pending work is added.
+ * Hook for threshold which will be called before executing the work.
+ * This hook is called in kthread context, so
+ * workqueue_set_max_active is called here.
*/
-static void check_busy_worker(struct btrfs_worker_thread *worker)
-{
- if (worker->idle && atomic_read(&worker->num_pending) >=
- worker->workers->idle_thresh) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 0;
-
- if (!list_empty(&worker->worker_list)) {
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- }
- spin_unlock_irqrestore(&worker->workers->lock, flags);
- }
-}
-
-static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
+static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
{
- struct btrfs_workers *workers = worker->workers;
- unsigned long flags;
+ int new_max_active;
+ long pending;
+ int need_change = 0;
- rmb();
- if (!workers->atomic_start_pending)
+ if (wq->thresh == NO_THRESHOLD)
return;
- spin_lock_irqsave(&workers->lock, flags);
- if (!workers->atomic_start_pending)
- goto out;
-
- workers->atomic_start_pending = 0;
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers)
- goto out;
-
- workers->num_workers_starting += 1;
- spin_unlock_irqrestore(&workers->lock, flags);
- start_new_worker(workers);
- return;
+ atomic_dec(&wq->pending);
+ spin_lock(&wq->thres_lock);
+ /*
+ * Use wq->count to limit the calling frequency of
+ * workqueue_set_max_active.
+ */
+ wq->count++;
+ wq->count %= (wq->thresh / 4);
+ if (!wq->count)
+ goto out;
+ new_max_active = wq->current_max;
+ /*
+	 * pending may change later, but that's OK since we don't need it
+	 * to be that accurate to calculate new_max_active.
+ */
+ pending = atomic_read(&wq->pending);
+ if (pending > wq->thresh)
+ new_max_active++;
+ if (pending < wq->thresh / 2)
+ new_max_active--;
+ new_max_active = clamp_val(new_max_active, 1, wq->max_active);
+ if (new_max_active != wq->current_max) {
+ need_change = 1;
+ wq->current_max = new_max_active;
+ }
out:
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&wq->thres_lock);
+
+ if (need_change) {
+ workqueue_set_max_active(wq->normal_wq, wq->current_max);
+ }
}
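To make the thresholding concrete, a small worked example under assumed numbers (not taken from the patch): with thresh = 32, the recalculation above only runs once every thresh / 4 = 8 executed items; if pending has meanwhile grown past 32 the effective concurrency is bumped by one, if it has dropped below 16 it is reduced by one, and the result is always clamped to the range [1, max_active] before workqueue_set_max_active() is applied outside the spinlock.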
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
- struct btrfs_work *work)
+static void run_ordered_work(struct __btrfs_workqueue *wq)
{
- if (!workers->ordered)
- return 0;
-
- set_bit(WORK_DONE_BIT, &work->flags);
-
- spin_lock(&workers->order_lock);
+ struct list_head *list = &wq->ordered_list;
+ struct btrfs_work *work;
+ spinlock_t *lock = &wq->list_lock;
+ unsigned long flags;
while (1) {
- if (!list_empty(&workers->prio_order_list)) {
- work = list_entry(workers->prio_order_list.next,
- struct btrfs_work, order_list);
- } else if (!list_empty(&workers->order_list)) {
- work = list_entry(workers->order_list.next,
- struct btrfs_work, order_list);
- } else {
+ spin_lock_irqsave(lock, flags);
+ if (list_empty(list))
break;
- }
+ work = list_entry(list->next, struct btrfs_work,
+ ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
- /* we are going to call the ordered done function, but
+ /*
+ * we are going to call the ordered done function, but
* we leave the work item on the list as a barrier so
* that later work items that are done don't have their
* functions called before this one returns
*/
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
-
- spin_unlock(&workers->order_lock);
-
+ trace_btrfs_ordered_sched(work);
+ spin_unlock_irqrestore(lock, flags);
work->ordered_func(work);
- /* now take the lock again and call the freeing code */
- spin_lock(&workers->order_lock);
- list_del(&work->order_list);
- work->ordered_free(work);
- }
+ /* now take the lock again and drop our item from the list */
+ spin_lock_irqsave(lock, flags);
+ list_del(&work->ordered_list);
+ spin_unlock_irqrestore(lock, flags);
- spin_unlock(&workers->order_lock);
- return 0;
-}
-
-static void put_worker(struct btrfs_worker_thread *worker)
-{
- if (atomic_dec_and_test(&worker->refs))
- kfree(worker);
-}
-
-static int try_worker_shutdown(struct btrfs_worker_thread *worker)
-{
- int freeit = 0;
-
- spin_lock_irq(&worker->lock);
- spin_lock(&worker->workers->lock);
- if (worker->workers->num_workers > 1 &&
- worker->idle &&
- !worker->working &&
- !list_empty(&worker->worker_list) &&
- list_empty(&worker->prio_pending) &&
- list_empty(&worker->pending) &&
- atomic_read(&worker->num_pending) == 0) {
- freeit = 1;
- list_del_init(&worker->worker_list);
- worker->workers->num_workers--;
+ /*
+ * we don't want to call the ordered free functions
+ * with the lock held though
+ */
+ work->ordered_free(work);
+ trace_btrfs_all_work_done(work);
}
- spin_unlock(&worker->workers->lock);
- spin_unlock_irq(&worker->lock);
-
- if (freeit)
- put_worker(worker);
- return freeit;
+ spin_unlock_irqrestore(lock, flags);
}
-static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
- struct list_head *prio_head,
- struct list_head *head)
+static void normal_work_helper(struct work_struct *arg)
{
- struct btrfs_work *work = NULL;
- struct list_head *cur = NULL;
-
- if(!list_empty(prio_head))
- cur = prio_head->next;
-
- smp_mb();
- if (!list_empty(&worker->prio_pending))
- goto refill;
-
- if (!list_empty(head))
- cur = head->next;
-
- if (cur)
- goto out;
-
-refill:
- spin_lock_irq(&worker->lock);
- list_splice_tail_init(&worker->prio_pending, prio_head);
- list_splice_tail_init(&worker->pending, head);
-
- if (!list_empty(prio_head))
- cur = prio_head->next;
- else if (!list_empty(head))
- cur = head->next;
- spin_unlock_irq(&worker->lock);
-
- if (!cur)
- goto out_fail;
-
-out:
- work = list_entry(cur, struct btrfs_work, list);
-
-out_fail:
- return work;
-}
-
-/*
- * main loop for servicing work items
- */
-static int worker_loop(void *arg)
-{
- struct btrfs_worker_thread *worker = arg;
- struct list_head head;
- struct list_head prio_head;
struct btrfs_work *work;
+ struct __btrfs_workqueue *wq;
+ int need_order = 0;
- INIT_LIST_HEAD(&head);
- INIT_LIST_HEAD(&prio_head);
-
- do {
-again:
- while (1) {
-
-
- work = get_next_work(worker, &prio_head, &head);
- if (!work)
- break;
-
- list_del(&work->list);
- clear_bit(WORK_QUEUED_BIT, &work->flags);
-
- work->worker = worker;
-
- work->func(work);
-
- atomic_dec(&worker->num_pending);
- /*
- * unless this is an ordered work queue,
- * 'work' was probably freed by func above.
- */
- run_ordered_completions(worker->workers, work);
-
- check_pending_worker_creates(worker);
-
- }
-
- spin_lock_irq(&worker->lock);
- check_idle_worker(worker);
-
- if (freezing(current)) {
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
- refrigerator();
- } else {
- spin_unlock_irq(&worker->lock);
- if (!kthread_should_stop()) {
- cpu_relax();
- /*
- * we've dropped the lock, did someone else
- * jump_in?
- */
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- /*
- * this short schedule allows more work to
- * come in without the queue functions
- * needing to go through wake_up_process()
- *
- * worker->working is still 1, so nobody
- * is going to try and wake us up
- */
- schedule_timeout(1);
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- if (kthread_should_stop())
- break;
-
- /* still no more work?, sleep for real */
- spin_lock_irq(&worker->lock);
- set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending)) {
- spin_unlock_irq(&worker->lock);
- set_current_state(TASK_RUNNING);
- goto again;
- }
-
- /*
- * this makes sure we get a wakeup when someone
- * adds something new to the queue
- */
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
-
- if (!kthread_should_stop()) {
- schedule_timeout(HZ * 120);
- if (!worker->working &&
- try_worker_shutdown(worker)) {
- return 0;
- }
- }
- }
- __set_current_state(TASK_RUNNING);
- }
- } while (!kthread_should_stop());
- return 0;
-}
-
-/*
- * this will wait for all the worker threads to shutdown
- */
-int btrfs_stop_workers(struct btrfs_workers *workers)
-{
- struct list_head *cur;
- struct btrfs_worker_thread *worker;
- int can_stop;
-
- spin_lock_irq(&workers->lock);
- list_splice_init(&workers->idle_list, &workers->worker_list);
- while (!list_empty(&workers->worker_list)) {
- cur = workers->worker_list.next;
- worker = list_entry(cur, struct btrfs_worker_thread,
- worker_list);
-
- atomic_inc(&worker->refs);
- workers->num_workers -= 1;
- if (!list_empty(&worker->worker_list)) {
- list_del_init(&worker->worker_list);
- put_worker(worker);
- can_stop = 1;
- } else
- can_stop = 0;
- spin_unlock_irq(&workers->lock);
- if (can_stop)
- kthread_stop(worker->task);
- spin_lock_irq(&workers->lock);
- put_worker(worker);
+ work = container_of(arg, struct btrfs_work, normal_work);
+ /*
+ * We should not touch things inside work in the following cases:
+ * 1) after work->func() if it has no ordered_free
+ * Since the struct is freed in work->func().
+ * 2) after setting WORK_DONE_BIT
+ * The work may be freed in other threads almost instantly.
+ * So we save the needed things here.
+ */
+ if (work->ordered_func)
+ need_order = 1;
+ wq = work->wq;
+
+ trace_btrfs_work_sched(work);
+ thresh_exec_hook(wq);
+ work->func(work);
+ if (need_order) {
+ set_bit(WORK_DONE_BIT, &work->flags);
+ run_ordered_work(wq);
}
- spin_unlock_irq(&workers->lock);
- return 0;
+ if (!need_order)
+ trace_btrfs_all_work_done(work);
}
-/*
- * simple init on struct btrfs_workers
- */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_helper)
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free)
{
- workers->num_workers = 0;
- workers->num_workers_starting = 0;
- INIT_LIST_HEAD(&workers->worker_list);
- INIT_LIST_HEAD(&workers->idle_list);
- INIT_LIST_HEAD(&workers->order_list);
- INIT_LIST_HEAD(&workers->prio_order_list);
- spin_lock_init(&workers->lock);
- spin_lock_init(&workers->order_lock);
- workers->max_workers = max;
- workers->idle_thresh = 32;
- workers->name = name;
- workers->ordered = 0;
- workers->atomic_start_pending = 0;
- workers->atomic_worker_start = async_helper;
+ work->func = func;
+ work->ordered_func = ordered_func;
+ work->ordered_free = ordered_free;
+ INIT_WORK(&work->normal_work, normal_work_helper);
+ INIT_LIST_HEAD(&work->ordered_list);
+ work->flags = 0;
}
-/*
- * starts new worker threads. This does not enforce the max worker
- * count in case you need to temporarily go past it.
- */
-static int __btrfs_start_workers(struct btrfs_workers *workers,
- int num_workers)
+static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- int ret = 0;
- int i;
-
- for (i = 0; i < num_workers; i++) {
- worker = kzalloc(sizeof(*worker), GFP_NOFS);
- if (!worker) {
- ret = -ENOMEM;
- goto fail;
- }
+ unsigned long flags;
- INIT_LIST_HEAD(&worker->pending);
- INIT_LIST_HEAD(&worker->prio_pending);
- INIT_LIST_HEAD(&worker->worker_list);
- spin_lock_init(&worker->lock);
-
- atomic_set(&worker->num_pending, 0);
- atomic_set(&worker->refs, 1);
- worker->workers = workers;
- worker->task = kthread_run(worker_loop, worker,
- "btrfs-%s-%d", workers->name,
- workers->num_workers + i);
- if (IS_ERR(worker->task)) {
- ret = PTR_ERR(worker->task);
- kfree(worker);
- goto fail;
- }
- spin_lock_irq(&workers->lock);
- list_add_tail(&worker->worker_list, &workers->idle_list);
- worker->idle = 1;
- workers->num_workers++;
- workers->num_workers_starting--;
- WARN_ON(workers->num_workers_starting < 0);
- spin_unlock_irq(&workers->lock);
+ work->wq = wq;
+ thresh_queue_hook(wq);
+ if (work->ordered_func) {
+ spin_lock_irqsave(&wq->list_lock, flags);
+ list_add_tail(&work->ordered_list, &wq->ordered_list);
+ spin_unlock_irqrestore(&wq->list_lock, flags);
}
- return 0;
-fail:
- btrfs_stop_workers(workers);
- return ret;
+ queue_work(wq->normal_wq, &work->normal_work);
+ trace_btrfs_work_queued(work);
}
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- spin_lock_irq(&workers->lock);
- workers->num_workers_starting += num_workers;
- spin_unlock_irq(&workers->lock);
- return __btrfs_start_workers(workers, num_workers);
-}
+ struct __btrfs_workqueue *dest_wq;
-/*
- * run through the list and find a worker thread that doesn't have a lot
- * to do right now. This can return null if we aren't yet at the thread
- * count limit and all of the threads are busy.
- */
-static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
-{
- struct btrfs_worker_thread *worker;
- struct list_head *next;
- int enforce_min;
-
- enforce_min = (workers->num_workers + workers->num_workers_starting) <
- workers->max_workers;
-
- /*
- * if we find an idle thread, don't move it to the end of the
- * idle list. This improves the chance that the next submission
- * will reuse the same thread, and maybe catch it while it is still
- * working
- */
- if (!list_empty(&workers->idle_list)) {
- next = workers->idle_list.next;
- worker = list_entry(next, struct btrfs_worker_thread,
- worker_list);
- return worker;
- }
- if (enforce_min || list_empty(&workers->worker_list))
- return NULL;
-
- /*
- * if we pick a busy task, move the task to the end of the list.
- * hopefully this will keep things somewhat evenly balanced.
- * Do the move in batches based on the sequence number. This groups
- * requests submitted at roughly the same time onto the same worker.
- */
- next = workers->worker_list.next;
- worker = list_entry(next, struct btrfs_worker_thread, worker_list);
- worker->sequence++;
-
- if (worker->sequence % workers->idle_thresh == 0)
- list_move_tail(next, &workers->worker_list);
- return worker;
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
+ dest_wq = wq->high;
+ else
+ dest_wq = wq->normal;
+ __btrfs_queue_work(dest_wq, work);
}
-/*
- * selects a worker thread to take the next job. This will either find
- * an idle worker, start a new worker up to the max count, or just return
- * one of the existing busy workers.
- */
-static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- struct list_head *fallback;
-
-again:
- spin_lock_irqsave(&workers->lock, flags);
- worker = next_worker(workers);
-
- if (!worker) {
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers) {
- goto fallback;
- } else if (workers->atomic_worker_start) {
- workers->atomic_start_pending = 1;
- goto fallback;
- } else {
- workers->num_workers_starting++;
- spin_unlock_irqrestore(&workers->lock, flags);
- /* we're below the limit, start another worker */
- __btrfs_start_workers(workers, 1);
- goto again;
- }
- }
- goto found;
-
-fallback:
- fallback = NULL;
- /*
- * we have failed to find any workers, just
- * return the first one we can find.
- */
- if (!list_empty(&workers->worker_list))
- fallback = workers->worker_list.next;
- if (!list_empty(&workers->idle_list))
- fallback = workers->idle_list.next;
- BUG_ON(!fallback);
- worker = list_entry(fallback,
- struct btrfs_worker_thread, worker_list);
-found:
- /*
- * this makes sure the worker doesn't exit before it is placed
- * onto a busy/idle list
- */
- atomic_inc(&worker->num_pending);
- spin_unlock_irqrestore(&workers->lock, flags);
- return worker;
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
+ kfree(wq);
}
-/*
- * btrfs_requeue_work just puts the work item back on the tail of the list
- * it was taken from. It is intended for use with long running work functions
- * that make some progress and want to give the cpu up for others.
- */
-int btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker = work->worker;
- unsigned long flags;
- int wake = 0;
-
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- goto out;
-
- spin_lock_irqsave(&worker->lock, flags);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- atomic_inc(&worker->num_pending);
-
- /* by definition we're busy, take ourselves off the idle
- * list
- */
- if (worker->idle) {
- spin_lock(&worker->workers->lock);
- worker->idle = 0;
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- spin_unlock(&worker->workers->lock);
- }
- if (!worker->working) {
- wake = 1;
- worker->working = 1;
- }
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
-out:
-
- return 0;
+ if (!wq)
+ return;
+ if (wq->high)
+ __btrfs_destroy_workqueue(wq->high);
+ __btrfs_destroy_workqueue(wq->normal);
+ kfree(wq);
}
-void btrfs_set_work_high_prio(struct btrfs_work *work)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ if (!wq)
+ return;
+ wq->normal->max_active = max;
+ if (wq->high)
+ wq->high->max_active = max;
}
-/*
- * places a struct btrfs_work into the pending queue of one of the kthreads
- */
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_set_work_high_priority(struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- int wake = 0;
-
- /* don't requeue something already on a list */
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- goto out;
-
- worker = find_worker(workers);
- if (workers->ordered) {
- /*
- * you're not allowed to do ordered queues from an
- * interrupt handler
- */
- spin_lock(&workers->order_lock);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
- list_add_tail(&work->order_list,
- &workers->prio_order_list);
- } else {
- list_add_tail(&work->order_list, &workers->order_list);
- }
- spin_unlock(&workers->order_lock);
- } else {
- INIT_LIST_HEAD(&work->order_list);
- }
-
- spin_lock_irqsave(&worker->lock, flags);
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- check_busy_worker(worker);
-
- /*
- * avoid calling into wake_up_process if this thread has already
- * been kicked
- */
- if (!worker->working)
- wake = 1;
- worker->working = 1;
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
-
-out:
- return 0;
+ set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746cf85..9c6b66d15fb 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -19,101 +20,35 @@
#ifndef __BTRFS_ASYNC_THREAD_
#define __BTRFS_ASYNC_THREAD_
-struct btrfs_worker_thread;
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-/*
- * This is similar to a workqueue, but it is meant to spread the operations
- * across all available cpus instead of just the CPU that was used to
- * queue the work. There is also some batching introduced to try and
- * cut down on context switches.
- *
- * By default threads are added on demand up to 2 * the number of cpus.
- * Changing struct btrfs_workers->max_workers is one way to prevent
- * demand creation of kthreads.
- *
- * the basic model of these worker threads is to embed a btrfs_work
- * structure in your own data struct, and use container_of in a
- * work function to get back to your data struct.
- */
struct btrfs_work {
- /*
- * func should be set to the function you want called
- * your work struct is passed as the only arg
- *
- * ordered_func must be set for work sent to an ordered work queue,
- * and it is called to complete a given work item in the same
- * order they were sent to the queue.
- */
- void (*func)(struct btrfs_work *work);
- void (*ordered_func)(struct btrfs_work *work);
- void (*ordered_free)(struct btrfs_work *work);
-
- /*
- * flags should be set to zero. It is used to make sure the
- * struct is only inserted once into the list.
- */
+ btrfs_func_t func;
+ btrfs_func_t ordered_func;
+ btrfs_func_t ordered_free;
+
+ /* Don't touch things below */
+ struct work_struct normal_work;
+ struct list_head ordered_list;
+ struct __btrfs_workqueue *wq;
unsigned long flags;
-
- /* don't touch these */
- struct btrfs_worker_thread *worker;
- struct list_head list;
- struct list_head order_list;
-};
-
-struct btrfs_workers {
- /* current number of running workers */
- int num_workers;
-
- int num_workers_starting;
-
- /* max number of workers allowed. changed by btrfs_start_workers */
- int max_workers;
-
- /* once a worker has this many requests or fewer, it is idle */
- int idle_thresh;
-
- /* force completions in the order they were queued */
- int ordered;
-
- /* more workers required, but in an interrupt handler */
- int atomic_start_pending;
-
- /*
- * are we allowed to sleep while starting workers or are we required
- * to start them at a later time? If we can't sleep, this indicates
- * which queue we need to use to schedule thread creation.
- */
- struct btrfs_workers *atomic_worker_start;
-
- /* list with all the work threads. The workers on the idle thread
- * may be actively servicing jobs, but they haven't yet hit the
- * idle thresh limit above.
- */
- struct list_head worker_list;
- struct list_head idle_list;
-
- /*
- * when operating in ordered mode, this maintains the list
- * of work items waiting for completion
- */
- struct list_head order_list;
- struct list_head prio_order_list;
-
- /* lock for finding the next worker thread to queue on */
- spinlock_t lock;
-
- /* lock for the ordered lists */
- spinlock_t order_lock;
-
- /* extra name for this worker, used for current->name */
- char *name;
};
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
-int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_starter);
-int btrfs_requeue_work(struct btrfs_work *work);
-void btrfs_set_work_high_prio(struct btrfs_work *work);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh);
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
#endif
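The new header reduces the old kthread pool to a small API: allocate a queue, initialize a work item, queue it, and destroy the queue when done. A minimal hedged sketch of a caller (hypothetical demo_* names; WQ_UNBOUND and GFP_NOFS are assumptions, not part of this patch):

	struct demo_work {
		struct btrfs_work work;
		/* caller-specific payload would live here */
	};

	static void demo_func(struct btrfs_work *work)
	{
		/* the heavy lifting; may run concurrently with other items */
	}

	static void demo_ordered(struct btrfs_work *work)
	{
		/* completion code that must run in queueing order */
	}

	static void demo_free(struct btrfs_work *work)
	{
		kfree(container_of(work, struct demo_work, work));
	}

	static int demo_submit(struct btrfs_workqueue *wq)
	{
		struct demo_work *dw = kzalloc(sizeof(*dw), GFP_NOFS);

		if (!dw)
			return -ENOMEM;
		btrfs_init_work(&dw->work, demo_func, demo_ordered, demo_free);
		btrfs_queue_work(wq, &dw->work);
		return 0;
	}

The queue itself would come from btrfs_alloc_workqueue("demo", WQ_UNBOUND, 4, 0) and be torn down with btrfs_destroy_workqueue() after all items complete; when no ordering is needed, ordered_func and ordered_free may be NULL, in which case the work function must free its own item.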
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 00000000000..e25564bfcb4
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,1883 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
+#include "locking.h"
+
+struct extent_inode_elem {
+ u64 inum;
+ u64 offset;
+ struct extent_inode_elem *next;
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+ struct btrfs_file_extent_item *fi,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
+ u64 offset = 0;
+ struct extent_inode_elem *e;
+
+ if (!btrfs_file_extent_compression(eb, fi) &&
+ !btrfs_file_extent_encryption(eb, fi) &&
+ !btrfs_file_extent_other_encoding(eb, fi)) {
+ u64 data_offset;
+ u64 data_len;
+
+ data_offset = btrfs_file_extent_offset(eb, fi);
+ data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+ if (extent_item_pos < data_offset ||
+ extent_item_pos >= data_offset + data_len)
+ return 1;
+ offset = extent_item_pos - data_offset;
+ }
+
+ e = kmalloc(sizeof(*e), GFP_NOFS);
+ if (!e)
+ return -ENOMEM;
+
+ e->next = *eie;
+ e->inum = key->objectid;
+ e->offset = key->offset + offset;
+ *eie = e;
+
+ return 0;
+}
+
+static void free_inode_elem_list(struct extent_inode_elem *eie)
+{
+ struct extent_inode_elem *eie_next;
+
+ for (; eie; eie = eie_next) {
+ eie_next = eie->next;
+ kfree(eie);
+ }
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
+ u64 disk_byte;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int slot;
+ int nritems;
+ int extent_type;
+ int ret;
+
+ /*
+ * from the shared data ref, we only have the leaf but we need
+ * the key. thus, we must look into all items and see that we
+ * find one (some) with a reference to our extent item.
+ */
+ nritems = btrfs_header_nritems(eb);
+ for (slot = 0; slot < nritems; ++slot) {
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte != wanted_disk_byte)
+ continue;
+
+ ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
+ struct list_head list;
+ u64 root_id;
+ struct btrfs_key key_for_search;
+ int level;
+ int count;
+ struct extent_inode_elem *inode_list;
+ u64 parent;
+ u64 wanted_disk_byte;
+};
+
+static struct kmem_cache *btrfs_prelim_ref_cache;
+
+int __init btrfs_prelim_ref_init(void)
+{
+ btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
+ sizeof(struct __prelim_ref),
+ 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_prelim_ref_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_prelim_ref_exit(void)
+{
+ if (btrfs_prelim_ref_cache)
+ kmem_cache_destroy(btrfs_prelim_ref_cache);
+}
+
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ * block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | - | -
+ * key to resolve | - | y | y | y
+ * tree block logical | - | - | - | -
+ * root for resolving | y | y | y | y
+ *
+ * - column 1:           we have the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | y | -
+ * key to resolve | - | - | - | y
+ * tree block logical | y | y | y | y
+ * root for resolving | - | y | y | y
+ *
+ * - column 1, 3:        we have the parent -> done
+ * - column 2: we take the first key from the block to find the parent
+ * (see __add_missing_keys)
+ * - column 4: we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+ struct btrfs_key *key, int level,
+ u64 parent, u64 wanted_disk_byte, int count,
+ gfp_t gfp_mask)
+{
+ struct __prelim_ref *ref;
+
+ if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ return 0;
+
+ ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
+ if (!ref)
+ return -ENOMEM;
+
+ ref->root_id = root_id;
+ if (key)
+ ref->key_for_search = *key;
+ else
+ memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+
+ ref->inode_list = NULL;
+ ref->level = level;
+ ref->count = count;
+ ref->parent = parent;
+ ref->wanted_disk_byte = wanted_disk_byte;
+ list_add_tail(&ref->list, head);
+
+ return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+ struct ulist *parents, struct __prelim_ref *ref,
+ int level, u64 time_seq, const u64 *extent_item_pos,
+ u64 total_refs)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ struct btrfs_key *key_for_search = &ref->key_for_search;
+ struct btrfs_file_extent_item *fi;
+ struct extent_inode_elem *eie = NULL, *old = NULL;
+ u64 disk_byte;
+ u64 wanted_disk_byte = ref->wanted_disk_byte;
+ u64 count = 0;
+
+ if (level != 0) {
+ eb = path->nodes[level];
+ ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+ if (ret < 0)
+ return ret;
+ return 0;
+ }
+
+ /*
+ * We normally enter this function with the path already pointing to
+ * the first item to check. But sometimes, we may enter it with
+ * slot==nritems. In that case, go to the next leaf before we continue.
+ */
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+
+ while (!ret && count < total_refs) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+
+ if (key.objectid != key_for_search->objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+
+ if (disk_byte == wanted_disk_byte) {
+ eie = NULL;
+ old = NULL;
+ count++;
+ if (extent_item_pos) {
+ ret = check_extent_in_eb(&key, eb, fi,
+ *extent_item_pos,
+ &eie);
+ if (ret < 0)
+ break;
+ }
+ if (ret > 0)
+ goto next;
+ ret = ulist_add_merge(parents, eb->start,
+ (uintptr_t)eie,
+ (u64 *)&old, GFP_NOFS);
+ if (ret < 0)
+ break;
+ if (!ret && extent_item_pos) {
+ while (old->next)
+ old = old->next;
+ old->next = eie;
+ }
+ eie = NULL;
+ }
+next:
+ ret = btrfs_next_old_item(root, path, time_seq);
+ }
+
+ if (ret > 0)
+ ret = 0;
+ else if (ret < 0)
+ free_inode_elem_list(eie);
+ return ret;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 time_seq,
+ struct __prelim_ref *ref,
+ struct ulist *parents,
+ const u64 *extent_item_pos, u64 total_refs)
+{
+ struct btrfs_root *root;
+ struct btrfs_key root_key;
+ struct extent_buffer *eb;
+ int ret = 0;
+ int root_level;
+ int level = ref->level;
+ int index;
+
+ root_key.objectid = ref->root_id;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ if (IS_ERR(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ if (path->search_commit_root)
+ root_level = btrfs_header_level(root->commit_root);
+ else
+ root_level = btrfs_old_root_level(root, time_seq);
+
+ if (root_level + 1 == level) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ goto out;
+ }
+
+ path->lowest_level = level;
+ ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+
+ /* root node has been locked, we can release @subvol_srcu safely here */
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+ "%d for key (%llu %u %llu)\n",
+ ref->root_id, level, ref->count, ret,
+ ref->key_for_search.objectid, ref->key_for_search.type,
+ ref->key_for_search.offset);
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[level];
+ while (!eb) {
+ if (WARN_ON(!level)) {
+ ret = 1;
+ goto out;
+ }
+ level--;
+ eb = path->nodes[level];
+ }
+
+ ret = add_all_parents(root, path, parents, ref, level, time_seq,
+ extent_item_pos, total_refs);
+out:
+ path->lowest_level = 0;
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 time_seq,
+ struct list_head *head,
+ const u64 *extent_item_pos, u64 total_refs)
+{
+ int err;
+ int ret = 0;
+ struct __prelim_ref *ref;
+ struct __prelim_ref *ref_safe;
+ struct __prelim_ref *new_ref;
+ struct ulist *parents;
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ parents = ulist_alloc(GFP_NOFS);
+ if (!parents)
+ return -ENOMEM;
+
+ /*
+ * _safe allows us to insert directly after the current item without
+ * iterating over the newly inserted items.
+ * we're also allowed to re-assign ref during iteration.
+ */
+ list_for_each_entry_safe(ref, ref_safe, head, list) {
+ if (ref->parent) /* already direct */
+ continue;
+ if (ref->count == 0)
+ continue;
+ err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+ parents, extent_item_pos,
+ total_refs);
+ /*
+		 * we can only tolerate ENOENT; otherwise, we should catch the
+		 * error and return directly.
+ */
+ if (err == -ENOENT) {
+ continue;
+ } else if (err) {
+ ret = err;
+ goto out;
+ }
+
+ /* we put the first parent into the ref at hand */
+ ULIST_ITER_INIT(&uiter);
+ node = ulist_next(parents, &uiter);
+ ref->parent = node ? node->val : 0;
+ ref->inode_list = node ?
+ (struct extent_inode_elem *)(uintptr_t)node->aux : NULL;
+
+ /* additional parents require new refs being added here */
+ while ((node = ulist_next(parents, &uiter))) {
+ new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
+ GFP_NOFS);
+ if (!new_ref) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(new_ref, ref, sizeof(*ref));
+ new_ref->parent = node->val;
+ new_ref->inode_list = (struct extent_inode_elem *)
+ (uintptr_t)node->aux;
+ list_add(&new_ref->list, &ref->list);
+ }
+ ulist_reinit(parents);
+ }
+out:
+ ulist_free(parents);
+ return ret;
+}
+
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+ struct __prelim_ref *ref2)
+{
+ if (ref1->level != ref2->level)
+ return 0;
+ if (ref1->root_id != ref2->root_id)
+ return 0;
+ if (ref1->key_for_search.type != ref2->key_for_search.type)
+ return 0;
+ if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+ return 0;
+ if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+ return 0;
+ if (ref1->parent != ref2->parent)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+ struct list_head *head)
+{
+ struct list_head *pos;
+ struct extent_buffer *eb;
+
+ list_for_each(pos, head) {
+ struct __prelim_ref *ref;
+ ref = list_entry(pos, struct __prelim_ref, list);
+
+ if (ref->parent)
+ continue;
+ if (ref->key_for_search.type)
+ continue;
+ BUG_ON(!ref->wanted_disk_byte);
+ eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+ fs_info->tree_root->leafsize, 0);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
+ btrfs_tree_read_lock(eb);
+ if (btrfs_header_level(eb) == 0)
+ btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return 0;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ * additionally, we could even add a key range for the blocks we
+ * looked into to merge even more (-> replace unresolved refs by those
+ * having a parent).
+ * mode = 2: merge identical parents
+ */
+static void __merge_refs(struct list_head *head, int mode)
+{
+ struct list_head *pos1;
+
+ list_for_each(pos1, head) {
+ struct list_head *n2;
+ struct list_head *pos2;
+ struct __prelim_ref *ref1;
+
+ ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+ for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+ pos2 = n2, n2 = pos2->next) {
+ struct __prelim_ref *ref2;
+ struct __prelim_ref *xchg;
+ struct extent_inode_elem *eie;
+
+ ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+ if (mode == 1) {
+ if (!ref_for_same_block(ref1, ref2))
+ continue;
+ if (!ref1->parent && ref2->parent) {
+ xchg = ref1;
+ ref1 = ref2;
+ ref2 = xchg;
+ }
+ } else {
+ if (ref1->parent != ref2->parent)
+ continue;
+ }
+
+ eie = ref1->inode_list;
+ while (eie && eie->next)
+ eie = eie->next;
+ if (eie)
+ eie->next = ref2->inode_list;
+ else
+ ref1->inode_list = ref2->inode_list;
+ ref1->count += ref2->count;
+
+ list_del(&ref2->list);
+ kmem_cache_free(btrfs_prelim_ref_cache, ref2);
+ }
+
+ }
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller than or equal to seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+ struct list_head *prefs, u64 *total_refs)
+{
+ struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+ struct rb_node *n = &head->node.rb_node;
+ struct btrfs_key key;
+ struct btrfs_key op_key = {0};
+ int sgn;
+ int ret = 0;
+
+ if (extent_op && extent_op->update_key)
+ btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
+
+ spin_lock(&head->lock);
+ n = rb_first(&head->ref_root);
+ while (n) {
+ struct btrfs_delayed_ref_node *node;
+ node = rb_entry(n, struct btrfs_delayed_ref_node,
+ rb_node);
+ n = rb_next(n);
+ if (node->seq > seq)
+ continue;
+
+ switch (node->action) {
+ case BTRFS_ADD_DELAYED_EXTENT:
+ case BTRFS_UPDATE_DELAYED_HEAD:
+ WARN_ON(1);
+ continue;
+ case BTRFS_ADD_DELAYED_REF:
+ sgn = 1;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ sgn = -1;
+ break;
+ default:
+ BUG_ON(1);
+ }
+ *total_refs += (node->ref_mod * sgn);
+ switch (node->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, &op_key,
+ ref->level + 1, 0, node->bytenr,
+ node->ref_mod * sgn, GFP_ATOMIC);
+ break;
+ }
+ case BTRFS_SHARED_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, NULL,
+ ref->level + 1, ref->parent,
+ node->bytenr,
+ node->ref_mod * sgn, GFP_ATOMIC);
+ break;
+ }
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+ node->bytenr,
+ node->ref_mod * sgn, GFP_ATOMIC);
+ break;
+ }
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+ ref->parent, node->bytenr,
+ node->ref_mod * sgn, GFP_ATOMIC);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ if (ret)
+ break;
+ }
+ spin_unlock(&head->lock);
+ return ret;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ int *info_level, struct list_head *prefs,
+ u64 *total_refs)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ unsigned long ptr;
+ unsigned long end;
+ struct btrfs_extent_item *ei;
+ u64 flags;
+ u64 item_size;
+
+ /*
+ * enumerate all inline refs
+ */
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+ *total_refs += btrfs_extent_refs(leaf, ei);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + item_size;
+
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)ptr;
+ *info_level = btrfs_tree_block_level(leaf, info);
+ ptr += sizeof(struct btrfs_tree_block_info);
+ BUG_ON(ptr > end);
+ } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+ *info_level = found_key.offset;
+ } else {
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+ }
+
+ while (ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ u64 offset;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+ offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+ switch (type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, NULL,
+ *info_level + 1, offset,
+ bytenr, 1, GFP_NOFS);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+ bytenr, count, GFP_NOFS);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, offset, NULL,
+ *info_level + 1, 0,
+ bytenr, 1, GFP_NOFS);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count, GFP_NOFS);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ if (ret)
+ return ret;
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+
+ return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ int info_level, struct list_head *prefs)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ while (1) {
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = 0;
+ break;
+ }
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.objectid != bytenr)
+ break;
+ if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+ continue;
+ if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+ break;
+
+ switch (key.type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, NULL,
+ info_level + 1, key.offset,
+ bytenr, 1, GFP_NOFS);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_shared_data_ref);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+ bytenr, count, GFP_NOFS);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, key.offset, NULL,
+ info_level + 1, 0,
+ bytenr, 1, GFP_NOFS);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_extent_data_ref);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count, GFP_NOFS);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ if (ret)
+ return ret;
+
+ }
+
+ return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist *refs,
+ struct ulist *roots, const u64 *extent_item_pos)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_delayed_ref_root *delayed_refs = NULL;
+ struct btrfs_delayed_ref_head *head;
+ int info_level = 0;
+ int ret;
+ struct list_head prefs_delayed;
+ struct list_head prefs;
+ struct __prelim_ref *ref;
+ struct extent_inode_elem *eie = NULL;
+ u64 total_refs = 0;
+
+ INIT_LIST_HEAD(&prefs);
+ INIT_LIST_HEAD(&prefs_delayed);
+
+ key.objectid = bytenr;
+ key.offset = (u64)-1;
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ if (!trans) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
+
+ /*
+ * grab both a lock on the path and a lock on the delayed ref head.
+ * We need both to get a consistent picture of how the refs look
+ * at a specified point in time
+ */
+again:
+ head = NULL;
+
+ ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (trans && likely(trans->type != __TRANS_DUMMY)) {
+#else
+ if (trans) {
+#endif
+ /*
+ * look if there are updates for this ref queued and lock the
+ * head
+ */
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's
+ * released and try again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ goto again;
+ }
+ spin_unlock(&delayed_refs->lock);
+ ret = __add_delayed_refs(head, time_seq,
+ &prefs_delayed, &total_refs);
+ mutex_unlock(&head->mutex);
+ if (ret)
+ goto out;
+ } else {
+ spin_unlock(&delayed_refs->lock);
+ }
+ }
+
+ if (path->slots[0]) {
+ struct extent_buffer *leaf;
+ int slot;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid == bytenr &&
+ (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY)) {
+ ret = __add_inline_refs(fs_info, path, bytenr,
+ &info_level, &prefs,
+ &total_refs);
+ if (ret)
+ goto out;
+ ret = __add_keyed_refs(fs_info, path, bytenr,
+ info_level, &prefs);
+ if (ret)
+ goto out;
+ }
+ }
+ btrfs_release_path(path);
+
+ list_splice_init(&prefs_delayed, &prefs);
+
+ ret = __add_missing_keys(fs_info, &prefs);
+ if (ret)
+ goto out;
+
+ __merge_refs(&prefs, 1);
+
+ ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+ extent_item_pos, total_refs);
+ if (ret)
+ goto out;
+
+ __merge_refs(&prefs, 2);
+
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ WARN_ON(ref->count < 0);
+ if (roots && ref->count && ref->root_id && ref->parent == 0) {
+ /* no parent == root of tree */
+ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ }
+ if (ref->count && ref->parent) {
+ if (extent_item_pos && !ref->inode_list &&
+ ref->level == 0) {
+ u32 bsz;
+ struct extent_buffer *eb;
+ bsz = btrfs_level_size(fs_info->extent_root,
+ ref->level);
+ eb = read_tree_block(fs_info->extent_root,
+ ref->parent, bsz, 0);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+ ret = find_extent_in_eb(eb, bytenr,
+ *extent_item_pos, &eie);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ goto out;
+ ref->inode_list = eie;
+ }
+ ret = ulist_add_merge(refs, ref->parent,
+ (uintptr_t)ref->inode_list,
+ (u64 *)&eie, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ if (!ret && extent_item_pos) {
+ /*
+ * we've recorded that parent, so we must extend
+ * its inode list here
+ */
+ BUG_ON(!eie);
+ while (eie->next)
+ eie = eie->next;
+ eie->next = ref->inode_list;
+ }
+ eie = NULL;
+ }
+ list_del(&ref->list);
+ kmem_cache_free(btrfs_prelim_ref_cache, ref);
+ }
+
+out:
+ btrfs_free_path(path);
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ list_del(&ref->list);
+ kmem_cache_free(btrfs_prelim_ref_cache, ref);
+ }
+ while (!list_empty(&prefs_delayed)) {
+ ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+ list);
+ list_del(&ref->list);
+ kmem_cache_free(btrfs_prelim_ref_cache, ref);
+ }
+ if (ret < 0)
+ free_inode_elem_list(eie);
+ return ret;
+}
+
+static void free_leaf_list(struct ulist *blocks)
+{
+ struct ulist_node *node = NULL;
+ struct extent_inode_elem *eie;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(blocks, &uiter))) {
+ if (!node->aux)
+ continue;
+ eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
+ free_inode_elem_list(eie);
+ node->aux = 0;
+ }
+
+ ulist_free(blocks);
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. The leafs will be stored in the leafs ulist, which must be freed
+ * with free_leaf_list().
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos)
+{
+ int ret;
+
+ *leafs = ulist_alloc(GFP_NOFS);
+ if (!*leafs)
+ return -ENOMEM;
+
+ ret = find_parent_nodes(trans, fs_info, bytenr,
+ time_seq, *leafs, NULL, extent_item_pos);
+ if (ret < 0 && ret != -ENOENT) {
+ free_leaf_list(*leafs);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
+{
+ struct ulist *tmp;
+ struct ulist_node *node = NULL;
+ struct ulist_iterator uiter;
+ int ret;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ *roots = ulist_alloc(GFP_NOFS);
+ if (!*roots) {
+ ulist_free(tmp);
+ return -ENOMEM;
+ }
+
+ ULIST_ITER_INIT(&uiter);
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr,
+ time_seq, tmp, *roots, NULL);
+ if (ret < 0 && ret != -ENOENT) {
+ ulist_free(tmp);
+ ulist_free(*roots);
+ return ret;
+ }
+ node = ulist_next(tmp, &uiter);
+ if (!node)
+ break;
+ bytenr = node->val;
+ cond_resched();
+ }
+
+ ulist_free(tmp);
+ return 0;
+}
+
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
+{
+ int ret;
+
+ if (!trans)
+ down_read(&fs_info->commit_root_sem);
+ ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+ return ret;
+}
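A caller-side sketch (not from the patch) of how the roots ulist returned by btrfs_find_all_roots() is typically consumed; passing a NULL trans makes the walk use the commit roots, and fs_info/bytenr are assumed to come from the surrounding context. Error handling is abbreviated.

	struct ulist *roots = NULL;
	struct ulist_iterator uiter;
	struct ulist_node *node;
	int ret;

	ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &roots);
	if (ret)
		return ret;

	ULIST_ITER_INIT(&uiter);
	while ((node = ulist_next(roots, &uiter)))
		pr_debug("extent %llu is referenced from root %llu\n",
			 bytenr, node->val);
	ulist_free(roots);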
+
+/*
+ * this makes the path point to (inum INODE_ITEM ioff)
+ */
+int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ return btrfs_find_item(fs_root, path, inum, ioff,
+ BTRFS_INODE_ITEM_KEY, &key);
+}
+
+static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ struct btrfs_key *found_key)
+{
+ return btrfs_find_item(fs_root, path, inum, ioff,
+ BTRFS_INODE_REF_KEY, found_key);
+}
+
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off)
+{
+ int ret, slot;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ key.objectid = inode_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.offset = start_off;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If the item at offset is not found,
+ * btrfs_search_slot will point us to the slot
+ * where it should be inserted. In our case
+ * that will be the slot directly before the
+			 * next INODE_EXTREF_KEY item. In the case
+ * that we're pointing to the last slot in a
+ * leaf, we must move one leaf over.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret) {
+ if (ret >= 1)
+ ret = -ENOENT;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /*
+ * Check that we're still looking at an extended ref key for
+ * this particular objectid. If we have different
+ * objectid or type then there are no more to be found
+ * in the tree and we can exit.
+ */
+ ret = -ENOENT;
+ if (found_key.objectid != inode_objectid)
+ break;
+ if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ ret = 0;
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ extref = (struct btrfs_inode_extref *)ptr;
+ *ret_extref = extref;
+ if (found_off)
+ *found_off = found_key.offset;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * this iterates to turn a name (from iref/extref) into a full filesystem path.
+ * Elements of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible for
+ * providing "size" bytes in "dest". the dest buffer will be filled backwards.
+ * finally, the start point of the resulting string is returned. normally,
+ * this pointer lies within dest.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size)
+{
+ int slot;
+ u64 next_inum;
+ int ret;
+ s64 bytes_left = ((s64)size) - 1;
+ struct extent_buffer *eb = eb_in;
+ struct btrfs_key found_key;
+ int leave_spinning = path->leave_spinning;
+ struct btrfs_inode_ref *iref;
+
+ if (bytes_left >= 0)
+ dest[bytes_left] = '\0';
+
+ path->leave_spinning = 1;
+ while (1) {
+ bytes_left -= name_len;
+ if (bytes_left >= 0)
+ read_extent_buffer(eb, dest + bytes_left,
+ name_off, name_len);
+ if (eb != eb_in) {
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+ }
+ ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ break;
+
+ next_inum = found_key.offset;
+
+ /* regular exit ahead */
+ if (parent == next_inum)
+ break;
+
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ /* make sure we can use eb after releasing the path */
+ if (eb != eb_in) {
+ atomic_inc(&eb->refs);
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ }
+ btrfs_release_path(path);
+ iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_off = (unsigned long)(iref + 1);
+
+ parent = next_inum;
+ --bytes_left;
+ if (bytes_left >= 0)
+ dest[bytes_left] = '/';
+ }
+
+ btrfs_release_path(path);
+ path->leave_spinning = leave_spinning;
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ return dest + bytes_left;
+}
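The backwards-filled buffer contract documented above is easy to misread, so here is a small user-space sketch (not from the patch; join_backwards() is made up) of the same idea: the string grows from the end of dest, and a returned pointer below dest tells the caller how much space was missing.

	#include <stdio.h>
	#include <string.h>

	static char *join_backwards(const char **names, int n, char *dest, int size)
	{
		long left = (long)size - 1;	/* room for the 0-terminator */
		int i;

		if (left >= 0)
			dest[left] = '\0';
		for (i = n - 1; i >= 0; i--) {
			int len = strlen(names[i]);

			left -= len;
			if (left >= 0)
				memcpy(dest + left, names[i], len);
			if (i) {
				left--;
				if (left >= 0)
					dest[left] = '/';
			}
		}
		/* may point below dest when the buffer was too small */
		return dest + left;
	}

	int main(void)
	{
		const char *names[] = { "dir", "subdir", "file" };
		char buf[32];
		char *p = join_backwards(names, 3, buf, sizeof(buf));

		if (p < buf)
			printf("need %ld more bytes\n", (long)(buf - p));
		else
			printf("path: %s\n", p);	/* dir/subdir/file */
		return 0;
	}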
+
+/*
+ * this makes the path point to (logical EXTENT_ITEM *)
+ * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
+ * tree blocks and <0 on error.
+ */
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags_ret)
+{
+ int ret;
+ u64 flags;
+ u64 size = 0;
+ u32 item_size;
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = logical;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
+ if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+ size = fs_info->extent_root->leafsize;
+ else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+ size = found_key->offset;
+
+ if (found_key->objectid > logical ||
+ found_key->objectid + size <= logical) {
+ pr_debug("logical %llu is not within any extent\n", logical);
+ return -ENOENT;
+ }
+
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+ flags = btrfs_extent_flags(eb, ei);
+
+ pr_debug("logical %llu is at position %llu within the extent (%llu "
+ "EXTENT_ITEM %llu) flags %#llx size %u\n",
+ logical, logical - found_key->objectid, found_key->objectid,
+ found_key->offset, flags, item_size);
+
+ WARN_ON(!flags_ret);
+ if (flags_ret) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+ else if (flags & BTRFS_EXTENT_FLAG_DATA)
+ *flags_ret = BTRFS_EXTENT_FLAG_DATA;
+ else
+ BUG_ON(1);
+ return 0;
+ }
+
+ return -EIO;
+}
+
+/*
+ * helper function to iterate extent inline refs. ptr must point to a 0 value
+ * for the first call and may be modified. it is used to track state.
+ * if more refs exist, 0 is returned and the next call to
+ * __get_extent_inline_ref must pass the modified ptr parameter to get the
+ * next ref. after the last ref was processed, 1 is returned.
+ * returns <0 on error
+ */
+static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
+ struct btrfs_key *key,
+ struct btrfs_extent_item *ei, u32 item_size,
+ struct btrfs_extent_inline_ref **out_eiref,
+ int *out_type)
+{
+ unsigned long end;
+ u64 flags;
+ struct btrfs_tree_block_info *info;
+
+ if (!*ptr) {
+ /* first call */
+ flags = btrfs_extent_flags(eb, ei);
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (key->type == BTRFS_METADATA_ITEM_KEY) {
+ /* a skinny metadata extent */
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(ei + 1);
+ } else {
+ WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(info + 1);
+ }
+ } else {
+ *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ }
+ *ptr = (unsigned long)*out_eiref;
+ if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
+ return -ENOENT;
+ }
+
+ end = (unsigned long)ei + item_size;
+ *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
+ *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
+
+ *ptr += btrfs_extent_inline_ref_size(*out_type);
+ WARN_ON(*ptr > end);
+ if (*ptr == end)
+ return 1; /* last */
+
+ return 0;
+}
+
+/*
+ * reads the tree block backref for an extent. tree level and root are returned
+ * through out_level and out_root. ptr must point to a 0 value for the first
+ * call and may be modified (see __get_extent_inline_ref comment).
+ * returns 0 if data was provided, 1 if there was no more data to provide or
+ * <0 on error.
+ */
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level)
+{
+ int ret;
+ int type;
+ struct btrfs_tree_block_info *info;
+ struct btrfs_extent_inline_ref *eiref;
+
+ if (*ptr == (unsigned long)-1)
+ return 1;
+
+ while (1) {
+ ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
+ &eiref, &type);
+ if (ret < 0)
+ return ret;
+
+ if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_BLOCK_REF_KEY)
+ break;
+
+ if (ret == 1)
+ return 1;
+ }
+
+ /* we can treat both ref types equally here */
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
+ *out_level = btrfs_tree_block_level(eb, info);
+
+ if (ret == 1)
+ *ptr = (unsigned long)-1;
+
+ return 0;
+}
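A sketch (not from the patch) of the calling convention described above: ptr starts at 0, each call that returns 0 yields one tree block backref, and a return of 1 means the inline refs are exhausted. eb, found_key, ei and item_size are assumed to have been obtained via extent_from_logical() and btrfs_item_ptr(), as in the existing callers.

	unsigned long ptr = 0;
	u64 ref_root;
	u8 ref_level;
	int ret;

	while (1) {
		ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
					      item_size, &ref_root, &ref_level);
		if (ret < 0)
			return ret;
		if (ret > 0)
			break;		/* no more tree block backrefs */
		pr_debug("referenced from root %llu at level %d\n",
			 ref_root, ref_level);
	}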
+
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+ u64 root, u64 extent_item_objectid,
+ iterate_extent_inodes_t *iterate, void *ctx)
+{
+ struct extent_inode_elem *eie;
+ int ret = 0;
+
+ for (eie = inode_list; eie; eie = eie->next) {
+		pr_debug("ref for %llu resolved, key (%llu EXTENT_DATA %llu), "
+ "root %llu\n", extent_item_objectid,
+ eie->inum, eie->offset, root);
+ ret = iterate(eie->inum, eie->offset, root, ctx);
+ if (ret) {
+ pr_debug("stopping iteration for %llu due to ret=%d\n",
+ extent_item_objectid, ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * calls iterate() for every inode that references the extent identified by
+ * the given parameters.
+ * when the iterator function returns a non-zero value, iteration stops.
+ */
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+ u64 extent_item_objectid, u64 extent_item_pos,
+ int search_commit_root,
+ iterate_extent_inodes_t *iterate, void *ctx)
+{
+ int ret;
+ struct btrfs_trans_handle *trans = NULL;
+ struct ulist *refs = NULL;
+ struct ulist *roots = NULL;
+ struct ulist_node *ref_node = NULL;
+ struct ulist_node *root_node = NULL;
+ struct seq_list tree_mod_seq_elem = {};
+ struct ulist_iterator ref_uiter;
+ struct ulist_iterator root_uiter;
+
+ pr_debug("resolving all inodes for extent %llu\n",
+ extent_item_objectid);
+
+ if (!search_commit_root) {
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
+
+ ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+ tree_mod_seq_elem.seq, &refs,
+ &extent_item_pos);
+ if (ret)
+ goto out;
+
+ ULIST_ITER_INIT(&ref_uiter);
+ while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+ ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
+ tree_mod_seq_elem.seq, &roots);
+ if (ret)
+ break;
+ ULIST_ITER_INIT(&root_uiter);
+ while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+ pr_debug("root %llu references leaf %llu, data list "
+ "%#llx\n", root_node->val, ref_node->val,
+ ref_node->aux);
+ ret = iterate_leaf_refs((struct extent_inode_elem *)
+ (uintptr_t)ref_node->aux,
+ root_node->val,
+ extent_item_objectid,
+ iterate, ctx);
+ }
+ ulist_free(roots);
+ }
+
+ free_leaf_list(refs);
+out:
+ if (!search_commit_root) {
+ btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_end_transaction(trans, fs_info->extent_root);
+ } else {
+ up_read(&fs_info->commit_root_sem);
+ }
+
+ return ret;
+}
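A short sketch (not from the patch) of an iterate_extent_inodes_t callback; count_referencing_inodes() and the counter it maintains are made up for illustration. A non-zero return value from the callback stops the iteration early.

	static int count_referencing_inodes(u64 inum, u64 offset, u64 root,
					    void *ctx)
	{
		u64 *count = ctx;

		(*count)++;
		return 0;	/* keep iterating */
	}

	/* in the caller: */
	u64 count = 0;

	ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos,
				    1 /* search_commit_root */,
				    count_referencing_inodes, &count);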
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ iterate_extent_inodes_t *iterate, void *ctx)
+{
+ int ret;
+ u64 extent_item_pos;
+ u64 flags = 0;
+ struct btrfs_key found_key;
+ int search_commit_root = path->search_commit_root;
+
+ ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
+ btrfs_release_path(path);
+ if (ret < 0)
+ return ret;
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ return -EINVAL;
+
+ extent_item_pos = logical - found_key.objectid;
+ ret = iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, search_commit_root,
+ iterate, ctx);
+
+ return ret;
+}
+
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ iterate_irefs_t *iterate, void *ctx)
+{
+ int ret = 0;
+ int slot;
+ u32 cur;
+ u32 len;
+ u32 name_len;
+ u64 parent = 0;
+ int found = 0;
+ struct extent_buffer *eb;
+ struct btrfs_item *item;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_key found_key;
+
+ while (!ret) {
+ ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
+ &found_key);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = found ? 0 : -ENOENT;
+ break;
+ }
+ ++found;
+
+ parent = found_key.offset;
+ slot = path->slots[0];
+ eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!eb) {
+ ret = -ENOMEM;
+ break;
+ }
+ extent_buffer_get(eb);
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_release_path(path);
+
+ item = btrfs_item_nr(slot);
+ iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+ for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ /* path must be released before calling iterate()! */
+ pr_debug("following ref at offset %u for inode %llu in "
+ "tree %llu\n", cur, found_key.objectid,
+ fs_root->objectid);
+ ret = iterate(parent, name_len,
+ (unsigned long)(iref + 1), eb, ctx);
+ if (ret)
+ break;
+ len = sizeof(*iref) + name_len;
+ iref = (struct btrfs_inode_ref *)((char *)iref + len);
+ }
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+ }
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ iterate_irefs_t *iterate, void *ctx)
+{
+ int ret;
+ int slot;
+ u64 offset = 0;
+ u64 parent;
+ int found = 0;
+ struct extent_buffer *eb;
+ struct btrfs_inode_extref *extref;
+ u32 item_size;
+ u32 cur_offset;
+ unsigned long ptr;
+
+ while (1) {
+ ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+ &offset);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = found ? 0 : -ENOENT;
+ break;
+ }
+ ++found;
+
+ slot = path->slots[0];
+ eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!eb) {
+ ret = -ENOMEM;
+ break;
+ }
+ extent_buffer_get(eb);
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_release_path(path);
+
+		/* the path was released above, use the cloned buffer */
+		item_size = btrfs_item_size_nr(eb, slot);
+		ptr = btrfs_item_ptr_offset(eb, slot);
+ cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ u32 name_len;
+
+ extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ name_len = btrfs_inode_extref_name_len(eb, extref);
+ ret = iterate(parent, name_len,
+ (unsigned long)&extref->name, eb, ctx);
+ if (ret)
+ break;
+
+			cur_offset += btrfs_inode_extref_name_len(eb, extref);
+ cur_offset += sizeof(*extref);
+ }
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+
+ offset++;
+ }
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path, iterate_irefs_t *iterate,
+ void *ctx)
+{
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
+}
+
+/*
+ * returns 0 if the path could be dumped (probably truncated)
+ * returns <0 in case of an error
+ */
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx)
+{
+ struct inode_fs_paths *ipath = ctx;
+ char *fspath;
+ char *fspath_min;
+ int i = ipath->fspath->elem_cnt;
+ const int s_ptr = sizeof(char *);
+ u32 bytes_left;
+
+ bytes_left = ipath->fspath->bytes_left > s_ptr ?
+ ipath->fspath->bytes_left - s_ptr : 0;
+
+ fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
+ fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+ name_off, eb, inum, fspath_min, bytes_left);
+ if (IS_ERR(fspath))
+ return PTR_ERR(fspath);
+
+ if (fspath > fspath_min) {
+ ipath->fspath->val[i] = (u64)(unsigned long)fspath;
+ ++ipath->fspath->elem_cnt;
+ ipath->fspath->bytes_left = fspath - fspath_min;
+ } else {
+ ++ipath->fspath->elem_missed;
+ ipath->fspath->bytes_missing += fspath_min - fspath;
+ ipath->fspath->bytes_left = 0;
+ }
+
+ return 0;
+}
+
+/*
+ * this dumps all file system paths to the inode into the ipath struct, provided
+ * it has been created large enough. each path is zero-terminated and accessed
+ * from ipath->fspath->val[i].
+ * when it returns, ipath->fspath->elem_cnt paths are available in
+ * ipath->fspath->val[]. when the allocated space wasn't sufficient, the
+ * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise
+ * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
+ * have been needed to return all paths.
+ */
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
+{
+ return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
+ inode_to_path, ipath);
+}
+
+struct btrfs_data_container *init_data_container(u32 total_bytes)
+{
+ struct btrfs_data_container *data;
+ size_t alloc_bytes;
+
+ alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
+ data = vmalloc(alloc_bytes);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ if (total_bytes >= sizeof(*data)) {
+ data->bytes_left = total_bytes - sizeof(*data);
+ data->bytes_missing = 0;
+ } else {
+ data->bytes_missing = sizeof(*data) - total_bytes;
+ data->bytes_left = 0;
+ }
+
+ data->elem_cnt = 0;
+ data->elem_missed = 0;
+
+ return data;
+}
+
+/*
+ * allocates space to return multiple file system paths for an inode.
+ * total_bytes to allocate is passed; note that space usable for actual path
+ * information will be total_bytes - sizeof(struct btrfs_data_container).
+ * the returned pointer must be freed with free_ipath() in the end.
+ */
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+ struct btrfs_path *path)
+{
+ struct inode_fs_paths *ifp;
+ struct btrfs_data_container *fspath;
+
+ fspath = init_data_container(total_bytes);
+ if (IS_ERR(fspath))
+ return (void *)fspath;
+
+ ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
+ if (!ifp) {
+		vfree(fspath);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ifp->btrfs_path = path;
+ ifp->fspath = fspath;
+ ifp->fs_root = fs_root;
+
+ return ifp;
+}
+
+void free_ipath(struct inode_fs_paths *ipath)
+{
+ if (!ipath)
+ return;
+ vfree(ipath->fspath);
+ kfree(ipath);
+}
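A condensed sketch (not from the patch) of how the ipath helpers above fit together: allocate the container, resolve the paths for an inode, read them back from fspath->val[] and free everything. fs_root and inum are assumed, the 4096-byte size is arbitrary, and error handling is abbreviated.

	struct btrfs_path *path;
	struct inode_fs_paths *ipath;
	u32 i;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ipath = init_ipath(4096, fs_root, path);
	if (IS_ERR(ipath)) {
		btrfs_free_path(path);
		return PTR_ERR(ipath);
	}

	ret = paths_from_inode(inum, ipath);
	if (!ret)
		for (i = 0; i < ipath->fspath->elem_cnt; i++)
			pr_debug("path %u: %s\n", i,
				 (char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	btrfs_free_path(path);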
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 00000000000..86fc20fec28
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_BACKREF__
+#define __BTRFS_BACKREF__
+
+#include <linux/btrfs.h>
+#include "ulist.h"
+#include "extent_io.h"
+
+struct inode_fs_paths {
+ struct btrfs_path *btrfs_path;
+ struct btrfs_root *fs_root;
+ struct btrfs_data_container *fspath;
+};
+
+typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
+ void *ctx);
+
+int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
+ struct btrfs_path *path);
+
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags);
+
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level);
+
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+ u64 extent_item_objectid,
+ u64 extent_offset, int search_commit_root,
+ iterate_extent_inodes_t *iterate, void *ctx);
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ iterate_extent_inodes_t *iterate, void *ctx);
+
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots);
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size);
+
+struct btrfs_data_container *init_data_container(u32 total_bytes);
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+ struct btrfs_path *path);
+void free_ipath(struct inode_fs_paths *ipath);
+
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off);
+
+int __init btrfs_prelim_ref_init(void);
+void btrfs_prelim_ref_exit(void);
+#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ccc991c542d..4794923c410 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,9 +19,31 @@
#ifndef __BTRFS_I__
#define __BTRFS_I__
+#include <linux/hash.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
+#include "delayed-inode.h"
+
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
+#define BTRFS_INODE_ORPHAN_META_RESERVED 1
+#define BTRFS_INODE_DUMMY 2
+#define BTRFS_INODE_IN_DEFRAG 3
+#define BTRFS_INODE_DELALLOC_META_RESERVED 4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
+#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
+#define BTRFS_INODE_NEEDS_FULL_SYNC 7
+#define BTRFS_INODE_COPY_EVERYTHING 8
+#define BTRFS_INODE_IN_DELALLOC_LIST 9
+#define BTRFS_INODE_READDIO_NEED_LOCK 10
+#define BTRFS_INODE_HAS_PROPS 11
/* in memory btrfs inode */
struct btrfs_inode {
@@ -33,6 +55,9 @@ struct btrfs_inode {
*/
struct btrfs_key location;
+ /* Lock for counters */
+ spinlock_t lock;
+
/* the extent_tree has caches of all the extent mappings to disk */
struct extent_map_tree extent_tree;
@@ -47,12 +72,12 @@ struct btrfs_inode {
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
+ /* held while doing delalloc reservations */
+ struct mutex delalloc_mutex;
+
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
- /* for keeping track of orphaned inodes */
- struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
* to walk them all.
@@ -68,42 +93,39 @@ struct btrfs_inode {
/* node for the red-black tree that links inodes in subvolume root */
struct rb_node rb_node;
- /* the space_info for where this inode's data allocations are done */
- struct btrfs_space_info *space_info;
+ unsigned long runtime_flags;
+
+ /* Keep track of who's O_SYNC/fsyncing currently */
+ atomic_t sync_writers;
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
u64 generation;
- /* sequence number for NFS changes */
- u64 sequence;
-
/*
* transid of the trans_handle that last modified this inode
*/
u64 last_trans;
/*
- * log transid when this inode was last modified
+ * transid that last logged this inode
*/
- u64 last_sub_trans;
+ u64 logged_trans;
/*
- * transid that last logged this inode
+ * log transid when this inode was last modified
*/
- u64 logged_trans;
+ int last_sub_trans;
+
+ /* a local copy of root's last_log_commit */
+ int last_log_commit;
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
*/
u64 delalloc_bytes;
- /* total number of bytes that may be used for this inode for
- * delalloc
- */
- u64 reserved_bytes;
-
/*
* the size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
@@ -111,17 +133,14 @@ struct btrfs_inode {
*/
u64 disk_i_size;
- /* flags field from the on disk inode */
- u32 flags;
-
/*
* if this is a directory then index_cnt is the counter for the index
* number for new files that are created
*/
u64 index_cnt;
- /* the start of block group preferred for allocations. */
- u64 block_group;
+ /* Cache the directory index number to speed the dir/file remove */
+ u64 dir_index;
/* the fsync log has some corner cases that mean we have to check
* directories to see if any unlinks have been done before
@@ -131,46 +150,140 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * Number of bytes outstanding that are going to need csums. This is
+ * used in ENOSPC accounting.
+ */
+ u64 csum_bytes;
+
+ /* flags field from the on disk inode */
+ u32 flags;
+
+ /*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
* items we think we'll end up using, and reserved_extents is the number
* of extent items we've reserved metadata for.
*/
- spinlock_t accounting_lock;
- atomic_t outstanding_extents;
- int reserved_extents;
-
- /*
- * ordered_data_close is set by truncate when a file that used
- * to have good data has been truncated to zero. When it is set
- * the btrfs file release call will add this inode to the
- * ordered operations list so that we make sure to flush out any
- * new data the application may have written before commit.
- *
- * yes, its silly to have a single bitflag, but we might grow more
- * of these.
- */
- unsigned ordered_data_close:1;
- unsigned orphan_meta_reserved:1;
- unsigned dummy_inode:1;
+ unsigned outstanding_extents;
+ unsigned reserved_extents;
/*
* always compress this one file
*/
- unsigned force_compress:4;
+ unsigned force_compress;
+
+ struct btrfs_delayed_node *delayed_node;
struct inode vfs_inode;
};
+extern unsigned char btrfs_filetype_table[];
+
static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
}
+static inline unsigned long btrfs_inode_hash(u64 objectid,
+ const struct btrfs_root *root)
+{
+ u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+
+#if BITS_PER_LONG == 32
+ h = (h >> 32) ^ (h & 0xffffffff);
+#endif
+
+ return (unsigned long)h;
+}
+
+static inline void btrfs_insert_inode_hash(struct inode *inode)
+{
+ unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
+
+ __insert_inode_hash(inode, h);
+}
+
+static inline u64 btrfs_ino(struct inode *inode)
+{
+ u64 ino = BTRFS_I(inode)->location.objectid;
+
+ /*
+ * !ino: btree_inode
+ * type == BTRFS_ROOT_ITEM_KEY: subvol dir
+ */
+ if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
+ ino = inode->i_ino;
+ return ino;
+}
+
static inline void btrfs_i_size_write(struct inode *inode, u64 size)
{
i_size_write(inode, size);
BTRFS_I(inode)->disk_i_size = size;
}
+static inline bool btrfs_is_free_space_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (root == root->fs_info->tree_root &&
+ btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
+ return true;
+ if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+ return true;
+ return false;
+}
+
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+ if (BTRFS_I(inode)->logged_trans == generation &&
+ BTRFS_I(inode)->last_sub_trans <=
+ BTRFS_I(inode)->last_log_commit &&
+ BTRFS_I(inode)->last_sub_trans <=
+ BTRFS_I(inode)->root->last_log_commit)
+ return 1;
+ return 0;
+}
+
+struct btrfs_dio_private {
+ struct inode *inode;
+ u64 logical_offset;
+ u64 disk_bytenr;
+ u64 bytes;
+ void *private;
+
+ /* number of bios pending for this dio */
+ atomic_t pending_bios;
+
+ /* IO errors */
+ int errors;
+
+ /* orig_bio is our btrfs_io_bio */
+ struct bio *orig_bio;
+
+ /* dio_bio came from fs/direct-io.c */
+ struct bio *dio_bio;
+ u8 csum[0];
+};
+
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid an endless truncate caused by a
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+ set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+ smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb__before_atomic();
+ clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags);
+}
+
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
+
#endif
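A brief sketch (not from the patch) of how the two DIO helpers above are meant to be paired, modeled on the truncate path: block new nolock DIO readers, wait for DIO in flight, do the work, then allow nolock reads again. do_truncate_work() is a placeholder for whatever the caller actually does.

	btrfs_inode_block_unlocked_dio(inode);
	inode_dio_wait(inode);

	ret = do_truncate_work(inode);		/* placeholder */

	btrfs_inode_resume_unlocked_dio(inode);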
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 00000000000..ce92ae30250
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3295 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system into an inconsistent state. In such a state, a power-loss
+ * or kernel panic event could cause the data on disk to be
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ * currently referenced by the super block (either directly
+ * or indirectly).
+ * 2. When a super block is written, it is verified that all
+ * referenced (directly or indirectly) blocks fulfill the
+ * following requirements:
+ * 2a. All referenced blocks have either been present when
+ * the file system was mounted, (i.e., they have been
+ * referenced by the super block) or they have been
+ * written since then and the write completion callback
+ * was called and no write error was indicated and a
+ * FLUSH request to the device where these blocks are
+ * located was received and completed.
+ * 2b. All referenced blocks need to have a generation
+ * number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ *
+ * Expect millions of lines of information in the kernel log with an
+ * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
+ * kernel config to at least 26 (which is 64MB). Usually the value is
+ * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
+ * changed like this before LOG_BUF_SHIFT can be set to a high value:
+ * config LOG_BUF_SHIFT
+ * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
+ * range 12 30
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
+ * excluding " [...]" */
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+ u32 magic_num; /* only used for debug purposes */
+ unsigned int is_metadata:1; /* if it is meta-data, not data-data */
+ unsigned int is_superblock:1; /* if it is one of the superblocks */
+ unsigned int is_iodone:1; /* if is done by lower subsystem */
+ unsigned int iodone_w_error:1; /* error was indicated to endio */
+ unsigned int never_written:1; /* block was added because it was
+ * referenced, not because it was
+ * written */
+ unsigned int mirror_num; /* large enough to hold
+ * BTRFS_SUPER_MIRROR_MAX */
+ struct btrfsic_dev_state *dev_state;
+ u64 dev_bytenr; /* key, physical byte num on disk */
+ u64 logical_bytenr; /* logical byte num on disk */
+ u64 generation;
+ struct btrfs_disk_key disk_key; /* extra info to print in case of
+ * issues, will not always be correct */
+ struct list_head collision_resolving_node; /* list node */
+ struct list_head all_blocks_node; /* list node */
+
+ /* the following two lists contain block_link items */
+ struct list_head ref_to_list; /* list */
+ struct list_head ref_from_list; /* list */
+ struct btrfsic_block *next_in_same_bio;
+ void *orig_bio_bh_private;
+ union {
+ bio_end_io_t *bio;
+ bh_end_io_t *bh;
+ } orig_bio_bh_end_io;
+ int submit_bio_bh_rw;
+ u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be referenced from multiple blocks.
+ * The key to look them up in the hashtable is the dev_bytenr of
+ * the block referred to plus the one of the block referring to it.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself; it is only used to make the output more
+ * beautiful in case an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+ u32 magic_num; /* only used for debug purposes */
+ u32 ref_cnt;
+ struct list_head node_ref_to; /* list node */
+ struct list_head node_ref_from; /* list node */
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block *block_ref_to;
+ struct btrfsic_block *block_ref_from;
+ u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+ u32 magic_num; /* only used for debug purposes */
+ struct block_device *bdev;
+ struct btrfsic_state *state;
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block dummy_block_for_bio_bh_flush;
+ u64 last_flush_gen;
+ char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+ struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+ u64 start; /* virtual bytenr */
+ u64 dev_bytenr; /* physical bytenr on device */
+ u32 len;
+ struct btrfsic_dev_state *dev;
+ char **datav;
+ struct page **pagev;
+ void *mem_to_free;
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+ u32 magic;
+ u32 nr;
+ int error;
+ int i;
+ int limit_nesting;
+ int num_copies;
+ int mirror_num;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx *block_ctx;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfs_header *hdr;
+ struct btrfsic_stack_frame *prev;
+};
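+
+/*
+ * Rough sketch of how these frames are used in btrfsic_process_metablock():
+ * instead of recursing for every referenced tree block, a new frame is
+ * allocated, chained via ->prev and processing jumps back to the top:
+ *
+ *	next_stack = btrfsic_stack_frame_alloc();
+ *	next_stack->block = sf->next_block;
+ *	next_stack->prev = sf;
+ *	sf = next_stack;        <- descend one tree level
+ *	...
+ *	sf = sf->prev;          <- ascend again once the frame is done
+ */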
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+ u32 print_mask;
+ int include_extent_data;
+ int csum_size;
+ struct list_head all_blocks_list;
+ struct btrfsic_block_hashtable block_hashtable;
+ struct btrfsic_block_link_hashtable block_link_hashtable;
+ struct btrfs_root *root;
+ u64 max_superblock_generation;
+ struct btrfsic_block *latest_superblock;
+ u32 metablock_size;
+ u32 datablock_size;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dst, u32 offset, size_t len);
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx
+ *block_ctx, u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num);
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+ u32 len, struct block_device *bdev,
+ struct btrfsic_block_data_ctx *block_ctx_out);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ char **datav, unsigned int num_pages);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+ b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+ b->dev_state = NULL;
+ b->dev_bytenr = 0;
+ b->logical_bytenr = 0;
+ b->generation = BTRFSIC_GENERATION_UNKNOWN;
+ b->disk_key.objectid = 0;
+ b->disk_key.type = 0;
+ b->disk_key.offset = 0;
+ b->is_metadata = 0;
+ b->is_superblock = 0;
+ b->is_iodone = 0;
+ b->iodone_w_error = 0;
+ b->never_written = 0;
+ b->mirror_num = 0;
+ b->next_in_same_bio = NULL;
+ b->orig_bio_bh_private = NULL;
+ b->orig_bio_bh_end_io.bio = NULL;
+ INIT_LIST_HEAD(&b->collision_resolving_node);
+ INIT_LIST_HEAD(&b->all_blocks_node);
+ INIT_LIST_HEAD(&b->ref_to_list);
+ INIT_LIST_HEAD(&b->ref_from_list);
+ b->submit_bio_bh_rw = 0;
+ b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+ struct btrfsic_block *b;
+
+ b = kzalloc(sizeof(*b), GFP_NOFS);
+ if (NULL != b)
+ btrfsic_block_init(b);
+
+ return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+ BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+ kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+ l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+ l->ref_cnt = 1;
+ INIT_LIST_HEAD(&l->node_ref_to);
+ INIT_LIST_HEAD(&l->node_ref_from);
+ INIT_LIST_HEAD(&l->collision_resolving_node);
+ l->block_ref_to = NULL;
+ l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+ struct btrfsic_block_link *l;
+
+ l = kzalloc(sizeof(*l), GFP_NOFS);
+ if (NULL != l)
+ btrfsic_block_link_init(l);
+
+ return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+ BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+ kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+ ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+ ds->bdev = NULL;
+ ds->state = NULL;
+ ds->name[0] = '\0';
+ INIT_LIST_HEAD(&ds->collision_resolving_node);
+ ds->last_flush_gen = 0;
+ btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+ ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+ ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = kzalloc(sizeof(*ds), GFP_NOFS);
+ if (NULL != ds)
+ btrfsic_dev_state_init(ds);
+
+ return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+ BUG_ON(!(NULL == ds ||
+ BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+ kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(b->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+ list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+ list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block *const b =
+ list_entry(elem, struct btrfsic_block,
+ collision_resolving_node);
+
+ if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+ return b;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+ ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+ & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+ list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+ ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_from))) &
+ (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem, struct btrfsic_block_link,
+ collision_resolving_node);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+ l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+ l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+ l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+ return l;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)ds->bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+ list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+ list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_dev_state *const ds =
+ list_entry(elem, struct btrfsic_dev_state,
+ collision_resolving_node);
+
+ if (ds->bdev == bdev)
+ return ds;
+ }
+
+ return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int ret = 0;
+ struct btrfs_super_block *selected_super;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+ struct btrfsic_dev_state *selected_dev_state = NULL;
+ int pass;
+
+ BUG_ON(NULL == state);
+ selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
+ if (NULL == selected_super) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return -1;
+ }
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ int i;
+ struct btrfsic_dev_state *dev_state;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ dev_state = btrfsic_dev_state_lookup(device->bdev);
+ BUG_ON(NULL == dev_state);
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ ret = btrfsic_process_superblock_dev_mirror(
+ state, dev_state, device, i,
+ &selected_dev_state, selected_super);
+ if (0 != ret && 0 == i) {
+ kfree(selected_super);
+ return ret;
+ }
+ }
+ }
+
+ if (NULL == state->latest_superblock) {
+ printk(KERN_INFO "btrfsic: no superblock found!\n");
+ kfree(selected_super);
+ return -1;
+ }
+
+ state->csum_size = btrfs_super_csum_size(selected_super);
+
+ for (pass = 0; pass < 3; pass++) {
+ int num_copies;
+ int mirror_num;
+ u64 next_bytenr;
+
+ switch (pass) {
+ case 0:
+ next_bytenr = btrfs_super_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n", next_bytenr);
+ break;
+ case 1:
+ next_bytenr = btrfs_super_chunk_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n", next_bytenr);
+ break;
+ case 2:
+ next_bytenr = btrfs_super_log_root(selected_super);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n", next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(root @%llu,"
+ " mirror %d) failed!\n",
+ next_bytenr, mirror_num);
+ kfree(selected_super);
+ return -1;
+ }
+
+ next_block = btrfsic_block_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ &state->block_hashtable);
+ BUG_ON(NULL == next_block);
+
+ l = btrfsic_block_link_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ state->latest_superblock->dev_state->
+ bdev,
+ state->latest_superblock->dev_bytenr,
+ &state->block_link_hashtable);
+ BUG_ON(NULL == l);
+
+ ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+ if (ret < (int)PAGE_CACHE_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: read @logical %llu failed!\n",
+ tmp_next_block_ctx.start);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ kfree(selected_super);
+ return -1;
+ }
+
+ ret = btrfsic_process_metablock(state,
+ next_block,
+ &tmp_next_block_ctx,
+ BTRFS_MAX_LEVEL + 3, 1);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ }
+ }
+
+ kfree(selected_super);
+ return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super)
+{
+ struct btrfs_super_block *super_tmp;
+ u64 dev_bytenr;
+ struct buffer_head *bh;
+ struct btrfsic_block *superblock_tmp;
+ int pass;
+ struct block_device *const superblock_bdev = device->bdev;
+
+ /* super block bytenr is always the unmapped device bytenr */
+ dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+ return -1;
+ bh = __bread(superblock_bdev, dev_bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (NULL == bh)
+ return -1;
+ super_tmp = (struct btrfs_super_block *)
+ (bh->b_data + (dev_bytenr & 4095));
+
+ if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+ btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
+ memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+ btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+ btrfs_super_leafsize(super_tmp) != state->metablock_size ||
+ btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
+ brelse(bh);
+ return 0;
+ }
+
+ superblock_tmp =
+ btrfsic_block_hashtable_lookup(superblock_bdev,
+ dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == superblock_tmp) {
+ superblock_tmp = btrfsic_block_alloc();
+ if (NULL == superblock_tmp) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ brelse(bh);
+ return -1;
+ }
+ /* for superblock, only the dev_bytenr makes sense */
+ superblock_tmp->dev_bytenr = dev_bytenr;
+ superblock_tmp->dev_state = dev_state;
+ superblock_tmp->logical_bytenr = dev_bytenr;
+ superblock_tmp->generation = btrfs_super_generation(super_tmp);
+ superblock_tmp->is_metadata = 1;
+ superblock_tmp->is_superblock = 1;
+ superblock_tmp->is_iodone = 1;
+ superblock_tmp->never_written = 0;
+ superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
+ " @%llu (%s/%llu/%d)\n",
+ superblock_bdev,
+ rcu_str_deref(device->name), dev_bytenr,
+ dev_state->name, dev_bytenr,
+ superblock_mirror_num);
+ list_add(&superblock_tmp->all_blocks_node,
+ &state->all_blocks_list);
+ btrfsic_block_hashtable_add(superblock_tmp,
+ &state->block_hashtable);
+ }
+
+ /* select the one with the highest generation field */
+ if (btrfs_super_generation(super_tmp) >
+ state->max_superblock_generation ||
+ 0 == state->max_superblock_generation) {
+ memcpy(selected_super, super_tmp, sizeof(*selected_super));
+ *selected_dev_state = dev_state;
+ state->max_superblock_generation =
+ btrfs_super_generation(super_tmp);
+ state->latest_superblock = superblock_tmp;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ u64 next_bytenr;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key;
+
+ tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_disk_key.offset = 0;
+ switch (pass) {
+ case 0:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "initial root ";
+ next_bytenr = btrfs_super_root(super_tmp);
+ break;
+ case 1:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "initial chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_tmp);
+ break;
+ case 2:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "initial log ";
+ next_bytenr = btrfs_super_log_root(super_tmp);
+ if (0 == next_bytenr)
+ continue;
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ if (btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ &tmp_next_block_ctx,
+ mirror_num)) {
+ printk(KERN_INFO "btrfsic: btrfsic_map_block("
+ "bytenr @%llu, mirror %d) failed!\n",
+ next_bytenr, mirror_num);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ additional_string, 1, 1, 0,
+ mirror_num, NULL);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ next_block, superblock_tmp,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l) {
+ brelse(bh);
+ return -1;
+ }
+ }
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+ btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+ brelse(bh);
+ return 0;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+ struct btrfsic_stack_frame *sf;
+
+ sf = kzalloc(sizeof(*sf), GFP_NOFS);
+ if (NULL == sf)
+ printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+ else
+ sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+ return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+ BUG_ON(!(NULL == sf ||
+ BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+ kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const first_block,
+ struct btrfsic_block_data_ctx *const first_block_ctx,
+ int first_limit_nesting, int force_iodone_flag)
+{
+ struct btrfsic_stack_frame initial_stack_frame = { 0 };
+ struct btrfsic_stack_frame *sf;
+ struct btrfsic_stack_frame *next_stack;
+ struct btrfs_header *const first_hdr =
+ (struct btrfs_header *)first_block_ctx->datav[0];
+
+ BUG_ON(!first_hdr);
+ sf = &initial_stack_frame;
+ sf->error = 0;
+ sf->i = -1;
+ sf->limit_nesting = first_limit_nesting;
+ sf->block = first_block;
+ sf->block_ctx = first_block_ctx;
+ sf->next_block = NULL;
+ sf->hdr = first_hdr;
+ sf->prev = NULL;
+
+continue_with_new_stack_frame:
+ sf->block->generation = le64_to_cpu(sf->hdr->generation);
+ if (0 == sf->hdr->level) {
+ struct btrfs_leaf *const leafhdr =
+ (struct btrfs_leaf *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "leaf %llu items %d generation %llu"
+ " owner %llu\n",
+ sf->block_ctx->start, sf->nr,
+ btrfs_stack_header_generation(
+ &leafhdr->header),
+ btrfs_stack_header_owner(
+ &leafhdr->header));
+ }
+
+continue_with_current_leaf_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_item disk_item;
+ u32 disk_item_offset =
+ (uintptr_t)(leafhdr->items + sf->i) -
+ (uintptr_t)leafhdr;
+ struct btrfs_disk_key *disk_key;
+ u8 type;
+ u32 item_offset;
+ u32 item_size;
+
+ if (disk_item_offset + sizeof(struct btrfs_item) >
+ sf->block_ctx->len) {
+leaf_item_out_of_bounds_error:
+ printk(KERN_INFO
+ "btrfsic: leaf item out of bounds at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(sf->block_ctx,
+ &disk_item,
+ disk_item_offset,
+ sizeof(struct btrfs_item));
+ item_offset = btrfs_stack_item_offset(&disk_item);
+ item_size = btrfs_stack_item_size(&disk_item);
+ disk_key = &disk_item.key;
+ type = btrfs_disk_key_type(disk_key);
+
+ if (BTRFS_ROOT_ITEM_KEY == type) {
+ struct btrfs_root_item root_item;
+ u32 root_item_offset;
+ u64 next_bytenr;
+
+ root_item_offset = item_offset +
+ offsetof(struct btrfs_leaf, items);
+ if (root_item_offset + item_size >
+ sf->block_ctx->len)
+ goto leaf_item_out_of_bounds_error;
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &root_item,
+ root_item_offset,
+ item_size);
+ next_bytenr = btrfs_root_bytenr(&root_item);
+
+ sf->error =
+ btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ disk_key,
+ btrfs_root_generation(
+ &root_item));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.datav[0];
+
+ next_stack =
+ btrfsic_stack_frame_alloc();
+ if (NULL == next_stack) {
+ sf->error = -1;
+ btrfsic_release_block_ctx(
+ &sf->
+ next_block_ctx);
+ goto one_stack_frame_backwards;
+ }
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx =
+ &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+ } else if (BTRFS_EXTENT_DATA_KEY == type &&
+ state->include_extent_data) {
+ sf->error = btrfsic_handle_extent_data(
+ state,
+ sf->block,
+ sf->block_ctx,
+ item_offset,
+ force_iodone_flag);
+ if (sf->error)
+ goto one_stack_frame_backwards;
+ }
+
+ goto continue_with_current_leaf_stack_frame;
+ }
+ } else {
+ struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "node %llu level %d items %d"
+ " generation %llu owner %llu\n",
+ sf->block_ctx->start,
+ nodehdr->header.level, sf->nr,
+ btrfs_stack_header_generation(
+ &nodehdr->header),
+ btrfs_stack_header_owner(
+ &nodehdr->header));
+ }
+
+continue_with_current_node_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_key_ptr key_ptr;
+ u32 key_ptr_offset;
+ u64 next_bytenr;
+
+ key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+ (uintptr_t)nodehdr;
+ if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+ sf->block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &key_ptr, key_ptr_offset,
+ sizeof(struct btrfs_key_ptr));
+ next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
+
+ sf->error = btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ &key_ptr.key,
+ btrfs_stack_key_generation(&key_ptr));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.datav[0];
+
+ next_stack = btrfsic_stack_frame_alloc();
+ if (NULL == next_stack) {
+ sf->error = -1;
+ goto one_stack_frame_backwards;
+ }
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx = &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+
+ goto continue_with_current_node_stack_frame;
+ }
+ }
+
+one_stack_frame_backwards:
+ if (NULL != sf->prev) {
+ struct btrfsic_stack_frame *const prev = sf->prev;
+
+ /* the one for the initial block is freed in the caller */
+ btrfsic_release_block_ctx(sf->block_ctx);
+
+ if (sf->error) {
+ prev->error = sf->error;
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto one_stack_frame_backwards;
+ }
+
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto continue_with_new_stack_frame;
+ } else {
+ BUG_ON(&initial_stack_frame != sf);
+ }
+
+ return sf->error;
+}
+
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dstv, u32 offset, size_t len)
+{
+ size_t cur;
+ size_t offset_in_page;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(offset + len > block_ctx->len);
+ offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
+ BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT);
+ kaddr = block_ctx->datav[i];
+ memcpy(dst, kaddr + offset_in_page, cur);
+
+ dst += cur;
+ len -= cur;
+ offset_in_page = 0;
+ i++;
+ }
+}
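+
+/*
+ * Typical use of btrfsic_read_from_block_data(): copy an on-disk item out
+ * of the page vector of a block_ctx without caring about page boundaries,
+ * e.g. as done above in btrfsic_process_metablock():
+ *
+ *	struct btrfs_item disk_item;
+ *
+ *	btrfsic_read_from_block_data(sf->block_ctx, &disk_item,
+ *	                             disk_item_offset, sizeof(disk_item));
+ */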
+
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation)
+{
+ struct btrfsic_block *next_block = NULL;
+ int ret;
+ struct btrfsic_block_link *l;
+ int did_alloc_block_link;
+ int block_was_created;
+
+ *next_blockp = NULL;
+ if (0 == *num_copiesp) {
+ *num_copiesp =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, *num_copiesp);
+ *mirror_nump = 1;
+ }
+
+ if (*mirror_nump > *num_copiesp)
+ return 0;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+ *mirror_nump);
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ next_block_ctx, *mirror_nump);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+ next_bytenr, *mirror_nump);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(state,
+ next_block_ctx, "referenced ",
+ 1, force_iodone_flag,
+ !force_iodone_flag,
+ *mirror_nump,
+ &block_was_created);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+ if (block_was_created) {
+ l = NULL;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ } else {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d)"
+ " found in hash table, %c,"
+ " bytenr mismatch (!= stored %llu).\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state, next_block),
+ next_block->logical_bytenr);
+ } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state, next_block));
+ next_block->logical_bytenr = next_bytenr;
+
+ next_block->mirror_num = *mirror_nump;
+ l = btrfsic_block_link_hashtable_lookup(
+ next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_link_hashtable);
+ }
+
+ next_block->disk_key = *disk_key;
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ did_alloc_block_link = 1;
+ l->block_ref_to = next_block;
+ l->block_ref_from = block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ did_alloc_block_link = 0;
+ if (0 == limit_nesting) {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+ }
+
+ if (limit_nesting > 0 && did_alloc_block_link) {
+ ret = btrfsic_read_block(state, next_block_ctx);
+ if (ret < (int)next_block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: read block @logical %llu failed!\n",
+ next_bytenr);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ *next_blockp = next_block;
+ } else {
+ *next_blockp = NULL;
+ }
+ (*mirror_nump)++;
+
+ return 0;
+}
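+
+/*
+ * The caller of btrfsic_create_link_to_next_block() keeps *num_copiesp and
+ * *mirror_nump across calls, so that all mirrors of the referenced block
+ * are visited before the caller advances to the next item; the effect is
+ * roughly:
+ *
+ *	num_copies = 0;
+ *	mirror_num = 0;
+ *	do {
+ *		error = btrfsic_create_link_to_next_block(..., &num_copies,
+ *		                                          &mirror_num, ...);
+ *	} while (!error && mirror_num <= num_copies);
+ */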
+
+static int btrfsic_handle_extent_data(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag)
+{
+ int ret;
+ struct btrfs_file_extent_item file_extent_item;
+ u64 file_extent_item_offset;
+ u64 next_bytenr;
+ u64 num_bytes;
+ u64 generation;
+ struct btrfsic_block_link *l;
+
+ file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+ item_offset;
+ if (file_extent_item_offset +
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+ if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+ btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
+ file_extent_item.type,
+ btrfs_stack_file_extent_disk_bytenr(
+ &file_extent_item));
+ return 0;
+ }
+
+ if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ sizeof(struct btrfs_file_extent_item));
+ next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
+ if (btrfs_stack_file_extent_compression(&file_extent_item) ==
+ BTRFS_COMPRESS_NONE) {
+ next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
+ num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
+ } else {
+ num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
+ }
+ generation = btrfs_stack_file_extent_generation(&file_extent_item);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+ " offset = %llu, num_bytes = %llu\n",
+ file_extent_item.type,
+ btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
+ btrfs_stack_file_extent_offset(&file_extent_item),
+ num_bytes);
+ while (num_bytes > 0) {
+ u32 chunk_len;
+ int num_copies;
+ int mirror_num;
+
+ if (num_bytes > state->datablock_size)
+ chunk_len = state->datablock_size;
+ else
+ chunk_len = num_bytes;
+
+ num_copies =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, state->datablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfsic_block *next_block;
+ int block_was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "btrfsic_handle_extent_data("
+ "mirror_num=%d)\n", mirror_num);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO
+ "\tdisk_bytenr = %llu, num_bytes %u\n",
+ next_bytenr, chunk_len);
+ ret = btrfsic_map_block(state, next_bytenr,
+ chunk_len, &next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ next_bytenr, mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &next_block_ctx,
+ "referenced ",
+ 0,
+ force_iodone_flag,
+ !force_iodone_flag,
+ mirror_num,
+ &block_was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&next_block_ctx);
+ return -1;
+ }
+ if (!block_was_created) {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ printk(KERN_INFO
+ "Referenced block"
+ " @%llu (%s/%llu/%d)"
+ " found in hash table, D,"
+ " bytenr mismatch"
+ " (!= stored %llu).\n",
+ next_bytenr,
+ next_block_ctx.dev->name,
+ next_block_ctx.dev_bytenr,
+ mirror_num,
+ next_block->logical_bytenr);
+ }
+ next_block->logical_bytenr = next_bytenr;
+ next_block->mirror_num = mirror_num;
+ }
+
+ l = btrfsic_block_link_lookup_or_add(state,
+ &next_block_ctx,
+ next_block, block,
+ generation);
+ btrfsic_release_block_ctx(&next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+
+ next_bytenr += chunk_len;
+ num_bytes -= chunk_len;
+ }
+
+ return 0;
+}
+
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num)
+{
+ int ret;
+ u64 length;
+ struct btrfs_bio *multi = NULL;
+ struct btrfs_device *device;
+
+ length = len;
+ ret = btrfs_map_block(state->root->fs_info, READ,
+ bytenr, &length, &multi, mirror_num);
+
+ if (ret) {
+ block_ctx_out->start = 0;
+ block_ctx_out->dev_bytenr = 0;
+ block_ctx_out->len = 0;
+ block_ctx_out->dev = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ return ret;
+ }
+
+ device = multi->stripes[0].dev;
+ block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+ block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ kfree(multi);
+ if (NULL == block_ctx_out->dev) {
+ ret = -ENXIO;
+ printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+ }
+
+ return ret;
+}
+
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+ u32 len, struct block_device *bdev,
+ struct btrfsic_block_data_ctx *block_ctx_out)
+{
+ block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
+ block_ctx_out->dev_bytenr = bytenr;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+ if (NULL != block_ctx_out->dev) {
+ return 0;
+ } else {
+ printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
+ return -ENXIO;
+ }
+}
+
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+ if (block_ctx->mem_to_free) {
+ unsigned int num_pages;
+
+ BUG_ON(!block_ctx->datav);
+ BUG_ON(!block_ctx->pagev);
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ while (num_pages > 0) {
+ num_pages--;
+ if (block_ctx->datav[num_pages]) {
+ kunmap(block_ctx->pagev[num_pages]);
+ block_ctx->datav[num_pages] = NULL;
+ }
+ if (block_ctx->pagev[num_pages]) {
+ __free_page(block_ctx->pagev[num_pages]);
+ block_ctx->pagev[num_pages] = NULL;
+ }
+ }
+
+ kfree(block_ctx->mem_to_free);
+ block_ctx->mem_to_free = NULL;
+ block_ctx->pagev = NULL;
+ block_ctx->datav = NULL;
+ }
+}
+
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx)
+{
+ unsigned int num_pages;
+ unsigned int i;
+ u64 dev_bytenr;
+ int ret;
+
+ BUG_ON(block_ctx->datav);
+ BUG_ON(block_ctx->pagev);
+ BUG_ON(block_ctx->mem_to_free);
+ if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: read_block() with unaligned bytenr %llu\n",
+ block_ctx->dev_bytenr);
+ return -1;
+ }
+
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
+ sizeof(*block_ctx->pagev)) *
+ num_pages, GFP_NOFS);
+ if (!block_ctx->mem_to_free)
+ return -1;
+ block_ctx->datav = block_ctx->mem_to_free;
+ block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->pagev[i] = alloc_page(GFP_NOFS);
+ if (!block_ctx->pagev[i])
+ return -1;
+ }
+
+ dev_bytenr = block_ctx->dev_bytenr;
+ for (i = 0; i < num_pages;) {
+ struct bio *bio;
+ unsigned int j;
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
+ if (!bio) {
+ printk(KERN_INFO
+ "btrfsic: bio_alloc() for %u pages failed!\n",
+ num_pages - i);
+ return -1;
+ }
+ bio->bi_bdev = block_ctx->dev->bdev;
+ bio->bi_iter.bi_sector = dev_bytenr >> 9;
+
+ for (j = i; j < num_pages; j++) {
+ ret = bio_add_page(bio, block_ctx->pagev[j],
+ PAGE_CACHE_SIZE, 0);
+ if (PAGE_CACHE_SIZE != ret)
+ break;
+ }
+ if (j == i) {
+ printk(KERN_INFO
+ "btrfsic: error, failed to add a single page!\n");
+ return -1;
+ }
+ if (submit_bio_wait(READ, bio)) {
+ printk(KERN_INFO
+ "btrfsic: read error at logical %llu dev %s!\n",
+ block_ctx->start, block_ctx->dev->name);
+ bio_put(bio);
+ return -1;
+ }
+ bio_put(bio);
+ dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+ i = j;
+ }
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+ if (!block_ctx->datav[i]) {
+ printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
+ block_ctx->dev->name);
+ return -1;
+ }
+ }
+
+ return block_ctx->len;
+}
+
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+ struct list_head *elem_all;
+
+ BUG_ON(NULL == state);
+
+ printk(KERN_INFO "all_blocks_list:\n");
+ list_for_each(elem_all, &state->all_blocks_list) {
+ const struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *elem_ref_from;
+
+ printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->dev_bytenr, b_all->mirror_num);
+
+ list_for_each(elem_ref_to, &b_all->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " refers %u* to"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->dev_bytenr, b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ }
+
+ list_for_each(elem_ref_from, &b_all->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from,
+ struct btrfsic_block_link,
+ node_ref_from);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " is ref %u* from"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->dev_bytenr, b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ }
+
+ printk(KERN_INFO "\n");
+ }
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ char **datav, unsigned int num_pages)
+{
+ struct btrfs_header *h;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ unsigned int i;
+
+ if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+ return 1; /* not metadata */
+ num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+ h = (struct btrfs_header *)datav[0];
+
+ if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+ return 1;
+
+ for (i = 0; i < num_pages; i++) {
+ u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+ size_t sublen = i ? PAGE_CACHE_SIZE :
+ (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
+
+ crc = btrfs_crc32c(crc, data, sublen);
+ }
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, h->csum, state->csum_size))
+ return 1;
+
+ return 0; /* is metadata */
+}
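+
+/*
+ * The test above relies on the btrfs on-disk layout: the first
+ * BTRFS_CSUM_SIZE bytes of a tree block hold the checksum of all
+ * remaining bytes. Conceptually, with data pointing at the start of the
+ * block (the real code walks the page vector instead):
+ *
+ *	crc = btrfs_crc32c((u32)~0, data + BTRFS_CSUM_SIZE,
+ *	                   metablock_size - BTRFS_CSUM_SIZE);
+ *	btrfs_csum_final(crc, csum);
+ *	is_metadata = (0 == memcmp(csum, h->csum, state->csum_size));
+ */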
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw)
+{
+ int is_metadata;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx block_ctx;
+ int ret;
+ struct btrfsic_state *state = dev_state->state;
+ struct block_device *bdev = dev_state->bdev;
+ unsigned int processed_len;
+
+ if (NULL != bio_is_patched)
+ *bio_is_patched = 0;
+
+again:
+ if (num_pages == 0)
+ return;
+
+ processed_len = 0;
+ is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+ num_pages));
+
+ block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+ &state->block_hashtable);
+ if (NULL != block) {
+ u64 bytenr = 0;
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ if (block->is_superblock) {
+ bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
+ mapped_datav[0]);
+ if (num_pages * PAGE_CACHE_SIZE <
+ BTRFS_SUPER_INFO_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ is_metadata = 1;
+ BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+ processed_len = BTRFS_SUPER_INFO_SIZE;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+ printk(KERN_INFO
+ "[before new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ }
+ if (is_metadata) {
+ if (!block->is_superblock) {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->metablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->metablock_size;
+ bytenr = btrfs_stack_header_bytenr(
+ (struct btrfs_header *)
+ mapped_datav[0]);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+ dev_state,
+ dev_bytenr);
+ }
+ if (block->logical_bytenr != bytenr &&
+ !(!block->is_metadata &&
+ block->logical_bytenr == 0))
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c,"
+ " bytenr mismatch"
+ " (!= stored %llu).\n",
+ bytenr, dev_state->name, dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr);
+ else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ bytenr, dev_state->name, dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ block->logical_bytenr = bytenr;
+ } else {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->datablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->datablock_size;
+ bytenr = block->logical_bytenr;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ bytenr, dev_state->name, dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "ref_to_list: %cE, ref_from_list: %cE\n",
+ list_empty(&block->ref_to_list) ? ' ' : '!',
+ list_empty(&block->ref_from_list) ? ' ' : '!');
+ if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), old(gen=%llu,"
+ " objectid=%llu, type=%d, offset=%llu),"
+ " new(gen=%llu),"
+ " which is referenced by most recent superblock"
+ " (superblockgen=%llu)!\n",
+ btrfsic_get_block_type(state, block), bytenr,
+ dev_state->name, dev_bytenr, block->mirror_num,
+ block->generation,
+ btrfs_disk_key_objectid(&block->disk_key),
+ block->disk_key.type,
+ btrfs_disk_key_offset(&block->disk_key),
+ btrfs_stack_header_generation(
+ (struct btrfs_header *) mapped_datav[0]),
+ state->max_superblock_generation);
+ btrfsic_dump_tree(state);
+ }
+
+ if (!block->is_iodone && !block->never_written) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, block), bytenr,
+ dev_state->name, dev_bytenr, block->mirror_num,
+ block->generation,
+ btrfs_stack_header_generation(
+ (struct btrfs_header *)
+ mapped_datav[0]));
+ /* it would not be safe to go on */
+ btrfsic_dump_tree(state);
+ goto continue_loop;
+ }
+
+ /*
+ * Clear all references of this block. Do not free
+ * the block itself even if it is not referenced anymore
+ * because it still carries valuable information
+ * like whether it was ever written and IO completed.
+ */
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &block->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+ l->ref_cnt--;
+ if (0 == l->ref_cnt) {
+ list_del(&l->node_ref_to);
+ list_del(&l->node_ref_from);
+ btrfsic_block_link_hashtable_remove(l);
+ btrfsic_block_link_free(l);
+ }
+ }
+
+ if (block->is_superblock)
+ ret = btrfsic_map_superblock(state, bytenr,
+ processed_len,
+ bdev, &block_ctx);
+ else
+ ret = btrfsic_map_block(state, bytenr, processed_len,
+ &block_ctx, 0);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(root @%llu)"
+ " failed!\n", bytenr);
+ goto continue_loop;
+ }
+ block_ctx.datav = mapped_datav;
+ /* the following is required in case of writes to mirrors;
+ * use the same device and bytenr that were used for the lookup */
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+
+ if (is_metadata || state->include_extent_data) {
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private =
+ bio->bi_private;
+ block->orig_bio_bh_end_io.bio =
+ bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.
+ bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ }
+
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (is_metadata) {
+ block->logical_bytenr = bytenr;
+ block->is_metadata = 1;
+ if (block->is_superblock) {
+ BUG_ON(PAGE_CACHE_SIZE !=
+ BTRFS_SUPER_INFO_SIZE);
+ ret = btrfsic_process_written_superblock(
+ state,
+ block,
+ (struct btrfs_super_block *)
+ mapped_datav[0]);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+ printk(KERN_INFO
+ "[after new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ } else {
+ block->mirror_num = 0; /* unknown */
+ ret = btrfsic_process_metablock(
+ state,
+ block,
+ &block_ctx,
+ 0, 0);
+ }
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: btrfsic_process_metablock"
+ "(root @%llu) failed!\n",
+ dev_bytenr);
+ } else {
+ block->is_metadata = 0;
+ block->mirror_num = 0; /* unknown */
+ block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ if (!state->include_extent_data
+ && list_empty(&block->ref_from_list)) {
+ /*
+ * disk block is overwritten with extent
+ * data (not meta data) and we are configured
+ * to not include extent data: take the
+ * chance and free the block's memory
+ */
+ btrfsic_block_hashtable_remove(block);
+ list_del(&block->all_blocks_node);
+ btrfsic_block_free(block);
+ }
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ } else {
+ /* block has not been found in hash table */
+ u64 bytenr;
+
+ if (!is_metadata) {
+ processed_len = state->datablock_size;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "Written block (%s/%llu/?)"
+ " !found in hash table, D.\n",
+ dev_state->name, dev_bytenr);
+ if (!state->include_extent_data) {
+ /* ignore that written D block */
+ goto continue_loop;
+ }
+
+ /* this is getting ugly for the
+ * include_extent_data case... */
+ bytenr = 0; /* unknown */
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.pagev = NULL;
+ } else {
+ processed_len = state->metablock_size;
+ bytenr = btrfs_stack_header_bytenr(
+ (struct btrfs_header *)
+ mapped_datav[0]);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+ dev_bytenr);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/?)"
+ " !found in hash table, M.\n",
+ bytenr, dev_state->name, dev_bytenr);
+
+ ret = btrfsic_map_block(state, bytenr, processed_len,
+ &block_ctx, 0);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(root @%llu)"
+ " failed!\n",
+ dev_bytenr);
+ goto continue_loop;
+ }
+ }
+ block_ctx.datav = mapped_datav;
+ /* the following is required in case of writes to mirrors;
+ * use the same device and bytenr that were used for the lookup */
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&block_ctx);
+ goto continue_loop;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = dev_bytenr;
+ block->logical_bytenr = bytenr;
+ block->is_metadata = is_metadata;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->mirror_num = 0; /* unknown */
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New written %c-block @%llu (%s/%llu/%d)\n",
+ is_metadata ? 'M' : 'D',
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+ if (is_metadata) {
+ ret = btrfsic_process_metablock(state, block,
+ &block_ctx, 0, 0);
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: process_metablock(root @%llu)"
+ " failed!\n",
+ dev_bytenr);
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+continue_loop:
+ BUG_ON(!processed_len);
+ dev_bytenr += processed_len;
+ mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
+ num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+ goto again;
+}
+
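+/*
+ * bio completion callback: restore the original bi_end_io/bi_private,
+ * record the I/O result for every block that was chained to this bio,
+ * bump the device's flush generation when a REQ_FLUSH completes, then
+ * call the original completion handler.
+ */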
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ int iodone_w_error;
+
+	/* mutex is not held! This is not safe if IO is not yet completed
+	 * on umount */
+ iodone_w_error = 0;
+ if (bio_error_status)
+ iodone_w_error = 1;
+
+ BUG_ON(NULL == block);
+ bp->bi_private = block->orig_bio_bh_private;
+ bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+
+ do {
+ struct btrfsic_block *next_block;
+ struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ bio_error_status,
+ btrfsic_get_block_type(dev_state->state, block),
+ block->logical_bytenr, dev_state->name,
+ block->dev_bytenr, block->mirror_num);
+ next_block = block->next_in_same_bio;
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io() new %s flush_gen=%llu\n",
+ dev_state->name,
+ dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is
+ * on disk */
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ block = next_block;
+ } while (NULL != block);
+
+ bp->bi_end_io(bp, bio_error_status);
+}
+
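+/*
+ * buffer_head completion callback: record the I/O result for the single
+ * block attached to this bh, bump the device's flush generation when a
+ * REQ_FLUSH completes, then restore and call the original b_end_io.
+ */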
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
+ int iodone_w_error = !uptodate;
+ struct btrfsic_dev_state *dev_state;
+
+ BUG_ON(NULL == block);
+ dev_state = block->dev_state;
+ if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+ iodone_w_error,
+ btrfsic_get_block_type(dev_state->state, block),
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num);
+
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io() new %s flush_gen=%llu\n",
+ dev_state->name, dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is on disk */
+
+ bh->b_private = block->orig_bio_bh_private;
+ bh->b_end_io = block->orig_bio_bh_end_io.bh;
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ bh->b_end_io(bh, uptodate);
+}
+
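+/*
+ * Called for every written superblock: remember the newest superblock
+ * generation, record the links from this superblock to the root, chunk
+ * and log trees on all mirrors, and warn if any referenced block is not
+ * yet safely on disk.
+ */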
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const superblock,
+ struct btrfs_super_block *const super_hdr)
+{
+ int pass;
+
+ superblock->generation = btrfs_super_generation(super_hdr);
+ if (!(superblock->generation > state->max_superblock_generation ||
+ 0 == state->max_superblock_generation)) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: superblock @%llu (%s/%llu/%d)"
+ " with old gen %llu <= %llu\n",
+ superblock->logical_bytenr,
+ superblock->dev_state->name,
+ superblock->dev_bytenr, superblock->mirror_num,
+ btrfs_super_generation(super_hdr),
+ state->max_superblock_generation);
+ } else {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+ " with new gen %llu > %llu\n",
+ superblock->logical_bytenr,
+ superblock->dev_state->name,
+ superblock->dev_bytenr, superblock->mirror_num,
+ btrfs_super_generation(super_hdr),
+ state->max_superblock_generation);
+
+ state->max_superblock_generation =
+ btrfs_super_generation(super_hdr);
+ state->latest_superblock = superblock;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ int ret;
+ u64 next_bytenr;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key = {0};
+
+		btrfs_set_disk_key_type(&tmp_disk_key, BTRFS_ROOT_ITEM_KEY);
+		btrfs_set_disk_key_offset(&tmp_disk_key, 0);
+
+ switch (pass) {
+ case 0:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "root ";
+ next_bytenr = btrfs_super_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n", next_bytenr);
+ break;
+ case 1:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n", next_bytenr);
+ break;
+ case 2:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "log ";
+ next_bytenr = btrfs_super_log_root(super_hdr);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n", next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, BTRFS_SUPER_INFO_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ int was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_process_written_superblock("
+ "mirror_num=%d)\n", mirror_num);
+ ret = btrfsic_map_block(state, next_bytenr,
+ BTRFS_SUPER_INFO_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ next_bytenr, mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ additional_string,
+ 1, 0, 1,
+ mirror_num,
+ &was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ if (was_created)
+ next_block->generation =
+ BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ next_block,
+ superblock,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+ }
+
+ if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
+ btrfsic_dump_tree(state);
+
+ return 0;
+}
+
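+/*
+ * Recursively verify that every block reachable from @block has been
+ * written, completed its I/O without error, carries the expected
+ * generation and has been flushed out of the disk's write cache.
+ * Returns -1 if any referenced block violates these rules, 0 otherwise.
+ */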
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_to;
+ int ret = 0;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /*
+ * Note that this situation can happen and does not
+ * indicate an error in regular cases. It happens
+ * when disk blocks are freed and later reused.
+ * The check-integrity module is not aware of any
+ * block free operations, it just recognizes block
+ * write operations. Therefore it keeps the linkage
+ * information for a block until a block is
+ * rewritten. This can temporarily cause incorrect
+		 * and even circular linkage information. This
+ * causes no harm unless such blocks are referenced
+ * by the most recent super block.
+ */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 1).\n");
+
+ return ret;
+ }
+
+	/*
+	 * This algorithm is deliberately recursive: the amount of stack
+	 * space used per level is very small and the max recursion depth
+	 * is limited.
+	 */
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " %u* refers to %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ if (l->block_ref_to->never_written) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is never written!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (!l->block_ref_to->is_iodone) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (l->block_ref_to->iodone_w_error) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which has write error!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (l->parent_generation !=
+ l->block_ref_to->generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->parent_generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->block_ref_to->generation) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " with generation %llu !="
+ " parent generation %llu!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num,
+ l->block_ref_to->generation,
+ l->parent_generation);
+ ret = -1;
+ } else if (l->block_ref_to->flush_gen >
+ l->block_ref_to->dev_state->last_flush_gen) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not flushed out of disk's write cache"
+ " (block flush_gen=%llu,"
+ " dev->flush_gen=%llu)!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num,
+			       l->block_ref_to->flush_gen,
+ l->block_ref_to->dev_state->last_flush_gen);
+ ret = -1;
+ } else if (-1 == btrfsic_check_all_ref_blocks(state,
+ l->block_ref_to,
+ recursion_level +
+ 1)) {
+ ret = -1;
+ }
+ }
+
+ return ret;
+}
+
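+/*
+ * Walk the ref_from links recursively and return 1 if @block is
+ * referenced, directly or indirectly, by the most recently written
+ * superblock, 0 otherwise.
+ */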
+static int btrfsic_is_block_ref_by_superblock(
+ const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_from;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /* refer to comment at "abort cyclic linkage (case 1)" */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 2).\n");
+
+ return 0;
+ }
+
+	/*
+	 * This algorithm is deliberately recursive: the amount of stack space
+	 * used per level is very small and the max recursion depth is limited.
+	 */
+ list_for_each(elem_ref_from, &block->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from, struct btrfsic_block_link,
+ node_ref_from);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ if (l->block_ref_from->is_superblock &&
+ state->latest_superblock->dev_bytenr ==
+ l->block_ref_from->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev ==
+ l->block_ref_from->dev_state->bdev)
+ return 1;
+ else if (btrfsic_is_block_ref_by_superblock(state,
+ l->block_ref_from,
+ recursion_level +
+ 1))
+ return 1;
+ }
+
+ return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Add %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Rem %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block)
+{
+ if (block->is_superblock &&
+ state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+ return 'S';
+ else if (block->is_superblock)
+ return 's';
+ else if (block->is_metadata)
+ return 'M';
+ else
+ return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+ btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level)
+{
+ struct list_head *elem_ref_to;
+ int indent_add;
+ static char buf[80];
+ int cursor_position;
+
+	/*
+	 * It would be better to fill an on-stack buffer with a complete line
+	 * and dump it all at once when it is time to print a newline
+	 * character.
+	 */
+
+	/*
+	 * This algorithm is deliberately recursive: the amount of stack space
+	 * used per level is very small and the max recursion depth is limited.
+	 */
+ indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num);
+ if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ return;
+ }
+ printk(buf);
+ indent_level += indent_add;
+ if (list_empty(&block->ref_to_list)) {
+ printk("\n");
+ return;
+ }
+ if (block->mirror_num > 1 &&
+ !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+ printk(" [...]\n");
+ return;
+ }
+
+ cursor_position = indent_level;
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ while (cursor_position < indent_level) {
+ printk(" ");
+ cursor_position++;
+ }
+ if (l->ref_cnt > 1)
+ indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+ else
+ indent_add = sprintf(buf, " --> ");
+ if (indent_level + indent_add >
+ BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ cursor_position = 0;
+ continue;
+ }
+
+ printk(buf);
+
+ btrfsic_dump_tree_sub(state, l->block_ref_to,
+ indent_level + indent_add);
+ cursor_position = 0;
+ }
+}
+
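+/*
+ * Look up the link from @from_block to the block described by
+ * @next_block_ctx in the link hashtable; allocate and insert a new link
+ * if none exists yet, otherwise just bump its reference count.
+ */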
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation)
+{
+ struct btrfsic_block_link *l;
+
+ l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ from_block->dev_state->bdev,
+ from_block->dev_bytenr,
+ &state->block_link_hashtable);
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc" " failed!\n");
+ return NULL;
+ }
+
+ l->block_ref_to = next_block;
+ l->block_ref_from = from_block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &from_block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+
+ return l;
+}
+
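+/*
+ * Look up the block at @block_ctx->dev_bytenr in the block hashtable;
+ * if it is not yet known, allocate, initialize and insert a new block
+ * element. *was_created tells the caller which of the two happened.
+ */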
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created)
+{
+ struct btrfsic_block *block;
+
+ block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == block) {
+ struct btrfsic_dev_state *dev_state;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return NULL;
+ }
+ dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+ if (NULL == dev_state) {
+ printk(KERN_INFO
+ "btrfsic: error, lookup dev_state failed!\n");
+ btrfsic_block_free(block);
+ return NULL;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = block_ctx->dev_bytenr;
+ block->logical_bytenr = block_ctx->start;
+ block->is_metadata = is_metadata;
+ block->is_iodone = is_iodone;
+ block->never_written = never_written;
+ block->mirror_num = mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New %s%c-block @%llu (%s/%llu/%d)\n",
+ additional_string,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, dev_state->name,
+ block->dev_bytenr, mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+ if (NULL != was_created)
+ *was_created = 1;
+ } else {
+ if (NULL != was_created)
+ *was_created = 0;
+ }
+
+ return block;
+}
+
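+/*
+ * Verify that the logical bytenr stored in a written metadata block
+ * really maps, on at least one mirror, to the device and physical
+ * bytenr that the write was submitted to; complain loudly if not.
+ */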
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr)
+{
+ int num_copies;
+ int mirror_num;
+ int ret;
+ struct btrfsic_block_data_ctx block_ctx;
+ int match = 0;
+
+ num_copies = btrfs_num_copies(state->root->fs_info,
+ bytenr, state->metablock_size);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr, state->metablock_size,
+ &block_ctx, mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(logical @%llu,"
+ " mirror %d) failed!\n",
+ bytenr, mirror_num);
+ continue;
+ }
+
+ if (dev_state->bdev == block_ctx.dev->bdev &&
+ dev_bytenr == block_ctx.dev_bytenr) {
+ match++;
+ btrfsic_release_block_ctx(&block_ctx);
+ break;
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+ if (WARN_ON(!match)) {
+ printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
+ " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+ " phys_bytenr=%llu)!\n",
+ bytenr, dev_state->name, dev_bytenr);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr,
+ state->metablock_size,
+ &block_ctx, mirror_num);
+ if (ret)
+ continue;
+
+ printk(KERN_INFO "Read logical bytenr @%llu maps to"
+ " (%s/%llu/%d)\n",
+ bytenr, block_ctx.dev->name,
+ block_ctx.dev_bytenr, mirror_num);
+ }
+ }
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = btrfsic_dev_state_hashtable_lookup(bdev,
+ &btrfsic_dev_state_hashtable);
+ return ds;
+}
+
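+/*
+ * Exported wrapper around submit_bh(): when check-integrity is active
+ * for the target device, record superblock writes and FLUSH requests
+ * before handing the buffer_head to the real submit_bh().
+ */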
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return submit_bh(rw, bh);
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bh() might also be called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+ /* Only called to write the superblock (incl. FLUSH/FUA) */
+ if (NULL != dev_state &&
+ (rw & WRITE) && bh->b_size > 0) {
+ u64 dev_bytenr;
+
+ dev_bytenr = 4096 * bh->b_blocknr;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu),"
+ " size=%zu, data=%p, bdev=%p)\n",
+ rw, (unsigned long long)bh->b_blocknr,
+ dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ &bh->b_data, 1, NULL,
+ NULL, bh, rw);
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
+ rw, bh->b_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bh(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ }
+ }
+ mutex_unlock(&btrfsic_mutex);
+ return submit_bh(rw, bh);
+}
+
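+/*
+ * Common helper for the submit_bio wrappers: when check-integrity is
+ * active for the target device, kmap all bio pages and feed the written
+ * data to btrfsic_process_written_block(), or record a FLUSH request
+ * using the per-device dummy block.
+ */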
+static void __btrfsic_submit_bio(int rw, struct bio *bio)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bio() is also called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+ if (NULL != dev_state &&
+ (rw & WRITE) && NULL != bio->bi_io_vec) {
+ unsigned int i;
+ u64 dev_bytenr;
+ u64 cur_bytenr;
+ int bio_is_patched;
+ char **mapped_datav;
+
+ dev_bytenr = 512 * bio->bi_iter.bi_sector;
+ bio_is_patched = 0;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x, bi_vcnt=%u,"
+ " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+ rw, bio->bi_vcnt,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ dev_bytenr, bio->bi_bdev);
+
+ mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
+ GFP_NOFS);
+ if (!mapped_datav)
+ goto leave;
+ cur_bytenr = dev_bytenr;
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+ mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
+ if (!mapped_datav[i]) {
+ while (i > 0) {
+ i--;
+ kunmap(bio->bi_io_vec[i].bv_page);
+ }
+ kfree(mapped_datav);
+ goto leave;
+ }
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+ printk(KERN_INFO
+ "#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bio->bi_io_vec[i].bv_len,
+ bio->bi_io_vec[i].bv_offset);
+ cur_bytenr += bio->bi_io_vec[i].bv_len;
+ }
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ mapped_datav, bio->bi_vcnt,
+ bio, &bio_is_patched,
+ NULL, rw);
+ while (i > 0) {
+ i--;
+ kunmap(bio->bi_io_vec[i].bv_page);
+ }
+ kfree(mapped_datav);
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
+ rw, bio->bi_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bio(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ }
+ }
+leave:
+ mutex_unlock(&btrfsic_mutex);
+}
+
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+ __btrfsic_submit_bio(rw, bio);
+ submit_bio(rw, bio);
+}
+
+int btrfsic_submit_bio_wait(int rw, struct bio *bio)
+{
+ __btrfsic_submit_bio(rw, bio);
+ return submit_bio_wait(rw, bio);
+}
+
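+/*
+ * Set up the check-integrity state for all devices of the filesystem
+ * and build the initial block database from the superblocks.
+ */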
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask)
+{
+ int ret;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (root->nodesize != root->leafsize) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
+ root->nodesize, root->leafsize);
+ return -1;
+ }
+ if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->nodesize, PAGE_CACHE_SIZE);
+ return -1;
+ }
+ if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->leafsize, PAGE_CACHE_SIZE);
+ return -1;
+ }
+ if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->sectorsize, PAGE_CACHE_SIZE);
+ return -1;
+ }
+ state = kzalloc(sizeof(*state), GFP_NOFS);
+ if (NULL == state) {
+ printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
+ return -1;
+ }
+
+ if (!btrfsic_is_initialized) {
+ mutex_init(&btrfsic_mutex);
+ btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+ btrfsic_is_initialized = 1;
+ }
+ mutex_lock(&btrfsic_mutex);
+ state->root = root;
+ state->print_mask = print_mask;
+ state->include_extent_data = including_extent_data;
+ state->csum_size = 0;
+ state->metablock_size = root->nodesize;
+ state->datablock_size = root->sectorsize;
+ INIT_LIST_HEAD(&state->all_blocks_list);
+ btrfsic_block_hashtable_init(&state->block_hashtable);
+ btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+ state->max_superblock_generation = 0;
+ state->latest_superblock = NULL;
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+ char *p;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_alloc();
+ if (NULL == ds) {
+ printk(KERN_INFO
+ "btrfs check-integrity: kmalloc() failed!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return -1;
+ }
+ ds->bdev = device->bdev;
+ ds->state = state;
+ bdevname(ds->bdev, ds->name);
+ ds->name[BDEVNAME_SIZE - 1] = '\0';
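+		/* use only the last path component of the block device name */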
+ for (p = ds->name; *p != '\0'; p++);
+ while (p > ds->name && *p != '/')
+ p--;
+ if (*p == '/')
+ p++;
+ strlcpy(ds->name, p, sizeof(ds->name));
+ btrfsic_dev_state_hashtable_add(ds,
+ &btrfsic_dev_state_hashtable);
+ }
+
+ ret = btrfsic_process_superblock(state, fs_devices);
+ if (0 != ret) {
+ mutex_unlock(&btrfsic_mutex);
+ btrfsic_unmount(root, fs_devices);
+ return ret;
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+ btrfsic_dump_database(state);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+ btrfsic_dump_tree(state);
+
+ mutex_unlock(&btrfsic_mutex);
+ return 0;
+}
+
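+/*
+ * Tear down the check-integrity state: drop the per-device state
+ * elements and free all blocks and block links that are still tracked.
+ */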
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices)
+{
+ struct list_head *elem_all;
+ struct list_head *tmp_all;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ mutex_lock(&btrfsic_mutex);
+
+ state = NULL;
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_hashtable_lookup(
+ device->bdev,
+ &btrfsic_dev_state_hashtable);
+ if (NULL != ds) {
+ state = ds->state;
+ btrfsic_dev_state_hashtable_remove(ds);
+ btrfsic_dev_state_free(ds);
+ }
+ }
+
+ if (NULL == state) {
+ printk(KERN_INFO
+ "btrfsic: error, cannot find state information"
+ " on umount!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return;
+ }
+
+ /*
+ * Don't care about keeping the lists' state up to date,
+ * just free all memory that was allocated dynamically.
+ * Free the blocks and the block_links.
+ */
+ list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+ struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &b_all->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+
+ l->ref_cnt--;
+ if (0 == l->ref_cnt)
+ btrfsic_block_link_free(l);
+ }
+
+ if (b_all->is_iodone || b_all->never_written)
+ btrfsic_block_free(b_all);
+ else
+ printk(KERN_INFO "btrfs: attempt to free %c-block"
+ " @%llu (%s/%llu/%d) on umount which is"
+ " not yet iodone!\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->dev_bytenr, b_all->mirror_num);
+ }
+
+ mutex_unlock(&btrfsic_mutex);
+
+ kfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 00000000000..13b8566c97a
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+int btrfsic_submit_bio_wait(int rw, struct bio *bio);
+#else
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#define btrfsic_submit_bio_wait submit_bio_wait
+#endif
+
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices);
+
+#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
deleted file mode 100644
index 7c4503ef6ef..00000000000
--- a/fs/btrfs/compat.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _COMPAT_H_
-#define _COMPAT_H_
-
-#define btrfs_drop_nlink(inode) drop_nlink(inode)
-#define btrfs_inc_nlink(inode) inc_nlink(inode)
-
-#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 4d2110eafe2..1daea0b4718 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
#include <linux/slab.h>
-#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -82,10 +81,15 @@ struct compressed_bio {
u32 sums;
};
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+ u64 disk_start, struct bio_vec *bvec,
+ int vcnt, size_t srclen);
+
static inline int compressed_bio_size(struct btrfs_root *root,
unsigned long disk_size)
{
- u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
return sizeof(struct compressed_bio) +
((disk_size + root->sectorsize - 1) / root->sectorsize) *
csum_size;
@@ -105,7 +109,6 @@ static int check_compressed_csum(struct inode *inode,
u64 disk_start)
{
int ret;
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct page *page;
unsigned long i;
char *kaddr;
@@ -119,17 +122,16 @@ static int check_compressed_csum(struct inode *inode,
page = cb->compressed_pages[i];
csum = ~(u32)0;
- kaddr = kmap_atomic(page, KM_USER0);
- csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+ kaddr = kmap_atomic(page);
+ csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE);
btrfs_csum_final(csum, (char *)&csum);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
if (csum != *cb_sum) {
- printk(KERN_INFO "btrfs csum failed ino %lu "
- "extent %llu csum %u "
- "wanted %u mirror %d\n", inode->i_ino,
- (unsigned long long)disk_start,
- csum, *cb_sum, cb->mirror_num);
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu extent %llu csum %u wanted %u mirror %d",
+ btrfs_ino(inode), disk_start, csum, *cb_sum,
+ cb->mirror_num);
ret = -EIO;
goto fail;
}
@@ -169,7 +171,8 @@ static void end_compressed_bio_read(struct bio *bio, int err)
goto out;
inode = cb->inode;
- ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
+ ret = check_compressed_csum(inode, cb,
+ (u64)bio->bi_iter.bi_sector << 9);
if (ret)
goto csum_failed;
@@ -198,18 +201,16 @@ csum_failed:
if (cb->errors) {
bio_io_error(cb->orig_bio);
} else {
- int bio_index = 0;
- struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
+ int i;
+ struct bio_vec *bvec;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
- while (bio_index < cb->orig_bio->bi_vcnt) {
+ bio_for_each_segment_all(bvec, cb->orig_bio, i)
SetPageChecked(bvec->bv_page);
- bvec++;
- bio_index++;
- }
+
bio_endio(cb->orig_bio, 0);
}
@@ -224,8 +225,8 @@ out:
* Clear the writeback bits on all of the file
* pages for a compressed write
*/
-static noinline int end_compressed_writeback(struct inode *inode, u64 start,
- unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode, u64 start,
+ unsigned long ram_size)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
@@ -251,7 +252,6 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start,
index += ret;
}
/* the inode may be gone now */
- return 0;
}
/*
@@ -332,14 +332,17 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
struct compressed_bio *cb;
unsigned long bytes_left;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- int page_index = 0;
+ int pg_index = 0;
struct page *page;
u64 first_byte = disk_start;
struct block_device *bdev;
int ret;
+ int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ if (!cb)
+ return -ENOMEM;
atomic_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
@@ -354,17 +357,21 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ if (!bio) {
+ kfree(cb);
+ return -ENOMEM;
+ }
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
atomic_inc(&cb->pending_bios);
/* create and submit bios for the compressed pages */
bytes_left = compressed_len;
- for (page_index = 0; page_index < cb->nr_pages; page_index++) {
- page = compressed_pages[page_index];
+ for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
+ page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
- if (bio->bi_size)
- ret = io_tree->ops->merge_bio_hook(page, 0,
+ if (bio->bi_iter.bi_size)
+ ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
PAGE_CACHE_SIZE,
bio, 0);
else
@@ -383,23 +390,28 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
*/
atomic_inc(&cb->pending_bios);
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
- BUG_ON(ret);
+ if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio,
+ start, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ }
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(bio);
bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ BUG_ON(!bio);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
}
if (bytes_left < PAGE_CACHE_SIZE) {
- printk("bytes left %lu compress len %lu nr %lu\n",
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "bytes left %lu compress len %lu nr %lu",
bytes_left, cb->compressed_len, cb->nr_pages);
}
bytes_left -= PAGE_CACHE_SIZE;
@@ -409,13 +421,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_get(bio);
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
- BUG_ON(ret);
+ if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ }
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
bio_put(bio);
return 0;
@@ -426,7 +440,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
struct compressed_bio *cb)
{
unsigned long end_index;
- unsigned long page_index;
+ unsigned long pg_index;
u64 last_offset;
u64 isize = i_size_read(inode);
int ret;
@@ -450,15 +464,15 @@ static noinline int add_ra_bio_pages(struct inode *inode,
end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
while (last_offset < compressed_end) {
- page_index = last_offset >> PAGE_CACHE_SHIFT;
+ pg_index = last_offset >> PAGE_CACHE_SHIFT;
- if (page_index > end_index)
+ if (pg_index > end_index)
break;
rcu_read_lock();
- page = radix_tree_lookup(&mapping->page_tree, page_index);
+ page = radix_tree_lookup(&mapping->page_tree, pg_index);
rcu_read_unlock();
- if (page) {
+ if (page && !radix_tree_exceptional_entry(page)) {
misses++;
if (misses > 4)
break;
@@ -470,7 +484,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (!page)
break;
- if (add_to_page_cache_lru(page, mapping, page_index,
+ if (add_to_page_cache_lru(page, mapping, pg_index,
GFP_NOFS)) {
page_cache_release(page);
goto next;
@@ -483,7 +497,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* sure they map to this compressed extent on disk.
*/
set_page_extent_mapped(page);
- lock_extent(tree, last_offset, end, GFP_NOFS);
+ lock_extent(tree, last_offset, end);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_CACHE_SIZE);
@@ -491,9 +505,9 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (!em || last_offset < em->start ||
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
- (em->block_start >> 9) != cb->orig_bio->bi_sector) {
+ (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_extent(tree, last_offset, end);
unlock_page(page);
page_cache_release(page);
break;
@@ -507,10 +521,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (zero_offset) {
int zeros;
zeros = PAGE_CACHE_SIZE - zero_offset;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, zeros);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
}
}
@@ -521,7 +535,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
nr_pages++;
page_cache_release(page);
} else {
- unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_extent(tree, last_offset, end);
unlock_page(page);
page_cache_release(page);
break;
@@ -537,7 +551,7 @@ next:
* in it. We don't actually do IO on those pages but allocate new ones
* to hold the compressed pages on disk.
*
- * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_iter.bi_sector points to the compressed extent on disk
* bio->bi_io_vec points to all of the inode pages
* bio->bi_vcnt is a count of pages
*
@@ -554,15 +568,16 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
unsigned long compressed_len;
unsigned long nr_pages;
- unsigned long page_index;
+ unsigned long pg_index;
struct page *page;
struct block_device *bdev;
struct bio *comp_bio;
- u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+ u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
u64 em_len;
u64 em_start;
struct extent_map *em;
int ret = -ENOMEM;
+ int faili = 0;
u32 *sums;
tree = &BTRFS_I(inode)->io_tree;
@@ -574,6 +589,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page_offset(bio->bi_io_vec->bv_page),
PAGE_CACHE_SIZE);
read_unlock(&em_tree->lock);
+ if (!em)
+ return -EIO;
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -607,15 +624,23 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
- for (page_index = 0; page_index < nr_pages; page_index++) {
- cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+ for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+ cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
__GFP_HIGHMEM);
- if (!cb->compressed_pages[page_index])
+ if (!cb->compressed_pages[pg_index]) {
+ faili = pg_index - 1;
+ ret = -ENOMEM;
goto fail2;
+ }
}
+ faili = nr_pages - 1;
cb->nr_pages = nr_pages;
- add_ra_bio_pages(inode, em_start + em_len, cb);
+ /* In the parent-locked case, we only locked the range we are
+ * interested in. In all other cases, we can opportunistically
+ * cache decompressed data that goes beyond the requested range. */
+ if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
+ add_ra_bio_pages(inode, em_start + em_len, cb);
/* include any pages we added in add_ra-bio_pages */
uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
@@ -628,13 +653,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio->bi_end_io = end_compressed_bio_read;
atomic_inc(&cb->pending_bios);
- for (page_index = 0; page_index < nr_pages; page_index++) {
- page = cb->compressed_pages[page_index];
+ for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+ page = cb->compressed_pages[pg_index];
page->mapping = inode->i_mapping;
page->index = em_start >> PAGE_CACHE_SHIFT;
- if (comp_bio->bi_size)
- ret = tree->ops->merge_bio_hook(page, 0,
+ if (comp_bio->bi_iter.bi_size)
+ ret = tree->ops->merge_bio_hook(READ, page, 0,
PAGE_CACHE_SIZE,
comp_bio, 0);
else
@@ -646,7 +671,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
/*
* inc the count before we submit the bio so
@@ -657,20 +682,23 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
atomic_inc(&cb->pending_bios);
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- btrfs_lookup_bio_sums(root, inode, comp_bio,
- sums);
+ ret = btrfs_lookup_bio_sums(root, inode,
+ comp_bio, sums);
+ BUG_ON(ret); /* -ENOMEM */
}
- sums += (comp_bio->bi_size + root->sectorsize - 1) /
- root->sectorsize;
+ sums += (comp_bio->bi_iter.bi_size +
+ root->sectorsize - 1) / root->sectorsize;
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
- BUG_ON(ret);
+ if (ret)
+ bio_endio(comp_bio, ret);
bio_put(comp_bio);
comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
GFP_NOFS);
+ BUG_ON(!comp_bio);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -681,20 +709,25 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+ BUG_ON(ret); /* -ENOMEM */
+ }
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
- BUG_ON(ret);
+ if (ret)
+ bio_endio(comp_bio, ret);
bio_put(comp_bio);
return 0;
fail2:
- for (page_index = 0; page_index < nr_pages; page_index++)
- free_page((unsigned long)cb->compressed_pages[page_index]);
+ while (faili >= 0) {
+ __free_page(cb->compressed_pages[faili]);
+ faili--;
+ }
kfree(cb->compressed_pages);
fail1:
@@ -710,12 +743,12 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
-struct btrfs_compress_op *btrfs_compress_op[] = {
+static struct btrfs_compress_op *btrfs_compress_op[] = {
&btrfs_zlib_compress,
&btrfs_lzo_compress,
};
-int __init btrfs_init_compress(void)
+void __init btrfs_init_compress(void)
{
int i;
@@ -725,7 +758,6 @@ int __init btrfs_init_compress(void)
atomic_set(&comp_alloc_workspace[i], 0);
init_waitqueue_head(&comp_workspace_wait[i]);
}
- return 0;
}
/*
@@ -789,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace)
spin_lock(workspace_lock);
if (*num_workspace < num_online_cpus()) {
- list_add_tail(workspace, idle_workspace);
+ list_add(workspace, idle_workspace);
(*num_workspace)++;
spin_unlock(workspace_lock);
goto wake;
@@ -799,6 +831,7 @@ static void free_workspace(int type, struct list_head *workspace)
btrfs_compress_op[idx]->free_workspace(workspace);
atomic_dec(alloc_workspace);
wake:
+ smp_mb();
if (waitqueue_active(workspace_wait))
wake_up(workspace_wait);
}
@@ -854,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -1;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
start, len, pages,
@@ -881,15 +914,16 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
* be contiguous. They all correspond to the range of bytes covered by
* the compressed extent.
*/
-int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
- struct bio_vec *bvec, int vcnt, size_t srclen)
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+ u64 disk_start, struct bio_vec *bvec,
+ int vcnt, size_t srclen)
{
struct list_head *workspace;
int ret;
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -ENOMEM;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
disk_start,
@@ -911,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -ENOMEM;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
dest_page, start_byte,
@@ -936,7 +970,7 @@ void btrfs_exit_compress(void)
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio_vec *bvec, int vcnt,
- unsigned long *page_index,
+ unsigned long *pg_index,
unsigned long *pg_offset)
{
unsigned long buf_offset;
@@ -945,7 +979,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long working_bytes = total_out - buf_start;
unsigned long bytes;
char *kaddr;
- struct page *page_out = bvec[*page_index].bv_page;
+ struct page *page_out = bvec[*pg_index].bv_page;
/*
* start byte is the first byte of the page we're currently
@@ -974,9 +1008,11 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
bytes = min(PAGE_CACHE_SIZE - *pg_offset,
PAGE_CACHE_SIZE - buf_offset);
bytes = min(bytes, working_bytes);
- kaddr = kmap_atomic(page_out, KM_USER0);
+ kaddr = kmap_atomic(page_out);
memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
- kunmap_atomic(kaddr, KM_USER0);
+ if (*pg_index == (vcnt - 1) && *pg_offset == 0)
+ memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+ kunmap_atomic(kaddr);
flush_dcache_page(page_out);
*pg_offset += bytes;
@@ -986,11 +1022,11 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
/* check if we need to pick another page */
if (*pg_offset == PAGE_CACHE_SIZE) {
- (*page_index)++;
- if (*page_index >= vcnt)
+ (*pg_index)++;
+ if (*pg_index >= vcnt)
return 0;
- page_out = bvec[*page_index].bv_page;
+ page_out = bvec[*pg_index].bv_page;
*pg_offset = 0;
start_byte = page_offset(page_out) - disk_start;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 51000174b9d..0c803b4fbf9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,7 +19,7 @@
#ifndef __BTRFS_COMPRESSION_
#define __BTRFS_COMPRESSION_
-int btrfs_init_compress(void);
+void btrfs_init_compress(void);
void btrfs_exit_compress(void);
int btrfs_compress_pages(int type, struct address_space *mapping,
@@ -30,14 +30,12 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
unsigned long *total_in,
unsigned long *total_out,
unsigned long max_out);
-int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
- struct bio_vec *bvec, int vcnt, size_t srclen);
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio_vec *bvec, int vcnt,
- unsigned long *page_index,
+ unsigned long *pg_index,
unsigned long *pg_offset);
int btrfs_submit_compressed_write(struct inode *inode, u64 start,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b5baff0dccf..aeab453b8e2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -36,20 +37,15 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot);
-static int setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
-
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+ int level, int slot);
+static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb);
struct btrfs_path *btrfs_alloc_path(void)
{
struct btrfs_path *path;
path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
- if (path)
- path->reada = 1;
return path;
}
@@ -61,8 +57,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
int i;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- if (p->nodes[i] && p->locks[i])
- btrfs_set_lock_blocking(p->nodes[i]);
+ if (!p->nodes[i] || !p->locks[i])
+ continue;
+ btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
+ if (p->locks[i] == BTRFS_READ_LOCK)
+ p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+ else if (p->locks[i] == BTRFS_WRITE_LOCK)
+ p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
}
}
@@ -75,7 +76,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
* for held
*/
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held)
+ struct extent_buffer *held, int held_rw)
{
int i;
@@ -86,19 +87,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
* really sure by forcing the path to blocking before we clear
* the path blocking.
*/
- if (held)
- btrfs_set_lock_blocking(held);
+ if (held) {
+ btrfs_set_lock_blocking_rw(held, held_rw);
+ if (held_rw == BTRFS_WRITE_LOCK)
+ held_rw = BTRFS_WRITE_LOCK_BLOCKING;
+ else if (held_rw == BTRFS_READ_LOCK)
+ held_rw = BTRFS_READ_LOCK_BLOCKING;
+ }
btrfs_set_path_blocking(p);
#endif
for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
- if (p->nodes[i] && p->locks[i])
- btrfs_clear_lock_blocking(p->nodes[i]);
+ if (p->nodes[i] && p->locks[i]) {
+ btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
+ if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
+ p->locks[i] = BTRFS_WRITE_LOCK;
+ else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
+ p->locks[i] = BTRFS_READ_LOCK;
+ }
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (held)
- btrfs_clear_lock_blocking(held);
+ btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
@@ -107,7 +118,7 @@ void btrfs_free_path(struct btrfs_path *p)
{
if (!p)
return;
- btrfs_release_path(NULL, p);
+ btrfs_release_path(p);
kmem_cache_free(btrfs_path_cachep, p);
}
@@ -117,7 +128,7 @@ void btrfs_free_path(struct btrfs_path *p)
*
* It is safe to call this on paths that no locks or extent buffers held.
*/
-noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+noinline void btrfs_release_path(struct btrfs_path *p)
{
int i;
@@ -126,7 +137,7 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
if (!p->nodes[i])
continue;
if (p->locks[i]) {
- btrfs_tree_unlock(p->nodes[i]);
+ btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
p->locks[i] = 0;
}
free_extent_buffer(p->nodes[i]);
@@ -147,10 +158,24 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
- spin_lock(&root->node_lock);
- eb = root->node;
- extent_buffer_get(eb);
- spin_unlock(&root->node_lock);
+
+ while (1) {
+ rcu_read_lock();
+ eb = rcu_dereference(root->node);
+
+ /*
+		 * RCU really hurts here: we could free up the root node because
+		 * it was cow'ed but we may not get the new root node yet, so do
+ * the inc_not_zero dance and if it doesn't work then
+ * synchronize_rcu and try again.
+ */
+ if (atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ synchronize_rcu();
+ }
return eb;
}
@@ -165,30 +190,46 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_lock(eb);
-
- spin_lock(&root->node_lock);
- if (eb == root->node) {
- spin_unlock(&root->node_lock);
+ if (eb == root->node)
break;
- }
- spin_unlock(&root->node_lock);
-
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root. A locked buffer
+ * is returned, with a reference held.
+ */
+static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ btrfs_tree_read_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
/* cowonly root (everything not a reference counted cow subvolume), just get
* put onto a simple dirty list. transaction.c walks this to make sure they
* get properly updated on disk.
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
- if (root->track_dirty && list_empty(&root->dirty_list)) {
+ spin_lock(&root->fs_info->trans_lock);
+ if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
+ list_empty(&root->dirty_list)) {
list_add(&root->dirty_list,
&root->fs_info->dirty_cowonly_roots);
}
+ spin_unlock(&root->fs_info->trans_lock);
}
/*
@@ -206,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int level;
struct btrfs_disk_key disk_key;
- WARN_ON(root->ref_cows && trans->transid !=
- root->fs_info->running_transaction->transid);
- WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
level = btrfs_header_level(buf);
if (level == 0)
@@ -233,15 +275,14 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, new_root_objectid);
- write_extent_buffer(cow, root->fs_info->fsid,
- (unsigned long)btrfs_header_fsid(cow),
+ write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
if (ret)
return ret;
@@ -251,6 +292,666 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return 0;
}
+enum mod_log_op {
+ MOD_LOG_KEY_REPLACE,
+ MOD_LOG_KEY_ADD,
+ MOD_LOG_KEY_REMOVE,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+ MOD_LOG_MOVE_KEYS,
+ MOD_LOG_ROOT_REPLACE,
+};
+
+struct tree_mod_move {
+ int dst_slot;
+ int nr_items;
+};
+
+struct tree_mod_root {
+ u64 logical;
+ u8 level;
+};
+
+struct tree_mod_elem {
+ struct rb_node node;
+ u64 index; /* shifted logical */
+ u64 seq;
+ enum mod_log_op op;
+
+ /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+ int slot;
+
+ /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+ u64 generation;
+
+ /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+ struct btrfs_disk_key key;
+ u64 blockptr;
+
+ /* this is used for op == MOD_LOG_MOVE_KEYS */
+ struct tree_mod_move move;
+
+ /* this is used for op == MOD_LOG_ROOT_REPLACE */
+ struct tree_mod_root old_root;
+};
+
+static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
+{
+ read_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
+{
+ read_unlock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
+{
+ write_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
+{
+ write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * Pull a new tree mod seq number for our operation.
+ */
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+ return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ tree_mod_log_write_lock(fs_info);
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!elem->seq) {
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ tree_mod_log_write_unlock(fs_info);
+
+ return elem->seq;
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct rb_node *next;
+ struct seq_list *cur_elem;
+ struct tree_mod_elem *tm;
+ u64 min_seq = (u64)-1;
+ u64 seq_putting = elem->seq;
+
+ if (!seq_putting)
+ return;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ list_del(&elem->list);
+ elem->seq = 0;
+
+ list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+ if (cur_elem->seq < min_seq) {
+ if (seq_putting > cur_elem->seq) {
+ /*
+ * blocker with lower sequence number exists, we
+ * cannot remove anything from the log
+ */
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return;
+ }
+ min_seq = cur_elem->seq;
+ }
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ /*
+ * anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+ tree_mod_log_write_lock(fs_info);
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+ tm = container_of(node, struct tree_mod_elem, node);
+ if (tm->seq > min_seq)
+ continue;
+ rb_erase(node, tm_root);
+ kfree(tm);
+ }
+ tree_mod_log_write_unlock(fs_info);
+}
+
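
A hedged usage sketch (not part of the patch; the function name is illustrative) of how a caller is expected to bracket a lookup with these two helpers. The zero-initialized seq_list is the only precondition the comment above asks for:

	static void example_tree_mod_seq_user(struct btrfs_fs_info *fs_info)
	{
		struct seq_list elem = {};	/* elem.seq must start out zero */
		u64 time_seq;

		/* register a blocker; entries newer than this stay in the log */
		time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);

		/* ... walk older versions of the tree using time_seq ... */

		/* drop the blocker so those entries may be pruned again */
		btrfs_put_tree_mod_seq(fs_info, &elem);
	}
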
+/*
+ * key order of the log:
+ * index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ *
+ * Note: must be called with write lock (tree_mod_log_write_lock).
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+ struct rb_root *tm_root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct tree_mod_elem *cur;
+
+ BUG_ON(!tm);
+
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+
+ tm_root = &fs_info->tree_mod_log;
+ new = &tm_root->rb_node;
+ while (*new) {
+ cur = container_of(*new, struct tree_mod_elem, node);
+ parent = *new;
+ if (cur->index < tm->index)
+ new = &((*new)->rb_left);
+ else if (cur->index > tm->index)
+ new = &((*new)->rb_right);
+ else if (cur->seq < tm->seq)
+ new = &((*new)->rb_left);
+ else if (cur->seq > tm->seq)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&tm->node, parent, new);
+ rb_insert_color(&tm->node, tm_root);
+ return 0;
+}
+
+/*
+ * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
+ * returns zero with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * call tree_mod_log_write_unlock() to release.
+ */
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb) {
+ smp_mb();
+ if (list_empty(&(fs_info)->tree_mod_seq_list))
+ return 1;
+ if (eb && btrfs_header_level(eb) == 0)
+ return 1;
+
+ tree_mod_log_write_lock(fs_info);
+ if (list_empty(&(fs_info)->tree_mod_seq_list)) {
+ tree_mod_log_write_unlock(fs_info);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
+static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ smp_mb();
+ if (list_empty(&(fs_info)->tree_mod_seq_list))
+ return 0;
+ if (eb && btrfs_header_level(eb) == 0)
+ return 0;
+
+ return 1;
+}
+
+static struct tree_mod_elem *
+alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm)
+ return NULL;
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ if (op != MOD_LOG_KEY_ADD) {
+ btrfs_node_key(eb, &tm->key, slot);
+ tm->blockptr = btrfs_node_blockptr(eb, slot);
+ }
+ tm->op = op;
+ tm->slot = slot;
+ tm->generation = btrfs_node_ptr_generation(eb, slot);
+ RB_CLEAR_NODE(&tm->node);
+
+ return tm;
+}
+
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ if (!tree_mod_need_log(fs_info, eb))
+ return 0;
+
+ tm = alloc_tree_mod_elem(eb, slot, op, flags);
+ if (!tm)
+ return -ENOMEM;
+
+ if (tree_mod_dont_log(fs_info, eb)) {
+ kfree(tm);
+ return 0;
+ }
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ tree_mod_log_write_unlock(fs_info);
+ if (ret)
+ kfree(tm);
+
+ return ret;
+}
+
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int dst_slot, int src_slot,
+ int nr_items, gfp_t flags)
+{
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int ret = 0;
+ int i;
+ int locked = 0;
+
+ if (!tree_mod_need_log(fs_info, eb))
+ return 0;
+
+ tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = MOD_LOG_MOVE_KEYS;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, eb))
+ goto free_tms;
+ locked = 1;
+
+ /*
+ * When we overwrite something during the move, we log these removals.
+ * This can only happen when we move towards the beginning of the
+ * buffer, i.e. dst_slot < src_slot.
+ */
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ if (ret)
+ goto free_tms;
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+
+ return 0;
+free_tms:
+ for (i = 0; i < nr_items; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+ kfree(tm);
+
+ return ret;
+}
+
+static inline int
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
+{
+ int i, j;
+ int ret;
+
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret) {
+ for (j = nritems - 1; j > i; j--)
+ rb_erase(&tm_list[j]->node,
+ &fs_info->tree_mod_log);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *old_root,
+ struct extent_buffer *new_root, gfp_t flags,
+ int log_removal)
+{
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int ret = 0;
+ int i;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (log_removal && btrfs_header_level(old_root) > 0) {
+ nritems = btrfs_header_nritems(old_root);
+ tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
+ flags);
+ if (!tm_list) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(old_root, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+ }
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+ tm->old_root.logical = old_root->start;
+ tm->old_root.level = btrfs_header_level(old_root);
+ tm->generation = btrfs_header_generation(old_root);
+ tm->op = MOD_LOG_ROOT_REPLACE;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+
+ if (tm_list)
+ ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ if (!ret)
+ ret = __tree_mod_log_insert(fs_info, tm);
+
+ tree_mod_log_write_unlock(fs_info);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return ret;
+
+free_tms:
+ if (tm_list) {
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+ }
+ kfree(tm);
+
+ return ret;
+}
+
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+ int smallest)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct tree_mod_elem *cur = NULL;
+ struct tree_mod_elem *found = NULL;
+ u64 index = start >> PAGE_CACHE_SHIFT;
+
+ tree_mod_log_read_lock(fs_info);
+ tm_root = &fs_info->tree_mod_log;
+ node = tm_root->rb_node;
+ while (node) {
+ cur = container_of(node, struct tree_mod_elem, node);
+ if (cur->index < index) {
+ node = node->rb_left;
+ } else if (cur->index > index) {
+ node = node->rb_right;
+ } else if (cur->seq < min_seq) {
+ node = node->rb_left;
+ } else if (!smallest) {
+ /* we want the node with the highest seq */
+ if (found)
+ BUG_ON(found->seq > cur->seq);
+ found = cur;
+ node = node->rb_left;
+ } else if (cur->seq > min_seq) {
+ /* we want the node with the smallest seq */
+ if (found)
+ BUG_ON(found->seq < cur->seq);
+ found = cur;
+ node = node->rb_right;
+ } else {
+ found = cur;
+ break;
+ }
+ }
+ tree_mod_log_read_unlock(fs_info);
+
+ return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+ u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+static noinline int
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ struct extent_buffer *src, unsigned long dst_offset,
+ unsigned long src_offset, int nr_items)
+{
+ int ret = 0;
+ struct tree_mod_elem **tm_list = NULL;
+ struct tree_mod_elem **tm_list_add, **tm_list_rem;
+ int i;
+ int locked = 0;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ return 0;
+
+ tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm_list_add = tm_list;
+ tm_list_rem = tm_list + nr_items;
+ for (i = 0; i < nr_items; i++) {
+ tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
+ MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ if (!tm_list_rem[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
+ MOD_LOG_KEY_ADD, GFP_NOFS);
+ if (!tm_list_add[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+ locked = 1;
+
+ for (i = 0; i < nr_items; i++) {
+ ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
+ if (ret)
+ goto free_tms;
+ ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items * 2; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+
+ return ret;
+}
+
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ int dst_offset, int src_offset, int nr_items)
+{
+ int ret;
+ ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
+ nr_items, GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static noinline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot, int atomic)
+{
+ int ret;
+
+ ret = tree_mod_log_insert_key(fs_info, eb, slot,
+ MOD_LOG_KEY_REPLACE,
+ atomic ? GFP_ATOMIC : GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static noinline int
+tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int i;
+ int ret = 0;
+
+ if (btrfs_header_level(eb) == 0)
+ return 0;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ nritems = btrfs_header_nritems(eb);
+ tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, eb))
+ goto free_tms;
+
+ ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ tree_mod_log_write_unlock(fs_info);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+
+ return ret;
+}
+
+static noinline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+ struct extent_buffer *new_root_node,
+ int log_removal)
+{
+ int ret;
+ ret = tree_mod_log_insert_root(root->fs_info, root->node,
+ new_root_node, GFP_NOFS, log_removal);
+ BUG_ON(ret < 0);
+}
+
/*
* check if the tree block can be shared by multiple trees
*/
@@ -263,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
* snapshot and the block was not allocated by tree relocation,
* we know the block is not shared.
*/
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
buf != root->node && buf != root->commit_root &&
(btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item) ||
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
return 1;
#endif
@@ -308,9 +1009,15 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(root, buf)) {
ret = btrfs_lookup_extent_info(trans, root, buf->start,
- buf->len, &refs, &flags);
- BUG_ON(ret);
- BUG_ON(refs == 0);
+ btrfs_header_level(buf), 1,
+ &refs, &flags);
+ if (ret)
+ return ret;
+ if (refs == 0) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
} else {
refs = 1;
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
@@ -328,43 +1035,46 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if ((owner == root->root_key.objectid ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
- ret = btrfs_inc_ref(trans, root, buf, 1);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, buf, 1, 1);
+ BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_dec_ref(trans, root, buf, 0);
- BUG_ON(ret);
- ret = btrfs_inc_ref(trans, root, cow, 1);
- BUG_ON(ret);
+ ret = btrfs_dec_ref(trans, root, buf, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
} else {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
if (new_flags != 0) {
+ int level = btrfs_header_level(buf);
+
ret = btrfs_set_disk_extent_flags(trans, root,
buf->start,
buf->len,
- new_flags, 0);
- BUG_ON(ret);
+ new_flags, level, 0);
+ if (ret)
+ return ret;
}
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
- BUG_ON(ret);
- ret = btrfs_dec_ref(trans, root, buf, 1);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_dec_ref(trans, root, buf, 1, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
clean_tree_block(trans, root, buf);
*last_ref = 1;
@@ -393,7 +1103,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
{
struct btrfs_disk_key disk_key;
struct extent_buffer *cow;
- int level;
+ int level, ret;
int last_ref = 0;
int unlock_orig = 0;
u64 parent_start;
@@ -403,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_assert_tree_locked(buf);
- WARN_ON(root->ref_cows && trans->transid !=
- root->fs_info->running_transaction->transid);
- WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -441,14 +1152,20 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, root->root_key.objectid);
- write_extent_buffer(cow, root->fs_info->fsid,
- (unsigned long)btrfs_header_fsid(cow),
+ write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
- update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
- if (root->ref_cows)
- btrfs_reloc_cow_block(trans, root, buf, cow);
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+ ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+ if (ret)
+ return ret;
+ }
if (buf == root->node) {
WARN_ON(parent && parent != buf);
@@ -458,10 +1175,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
parent_start = 0;
- spin_lock(&root->node_lock);
- root->node = cow;
extent_buffer_get(cow);
- spin_unlock(&root->node_lock);
+ tree_mod_log_set_root_pointer(root, cow, 1);
+ rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
last_ref);
@@ -474,30 +1190,345 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = 0;
WARN_ON(trans->transid != btrfs_header_generation(parent));
+ tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
+ if (last_ref) {
+ ret = tree_mod_log_free_eb(root->fs_info, buf);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ }
btrfs_free_tree_block(trans, root, buf, parent_start,
last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
- free_extent_buffer(buf);
+ free_extent_buffer_stale(buf);
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
return 0;
}
+/*
+ * returns the logical address of the oldest predecessor of the given root.
+ * entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb_root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct tree_mod_elem *found = NULL;
+ u64 root_logical = eb_root->start;
+ int looped = 0;
+
+ if (!time_seq)
+ return NULL;
+
+ /*
+ * the very last operation that's logged for a root is the replacement
+ * operation (if it is replaced at all). this has the index of the *new*
+ * root, making it the very first operation that's logged for this root.
+ */
+ while (1) {
+ tm = tree_mod_log_search_oldest(fs_info, root_logical,
+ time_seq);
+ if (!looped && !tm)
+ return NULL;
+ /*
+ * if there are no tree operations for the oldest root, we simply
+ * return it. this should only happen if that (old) root is at
+ * level 0.
+ */
+ if (!tm)
+ break;
+
+ /*
+ * if there's an operation that's not a root replacement, we
+ * found the oldest version of our root. normally, we'll find a
+ * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+ */
+ if (tm->op != MOD_LOG_ROOT_REPLACE)
+ break;
+
+ found = tm;
+ root_logical = tm->old_root.logical;
+ looped = 1;
+ }
+
+ /* if there's no old root to return, return what we found instead */
+ if (!found)
+ found = tm;
+
+ return found;
+}
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewound (until we reach something older than
+ * time_seq).
+ */
+static void
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ u64 time_seq, struct tree_mod_elem *first_tm)
+{
+ u32 n;
+ struct rb_node *next;
+ struct tree_mod_elem *tm = first_tm;
+ unsigned long o_dst;
+ unsigned long o_src;
+ unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+ n = btrfs_header_nritems(eb);
+ tree_mod_log_read_lock(fs_info);
+ while (tm && tm->seq >= time_seq) {
+ /*
+ * all the operations are recorded with the operator used for
+ * the modification. as we're going backwards, we do the
+ * opposite of each operation here.
+ */
+ switch (tm->op) {
+ case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+ BUG_ON(tm->slot < n);
+ /* Fallthrough */
+ case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case MOD_LOG_KEY_REMOVE:
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ n++;
+ break;
+ case MOD_LOG_KEY_REPLACE:
+ BUG_ON(tm->slot >= n);
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ break;
+ case MOD_LOG_KEY_ADD:
+ /* if a move operation is needed it's in the log */
+ n--;
+ break;
+ case MOD_LOG_MOVE_KEYS:
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ memmove_extent_buffer(eb, o_dst, o_src,
+ tm->move.nr_items * p_size);
+ break;
+ case MOD_LOG_ROOT_REPLACE:
+ /*
+ * this operation is special. for roots, this must be
+ * handled explicitly before rewinding.
+ * for non-roots, this operation may exist if the node
+ * was a root: root A -> child B; then A becomes empty and
+ * B is promoted to the new root. in the mod log, we'll
+ * have a root-replace operation for B, a tree block
+ * that is not a root. we simply ignore that operation.
+ */
+ break;
+ }
+ next = rb_next(&tm->node);
+ if (!next)
+ break;
+ tm = container_of(next, struct tree_mod_elem, node);
+ if (tm->index != first_tm->index)
+ break;
+ }
+ tree_mod_log_read_unlock(fs_info);
+ btrfs_set_header_nritems(eb, n);
+}
+
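
To summarize the switch above, a short cheat sheet (not part of the patch) of the inverse applied for each logged operation while walking backwards in time:

	/*
	 * Rewind inverses, as implemented by __tree_mod_log_rewind():
	 *
	 *   MOD_LOG_KEY_REMOVE*  -> put the removed key/blockptr back, n++
	 *   MOD_LOG_KEY_REPLACE  -> restore the previous key/blockptr in place
	 *   MOD_LOG_KEY_ADD      -> drop the added slot, n--
	 *   MOD_LOG_MOVE_KEYS    -> move the items back to their source slots
	 *   MOD_LOG_ROOT_REPLACE -> handled by callers before rewinding,
	 *                           ignored here
	 */
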
+/*
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+ struct extent_buffer *eb, u64 time_seq)
+{
+ struct extent_buffer *eb_rewin;
+ struct tree_mod_elem *tm;
+
+ if (!time_seq)
+ return eb;
+
+ if (btrfs_header_level(eb) == 0)
+ return eb;
+
+ tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+ if (!tm)
+ return eb;
+
+ btrfs_set_path_blocking(path);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+ if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ BUG_ON(tm->slot != 0);
+ eb_rewin = alloc_dummy_extent_buffer(eb->start,
+ fs_info->tree_root->nodesize);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ btrfs_set_header_bytenr(eb_rewin, eb->start);
+ btrfs_set_header_backref_rev(eb_rewin,
+ btrfs_header_backref_rev(eb));
+ btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+ } else {
+ eb_rewin = btrfs_clone_extent_buffer(eb);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ }
+
+ btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+
+ extent_buffer_get(eb_rewin);
+ btrfs_tree_read_lock(eb_rewin);
+ __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
+ WARN_ON(btrfs_header_nritems(eb_rewin) >
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
+
+ return eb_rewin;
+}
+
+/*
+ * get_old_root() rewinds the state of @root's root node to the given @time_seq
+ * value. If there are no changes, the current root->node is returned. If
+ * anything changed in between, there's a fresh buffer allocated on which the
+ * rewind operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct extent_buffer *eb = NULL;
+ struct extent_buffer *eb_root;
+ struct extent_buffer *old;
+ struct tree_mod_root *old_root = NULL;
+ u64 old_generation = 0;
+ u64 logical;
+ u32 blocksize;
+
+ eb_root = btrfs_read_lock_root_node(root);
+ tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+ if (!tm)
+ return eb_root;
+
+ if (tm->op == MOD_LOG_ROOT_REPLACE) {
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+ logical = old_root->logical;
+ } else {
+ logical = eb_root->start;
+ }
+
+ tm = tree_mod_log_search(root->fs_info, logical, time_seq);
+ if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ blocksize = btrfs_level_size(root, old_root->level);
+ old = read_tree_block(root, logical, blocksize, 0);
+ if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
+ free_extent_buffer(old);
+ btrfs_warn(root->fs_info,
+ "failed to read tree block %llu from get_old_root", logical);
+ } else {
+ eb = btrfs_clone_extent_buffer(old);
+ free_extent_buffer(old);
+ }
+ } else if (old_root) {
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+ } else {
+ btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+ eb = btrfs_clone_extent_buffer(eb_root);
+ btrfs_tree_read_unlock_blocking(eb_root);
+ free_extent_buffer(eb_root);
+ }
+
+ if (!eb)
+ return NULL;
+ extent_buffer_get(eb);
+ btrfs_tree_read_lock(eb);
+ if (old_root) {
+ btrfs_set_header_bytenr(eb, eb->start);
+ btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
+ }
+ if (tm)
+ __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
+ else
+ WARN_ON(btrfs_header_level(eb) != 0);
+ WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
+
+ return eb;
+}
+
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ int level;
+ struct extent_buffer *eb_root = btrfs_root_node(root);
+
+ tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+ if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
+ level = tm->old_root.level;
+ } else {
+ level = btrfs_header_level(eb_root);
+ }
+ free_extent_buffer(eb_root);
+
+ return level;
+}
+
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
+ /* ensure we can see the force_cow */
+ smp_rmb();
+
+ /*
+ * We do not need to cow a block if
+ * 1) this block is not created or changed in this transaction;
+ * 2) this block does not belong to TREE_RELOC tree;
+ * 3) the root is not forced COW.
+ *
+ * What is forced COW:
+ * when we create a snapshot during committing the transaction,
+ * after we've finished copying the src root, we must COW the shared
+ * block to ensure the metadata consistency.
+ */
if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+ !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
return 0;
return 1;
}
@@ -515,19 +1546,14 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
u64 search_start;
int ret;
- if (trans->transaction != root->fs_info->running_transaction) {
- printk(KERN_CRIT "trans %llu running %llu\n",
- (unsigned long long)trans->transid,
- (unsigned long long)
+ if (trans->transaction != root->fs_info->running_transaction)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
+ trans->transid,
root->fs_info->running_transaction->transid);
- WARN_ON(1);
- }
- if (trans->transid != root->fs_info->generation) {
- printk(KERN_CRIT "trans %llu running %llu\n",
- (unsigned long long)trans->transid,
- (unsigned long long)root->fs_info->generation);
- WARN_ON(1);
- }
+
+ if (trans->transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
+ trans->transid, root->fs_info->generation);
if (!should_cow_block(trans, root, buf)) {
*cow_ret = buf;
@@ -542,6 +1568,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0);
+
+ trace_btrfs_cow_block(root, buf, *cow_ret);
+
return ret;
}
@@ -597,7 +1626,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
*/
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, int cache_only, u64 *last_ret,
+ int start_slot, u64 *last_ret,
struct btrfs_key *progress)
{
struct extent_buffer *cur;
@@ -617,13 +1646,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
parent_level = btrfs_header_level(parent);
- if (cache_only && parent_level != 1)
- return 0;
- if (trans->transaction != root->fs_info->running_transaction)
- WARN_ON(1);
- if (trans->transid != root->fs_info->generation)
- WARN_ON(1);
+ WARN_ON(trans->transaction != root->fs_info->running_transaction);
+ WARN_ON(trans->transid != root->fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
blocksize = btrfs_level_size(root, parent_level - 1);
@@ -637,14 +1662,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
for (i = start_slot; i < end_slot; i++) {
int close = 1;
- if (!parent->map_token) {
- map_extent_buffer(parent,
- btrfs_node_key_ptr_offset(i),
- sizeof(struct btrfs_key_ptr),
- &parent->map_token, &parent->kaddr,
- &parent->map_start, &parent->map_len,
- KM_USER1);
- }
btrfs_node_key(parent, &disk_key, i);
if (!progress_passed && comp_keys(&disk_key, progress) < 0)
continue;
@@ -667,27 +1684,26 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
last_block = blocknr;
continue;
}
- if (parent->map_token) {
- unmap_extent_buffer(parent, parent->map_token,
- KM_USER1);
- parent->map_token = NULL;
- }
cur = btrfs_find_tree_block(root, blocknr, blocksize);
if (cur)
- uptodate = btrfs_buffer_uptodate(cur, gen);
+ uptodate = btrfs_buffer_uptodate(cur, gen, 0);
else
uptodate = 0;
if (!cur || !uptodate) {
- if (cache_only) {
- free_extent_buffer(cur);
- continue;
- }
if (!cur) {
cur = read_tree_block(root, blocknr,
blocksize, gen);
+ if (!cur || !extent_buffer_uptodate(cur)) {
+ free_extent_buffer(cur);
+ return -EIO;
+ }
} else if (!uptodate) {
- btrfs_read_buffer(cur, gen);
+ err = btrfs_read_buffer(cur, gen);
+ if (err) {
+ free_extent_buffer(cur);
+ return err;
+ }
}
}
if (search_start == 0)
@@ -710,11 +1726,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
}
- if (parent->map_token) {
- unmap_extent_buffer(parent, parent->map_token,
- KM_USER1);
- parent->map_token = NULL;
- }
return err;
}
@@ -732,122 +1743,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
return btrfs_item_offset_nr(leaf, nr - 1);
}
-/*
- * extra debugging checks to make sure all the items in a key are
- * well formed and in the proper order
- */
-static int check_node(struct btrfs_root *root, struct btrfs_path *path,
- int level)
-{
- struct extent_buffer *parent = NULL;
- struct extent_buffer *node = path->nodes[level];
- struct btrfs_disk_key parent_key;
- struct btrfs_disk_key node_key;
- int parent_slot;
- int slot;
- struct btrfs_key cpukey;
- u32 nritems = btrfs_header_nritems(node);
-
- if (path->nodes[level + 1])
- parent = path->nodes[level + 1];
-
- slot = path->slots[level];
- BUG_ON(nritems == 0);
- if (parent) {
- parent_slot = path->slots[level + 1];
- btrfs_node_key(parent, &parent_key, parent_slot);
- btrfs_node_key(node, &node_key, 0);
- BUG_ON(memcmp(&parent_key, &node_key,
- sizeof(struct btrfs_disk_key)));
- BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
- btrfs_header_bytenr(node));
- }
- BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
- if (slot != 0) {
- btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
- btrfs_node_key(node, &node_key, slot);
- BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
- }
- if (slot < nritems - 1) {
- btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
- btrfs_node_key(node, &node_key, slot);
- BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
- }
- return 0;
-}
-
-/*
- * extra checking to make sure all the items in a leaf are
- * well formed and in the proper order
- */
-static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
- int level)
-{
- struct extent_buffer *leaf = path->nodes[level];
- struct extent_buffer *parent = NULL;
- int parent_slot;
- struct btrfs_key cpukey;
- struct btrfs_disk_key parent_key;
- struct btrfs_disk_key leaf_key;
- int slot = path->slots[0];
-
- u32 nritems = btrfs_header_nritems(leaf);
-
- if (path->nodes[level + 1])
- parent = path->nodes[level + 1];
-
- if (nritems == 0)
- return 0;
-
- if (parent) {
- parent_slot = path->slots[level + 1];
- btrfs_node_key(parent, &parent_key, parent_slot);
- btrfs_item_key(leaf, &leaf_key, 0);
-
- BUG_ON(memcmp(&parent_key, &leaf_key,
- sizeof(struct btrfs_disk_key)));
- BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
- btrfs_header_bytenr(leaf));
- }
- if (slot != 0 && slot < nritems - 1) {
- btrfs_item_key(leaf, &leaf_key, slot);
- btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
- if (comp_keys(&leaf_key, &cpukey) <= 0) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d offset bad key\n", slot);
- BUG_ON(1);
- }
- if (btrfs_item_offset_nr(leaf, slot - 1) !=
- btrfs_item_end_nr(leaf, slot)) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d offset bad\n", slot);
- BUG_ON(1);
- }
- }
- if (slot < nritems - 1) {
- btrfs_item_key(leaf, &leaf_key, slot);
- btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
- BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
- if (btrfs_item_offset_nr(leaf, slot) !=
- btrfs_item_end_nr(leaf, slot + 1)) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d offset bad\n", slot);
- BUG_ON(1);
- }
- }
- BUG_ON(btrfs_item_offset_nr(leaf, 0) +
- btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
- return 0;
-}
-
-static noinline int check_block(struct btrfs_root *root,
- struct btrfs_path *path, int level)
-{
- return 0;
- if (level == 0)
- return check_leaf(root, path, level);
- return check_node(root, path, level);
-}
/*
* search for key in the extent_buffer. The items start at offset p,
@@ -871,7 +1766,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
struct btrfs_disk_key *tmp = NULL;
struct btrfs_disk_key unaligned;
unsigned long offset;
- char *map_token = NULL;
char *kaddr = NULL;
unsigned long map_start = 0;
unsigned long map_len = 0;
@@ -881,18 +1775,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
mid = (low + high) / 2;
offset = p + mid * item_size;
- if (!map_token || offset < map_start ||
+ if (!kaddr || offset < map_start ||
(offset + sizeof(struct btrfs_disk_key)) >
map_start + map_len) {
- if (map_token) {
- unmap_extent_buffer(eb, map_token, KM_USER0);
- map_token = NULL;
- }
err = map_private_extent_buffer(eb, offset,
sizeof(struct btrfs_disk_key),
- &map_token, &kaddr,
- &map_start, &map_len, KM_USER0);
+ &kaddr, &map_start, &map_len);
if (!err) {
tmp = (struct btrfs_disk_key *)(kaddr + offset -
@@ -915,14 +1804,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
high = mid;
else {
*slot = mid;
- if (map_token)
- unmap_extent_buffer(eb, map_token, KM_USER0);
return 0;
}
}
*slot = low;
- if (map_token)
- unmap_extent_buffer(eb, map_token, KM_USER0);
return 1;
}
@@ -933,20 +1818,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot)
{
- if (level == 0) {
+ if (level == 0)
return generic_bin_search(eb,
offsetof(struct btrfs_leaf, items),
sizeof(struct btrfs_item),
key, btrfs_header_nritems(eb),
slot);
- } else {
+ else
return generic_bin_search(eb,
offsetof(struct btrfs_node, ptrs),
sizeof(struct btrfs_key_ptr),
key, btrfs_header_nritems(eb),
slot);
- }
- return -1;
}
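
For reference, a simplified sketch (not from the patch) of the same binary search that generic_bin_search() performs, shown over a plain in-memory array of fixed-size items; bin_search_array() and comp() are illustrative names standing in for the extent-buffer mapping and comp_keys():

	#include <stddef.h>

	static int bin_search_array(const void *items, size_t item_size,
				    unsigned int nritems, const void *key,
				    int (*comp)(const void *item, const void *key),
				    unsigned int *slot)
	{
		const char *base = items;
		unsigned int low = 0, high = nritems;

		while (low < high) {
			unsigned int mid = (low + high) / 2;
			int ret = comp(base + mid * item_size, key);

			if (ret < 0)
				low = mid + 1;		/* item is smaller, go right */
			else if (ret > 0)
				high = mid;		/* item is bigger, go left */
			else {
				*slot = mid;		/* exact match */
				return 0;
			}
		}
		*slot = low;				/* insertion point */
		return 1;				/* not found */
	}
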
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -979,6 +1862,8 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
struct extent_buffer *parent, int slot)
{
int level = btrfs_header_level(parent);
+ struct extent_buffer *eb;
+
if (slot < 0)
return NULL;
if (slot >= btrfs_header_nritems(parent))
@@ -986,9 +1871,15 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
BUG_ON(level == 0);
- return read_tree_block(root, btrfs_node_blockptr(parent, slot),
- btrfs_level_size(root, level - 1),
- btrfs_node_ptr_generation(parent, slot));
+ eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
+ btrfs_level_size(root, level - 1),
+ btrfs_node_ptr_generation(parent, slot));
+ if (eb && !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ eb = NULL;
+ }
+
+ return eb;
}
/*
@@ -1015,14 +1906,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
- WARN_ON(!path->locks[level]);
+ WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
+ path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
WARN_ON(btrfs_header_generation(mid) != trans->transid);
orig_ptr = btrfs_node_blockptr(mid, orig_slot);
- if (level < BTRFS_MAX_LEVEL - 1)
+ if (level < BTRFS_MAX_LEVEL - 1) {
parent = path->nodes[level + 1];
- pslot = path->slots[level + 1];
+ pslot = path->slots[level + 1];
+ }
/*
* deal with the case where there is only one pointer in the root
@@ -1036,7 +1929,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* promote the child to a root */
child = read_node_slot(root, mid, 0);
- BUG_ON(!child);
+ if (!child) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
+
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
@@ -1046,9 +1944,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
- spin_lock(&root->node_lock);
- root->node = child;
- spin_unlock(&root->node_lock);
+ tree_mod_log_set_root_pointer(root, child, 1);
+ rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
btrfs_tree_unlock(child);
@@ -1063,15 +1960,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
/* once for the root ptr */
- free_extent_buffer(mid);
+ free_extent_buffer_stale(mid);
return 0;
}
if (btrfs_header_nritems(mid) >
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
- btrfs_header_nritems(mid);
-
left = read_node_slot(root, parent, pslot - 1);
if (left) {
btrfs_tree_lock(left);
@@ -1101,7 +1996,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
wret = push_node_left(trans, root, left, mid, 1);
if (wret < 0)
ret = wret;
- btrfs_header_nritems(mid);
}
/*
@@ -1114,17 +2008,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- wret = del_ptr(trans, root, path, level + 1, pslot +
- 1);
- if (wret)
- ret = wret;
+ del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
btrfs_free_tree_block(trans, root, right, 0, 1);
- free_extent_buffer(right);
+ free_extent_buffer_stale(right);
right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ pslot + 1, 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -1139,7 +2032,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
* otherwise we would have pulled some pointers from the
* right
*/
- BUG_ON(!left);
+ if (!left) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
wret = balance_node_right(trans, root, mid, left);
if (wret < 0) {
ret = wret;
@@ -1155,17 +2052,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- wret = del_ptr(trans, root, path, level + 1, pslot);
- if (wret)
- ret = wret;
+ del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
- free_extent_buffer(mid);
+ free_extent_buffer_stale(mid);
mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ pslot, 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
}
@@ -1188,7 +2085,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
}
/* double check we haven't messed things up */
- check_block(root, path, level);
if (orig_ptr !=
btrfs_node_blockptr(path->nodes[level], path->slots[level]))
BUG();
@@ -1228,9 +2124,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
WARN_ON(btrfs_header_generation(mid) != trans->transid);
- if (level < BTRFS_MAX_LEVEL - 1)
+ if (level < BTRFS_MAX_LEVEL - 1) {
parent = path->nodes[level + 1];
- pslot = path->slots[level + 1];
+ pslot = path->slots[level + 1];
+ }
if (!parent)
return 1;
@@ -1263,6 +2160,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ pslot, 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -1314,6 +2213,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ pslot + 1, 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1350,6 +2251,7 @@ static void reada_for_search(struct btrfs_root *root,
u64 search;
u64 target;
u64 nread = 0;
+ u64 gen;
int direction = path->reada;
struct extent_buffer *eb;
u32 nr;
@@ -1376,6 +2278,7 @@ static void reada_for_search(struct btrfs_root *root,
nritems = btrfs_header_nritems(node);
nr = slot;
+
while (1) {
if (direction < 0) {
if (nr == 0)
@@ -1394,8 +2297,8 @@ static void reada_for_search(struct btrfs_root *root,
search = btrfs_node_blockptr(node, nr);
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
- readahead_tree_block(root, search, blocksize,
- btrfs_node_ptr_generation(node, nr));
+ gen = btrfs_node_ptr_generation(node, nr);
+ readahead_tree_block(root, search, blocksize, gen);
nread += blocksize;
}
nscan++;
@@ -1404,12 +2307,8 @@ static void reada_for_search(struct btrfs_root *root,
}
}
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
- struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+ struct btrfs_path *path, int level)
{
int slot;
int nritems;
@@ -1418,12 +2317,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
u64 gen;
u64 block1 = 0;
u64 block2 = 0;
- int ret = 0;
int blocksize;
parent = path->nodes[level + 1];
if (!parent)
- return 0;
+ return;
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
@@ -1433,7 +2331,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
block1 = btrfs_node_blockptr(parent, slot - 1);
gen = btrfs_node_ptr_generation(parent, slot - 1);
eb = btrfs_find_tree_block(root, block1, blocksize);
- if (eb && btrfs_buffer_uptodate(eb, gen))
+ /*
+ * if we get -EAGAIN from btrfs_buffer_uptodate, we
+ * don't want to return EAGAIN here. That will loop
+ * forever
+ */
+ if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block1 = 0;
free_extent_buffer(eb);
}
@@ -1441,32 +2344,15 @@ static noinline int reada_for_balance(struct btrfs_root *root,
block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1);
eb = btrfs_find_tree_block(root, block2, blocksize);
- if (eb && btrfs_buffer_uptodate(eb, gen))
+ if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block2 = 0;
free_extent_buffer(eb);
}
- if (block1 || block2) {
- ret = -EAGAIN;
-
- /* release the whole path */
- btrfs_release_path(root, path);
- /* read the blocks */
- if (block1)
- readahead_tree_block(root, block1, blocksize, 0);
- if (block2)
- readahead_tree_block(root, block2, blocksize, 0);
-
- if (block1) {
- eb = read_tree_block(root, block1, blocksize, 0);
- free_extent_buffer(eb);
- }
- if (block2) {
- eb = read_tree_block(root, block2, blocksize, 0);
- free_extent_buffer(eb);
- }
- }
- return ret;
+ if (block1)
+ readahead_tree_block(root, block1, blocksize, 0);
+ if (block2)
+ readahead_tree_block(root, block2, blocksize, 0);
}
@@ -1484,7 +2370,8 @@ static noinline int reada_for_balance(struct btrfs_root *root,
* if lowest_unlock is 1, level 0 won't be unlocked
*/
static noinline void unlock_up(struct btrfs_path *path, int level,
- int lowest_unlock)
+ int lowest_unlock, int min_write_lock_level,
+ int *write_lock_level)
{
int i;
int skip_level = level;
@@ -1514,8 +2401,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
- btrfs_tree_unlock(t);
+ btrfs_tree_unlock_rw(t, path->locks[i]);
path->locks[i] = 0;
+ if (write_lock_level &&
+ i > min_write_lock_level &&
+ i <= *write_lock_level) {
+ *write_lock_level = i - 1;
+ }
}
}
}
@@ -1541,7 +2433,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
continue;
if (!path->locks[i])
continue;
- btrfs_tree_unlock(path->nodes[i]);
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
path->locks[i] = 0;
}
}
@@ -1558,7 +2450,7 @@ static int
read_block_for_search(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *p,
struct extent_buffer **eb_ret, int level, int slot,
- struct btrfs_key *key)
+ struct btrfs_key *key, u64 time_seq)
{
u64 blocknr;
u64 gen;
@@ -1573,32 +2465,29 @@ read_block_for_search(struct btrfs_trans_handle *trans,
tmp = btrfs_find_tree_block(root, blocknr, blocksize);
if (tmp) {
- if (btrfs_buffer_uptodate(tmp, 0)) {
- if (btrfs_buffer_uptodate(tmp, gen)) {
- /*
- * we found an up to date block without
- * sleeping, return
- * right away
- */
- *eb_ret = tmp;
- return 0;
- }
- /* the pages were up to date, but we failed
- * the generation number check. Do a full
- * read for the generation number that is correct.
- * We must do this without dropping locks so
- * we can trust our generation number
- */
- free_extent_buffer(tmp);
- tmp = read_tree_block(root, blocknr, blocksize, gen);
- if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
- *eb_ret = tmp;
- return 0;
- }
- free_extent_buffer(tmp);
- btrfs_release_path(NULL, p);
- return -EIO;
+ /* first we do an atomic uptodate check */
+ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ *eb_ret = tmp;
+ return 0;
+ }
+
+ /* the pages were up to date, but we failed
+ * the generation number check. Do a full
+ * read for the generation number that is correct.
+ * We must do this without dropping locks so
+ * we can trust our generation number
+ */
+ btrfs_set_path_blocking(p);
+
+ /* now we're allowed to do a blocking uptodate check */
+ ret = btrfs_read_buffer(tmp, gen);
+ if (!ret) {
+ *eb_ret = tmp;
+ return 0;
}
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
/*
@@ -1615,7 +2504,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
if (p->reada)
reada_for_search(root, p, level, slot, key->objectid);
- btrfs_release_path(NULL, p);
+ btrfs_release_path(p);
ret = -EAGAIN;
tmp = read_tree_block(root, blocknr, blocksize, 0);
@@ -1626,7 +2515,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!btrfs_buffer_uptodate(tmp, 0))
+ if (!btrfs_buffer_uptodate(tmp, 0, 0))
ret = -EIO;
free_extent_buffer(tmp);
}
@@ -1645,20 +2534,24 @@ read_block_for_search(struct btrfs_trans_handle *trans,
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *p,
- struct extent_buffer *b, int level, int ins_len)
+ struct extent_buffer *b, int level, int ins_len,
+ int *write_lock_level)
{
int ret;
if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
int sret;
- sret = reada_for_balance(root, p, level);
- if (sret)
+ if (*write_lock_level < level + 1) {
+ *write_lock_level = level + 1;
+ btrfs_release_path(p);
goto again;
+ }
btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
sret = split_node(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL);
+ btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(sret > 0);
if (sret) {
@@ -1670,13 +2563,16 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
int sret;
- sret = reada_for_balance(root, p, level);
- if (sret)
+ if (*write_lock_level < level + 1) {
+ *write_lock_level = level + 1;
+ btrfs_release_path(p);
goto again;
+ }
btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
sret = balance_level(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL);
+ btrfs_clear_path_blocking(p, NULL, 0);
if (sret) {
ret = sret;
@@ -1684,7 +2580,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
}
b = p->nodes[level];
if (!b) {
- btrfs_release_path(NULL, p);
+ btrfs_release_path(p);
goto again;
}
BUG_ON(btrfs_header_nritems(b) == 1);
@@ -1697,6 +2593,83 @@ done:
return ret;
}
+static void key_search_validate(struct extent_buffer *b,
+ struct btrfs_key *key,
+ int level)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+ struct btrfs_disk_key disk_key;
+
+ btrfs_cpu_key_to_disk(&disk_key, key);
+
+ if (level == 0)
+ ASSERT(!memcmp_extent_buffer(b, &disk_key,
+ offsetof(struct btrfs_leaf, items[0].key),
+ sizeof(disk_key)));
+ else
+ ASSERT(!memcmp_extent_buffer(b, &disk_key,
+ offsetof(struct btrfs_node, ptrs[0].key),
+ sizeof(disk_key)));
+#endif
+}
+
+static int key_search(struct extent_buffer *b, struct btrfs_key *key,
+ int level, int *prev_cmp, int *slot)
+{
+ if (*prev_cmp != 0) {
+ *prev_cmp = bin_search(b, key, level, slot);
+ return *prev_cmp;
+ }
+
+ key_search_validate(b, key, level);
+ *slot = 0;
+
+ return 0;
+}
+
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
+ u64 iobjectid, u64 ioff, u8 key_type,
+ struct btrfs_key *found_key)
+{
+ int ret;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_path *path;
+
+ key.type = key_type;
+ key.objectid = iobjectid;
+ key.offset = ioff;
+
+ if (found_path == NULL) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ } else
+ path = found_path;
+
+ ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ if ((ret < 0) || (found_key == NULL)) {
+ if (path != found_path)
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ eb = path->nodes[0];
+ if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(fs_root, path);
+ if (ret)
+ return ret;
+ eb = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
+ if (found_key->type != key.type ||
+ found_key->objectid != key.objectid)
+ return 1;
+
+ return 0;
+}
+
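As a brief editorial aside (not part of the patch): a minimal sketch of how a caller might use the new btrfs_find_item() helper with a caller-owned path. The names fs_root, ino and parent_ino are placeholders, and BTRFS_INODE_REF_KEY is just one plausible key type.

	struct btrfs_key found;
	struct btrfs_path *path = btrfs_alloc_path();
	int ret;

	if (!path)
		return -ENOMEM;
	ret = btrfs_find_item(fs_root, path, ino, parent_ino,
			      BTRFS_INODE_REF_KEY, &found);
	/* 0: a matching objectid/type was found, 'found' and path->slots[0]
	 * describe it; 1: no such item; < 0: error from the search */
	btrfs_free_path(path);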
/*
* look for key in the tree. path is filled in with nodes along the way
* if key is found, we return zero and you can find the item in the leaf
@@ -1720,27 +2693,88 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
int err;
int level;
int lowest_unlock = 1;
+ int root_lock;
+ /* everything at write_lock_level or lower must be write locked */
+ int write_lock_level = 0;
u8 lowest_level = 0;
+ int min_write_lock_level;
+ int prev_cmp;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
WARN_ON(p->nodes[0] != NULL);
+ BUG_ON(!cow && ins_len);
- if (ins_len < 0)
+ if (ins_len < 0) {
lowest_unlock = 2;
+ /* when we are removing items, we might have to go up to level
+ * two as we update tree pointers. Make sure we keep write
+ * locks for those levels as well
+ */
+ write_lock_level = 2;
+ } else if (ins_len > 0) {
+ /*
+ * for inserting items, make sure we have a write lock on
+ * level 1 so we can update keys
+ */
+ write_lock_level = 1;
+ }
+
+ if (!cow)
+ write_lock_level = -1;
+
+ if (cow && (p->keep_locks || p->lowest_level))
+ write_lock_level = BTRFS_MAX_LEVEL;
+
+ min_write_lock_level = write_lock_level;
+
again:
+ prev_cmp = -1;
+ /*
+ * we try very hard to do read locks on the root
+ */
+ root_lock = BTRFS_READ_LOCK;
+ level = 0;
if (p->search_commit_root) {
+ /*
+ * the commit roots are read only
+ * so we always do read locks
+ */
+ if (p->need_commit_sem)
+ down_read(&root->fs_info->commit_root_sem);
b = root->commit_root;
extent_buffer_get(b);
+ level = btrfs_header_level(b);
+ if (p->need_commit_sem)
+ up_read(&root->fs_info->commit_root_sem);
if (!p->skip_locking)
- btrfs_tree_lock(b);
+ btrfs_tree_read_lock(b);
} else {
- if (p->skip_locking)
+ if (p->skip_locking) {
b = btrfs_root_node(root);
- else
- b = btrfs_lock_root_node(root);
+ level = btrfs_header_level(b);
+ } else {
+ /* we don't know the level of the root node
+ * until we actually have it read locked
+ */
+ b = btrfs_read_lock_root_node(root);
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ /* whoops, must trade for write lock */
+ btrfs_tree_read_unlock(b);
+ free_extent_buffer(b);
+ b = btrfs_lock_root_node(root);
+ root_lock = BTRFS_WRITE_LOCK;
+
+ /* the level might have changed, check again */
+ level = btrfs_header_level(b);
+ }
+ }
}
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = root_lock;
while (b) {
level = btrfs_header_level(b);
@@ -1749,10 +2783,6 @@ again:
* setup the path here so we can release it under lock
* contention with the cow code
*/
- p->nodes[level] = b;
- if (!p->skip_locking)
- p->locks[level] = 1;
-
if (cow) {
/*
* if we don't really need to cow this block
@@ -1764,6 +2794,19 @@ again:
btrfs_set_path_blocking(p);
+ /*
+ * must have write locks on this node and the
+ * parent
+ */
+ if (level > write_lock_level ||
+ (level + 1 > write_lock_level &&
+ level + 1 < BTRFS_MAX_LEVEL &&
+ p->nodes[level + 1])) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
p->slots[level + 1], &b);
@@ -1773,16 +2816,8 @@ again:
}
}
cow_done:
- BUG_ON(!cow && ins_len);
- if (level != btrfs_header_level(b))
- WARN_ON(1);
- level = btrfs_header_level(b);
-
p->nodes[level] = b;
- if (!p->skip_locking)
- p->locks[level] = 1;
-
- btrfs_clear_path_blocking(p, NULL);
+ btrfs_clear_path_blocking(p, NULL, 0);
/*
* we have a lock on b and as long as we aren't changing
@@ -1790,21 +2825,21 @@ cow_done:
* It is safe to drop the lock on our parent before we
* go through the expensive btree search on b.
*
- * If cow is true, then we might be changing slot zero,
- * which may require changing the parent. So, we can't
- * drop the lock until after we know which slot we're
- * operating on.
+ * If we're inserting or deleting (ins_len != 0), then we might
+ * be changing slot zero, which may require changing the parent.
+ * So, we can't drop the lock until after we know which slot
+ * we're operating on.
*/
- if (!cow)
- btrfs_unlock_up_safe(p, level + 1);
+ if (!ins_len && !p->keep_locks) {
+ int u = level + 1;
- ret = check_block(root, p, level);
- if (ret) {
- ret = -1;
- goto done;
+ if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
+ btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
+ p->locks[u] = 0;
+ }
}
- ret = bin_search(b, key, level, &slot);
+ ret = key_search(b, key, level, &prev_cmp, &slot);
if (level != 0) {
int dec = 0;
@@ -1814,7 +2849,7 @@ cow_done:
}
p->slots[level] = slot;
err = setup_nodes_for_search(trans, root, p, b, level,
- ins_len);
+ ins_len, &write_lock_level);
if (err == -EAGAIN)
goto again;
if (err) {
@@ -1824,7 +2859,21 @@ cow_done:
b = p->nodes[level];
slot = p->slots[level];
- unlock_up(p, level, lowest_unlock);
+ /*
+ * slot 0 is special, if we change the key
+ * we have to update the parent pointer
+ * which means we must have a write lock
+ * on the parent
+ */
+ if (slot == 0 && ins_len &&
+ write_lock_level < level + 1) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
if (level == lowest_level) {
if (dec)
@@ -1833,7 +2882,7 @@ cow_done:
}
err = read_block_for_search(trans, root, p,
- &b, level, slot, key);
+ &b, level, slot, key, 0);
if (err == -EAGAIN)
goto again;
if (err) {
@@ -1842,23 +2891,42 @@ cow_done:
}
if (!p->skip_locking) {
- btrfs_clear_path_blocking(p, NULL);
- err = btrfs_try_spin_lock(b);
-
- if (!err) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- btrfs_clear_path_blocking(p, b);
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ err = btrfs_try_tree_write_lock(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_WRITE_LOCK);
+ }
+ p->locks[level] = BTRFS_WRITE_LOCK;
+ } else {
+ err = btrfs_try_tree_read_lock(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_READ_LOCK);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
}
+ p->nodes[level] = b;
}
} else {
p->slots[level] = slot;
if (ins_len > 0 &&
btrfs_leaf_free_space(root, b) < ins_len) {
+ if (write_lock_level < 1) {
+ write_lock_level = 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
btrfs_set_path_blocking(p);
err = split_leaf(trans, root, key,
p, ins_len, ret == 0);
- btrfs_clear_path_blocking(p, NULL);
+ btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(err > 0);
if (err) {
@@ -1867,7 +2935,8 @@ cow_done:
}
}
if (!p->search_for_split)
- unlock_up(p, level, lowest_unlock);
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
goto done;
}
}
@@ -1880,26 +2949,209 @@ done:
if (!p->leave_spinning)
btrfs_set_path_blocking(p);
if (ret < 0)
- btrfs_release_path(root, p);
+ btrfs_release_path(p);
return ret;
}
/*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq)
+{
+ struct extent_buffer *b;
+ int slot;
+ int ret;
+ int err;
+ int level;
+ int lowest_unlock = 1;
+ u8 lowest_level = 0;
+ int prev_cmp = -1;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(p->nodes[0] != NULL);
+
+ if (p->search_commit_root) {
+ BUG_ON(time_seq);
+ return btrfs_search_slot(NULL, root, key, p, 0, 0);
+ }
+
+again:
+ b = get_old_root(root, time_seq);
+ level = btrfs_header_level(b);
+ p->locks[level] = BTRFS_READ_LOCK;
+
+ while (b) {
+ level = btrfs_header_level(b);
+ p->nodes[level] = b;
+ btrfs_clear_path_blocking(p, NULL, 0);
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ */
+ btrfs_unlock_up_safe(p, level + 1);
+
+ /*
+ * Since we can unwind eb's we want to do a real search every
+ * time.
+ */
+ prev_cmp = -1;
+ ret = key_search(b, key, level, &prev_cmp, &slot);
+
+ if (level != 0) {
+ int dec = 0;
+ if (ret && slot > 0) {
+ dec = 1;
+ slot -= 1;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(NULL, root, p, &b, level,
+ slot, key, time_seq);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ err = btrfs_try_tree_read_lock(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_READ_LOCK);
+ }
+ b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
+ if (!b) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
+ } else {
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+ goto done;
+ }
+ }
+ ret = 1;
+done:
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
+ if (ret < 0)
+ btrfs_release_path(p);
+
+ return ret;
+}
+
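Editorial sketch, not part of the patch: a read-only lookup against an older version of the tree. Here tree_mod_seq stands in for a sequence number obtained while the tree mod log is pinned (for example via btrfs_get_tree_mod_seq()); the surrounding variable names are illustrative.

	struct btrfs_path *path = btrfs_alloc_path();
	int ret;

	if (!path)
		return -ENOMEM;
	ret = btrfs_search_old_slot(root, &key, path, tree_mod_seq);
	/* same convention as btrfs_search_slot() with cow == 0 and ins_len == 0:
	 * 0 exact match, 1 not found (slot at the next larger key), < 0 error */
	btrfs_free_path(path);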
+/*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int find_higher, int return_any)
+{
+ int ret;
+ struct extent_buffer *leaf;
+
+again:
+ ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+ if (ret <= 0)
+ return ret;
+ /*
+ * a return value of 1 means the path is at the position where the
+ * item should be inserted. Normally this is the next bigger item,
+ * but in case the previous item is the last in a leaf, path points
+ * to the first free slot in the previous leaf, i.e. at an invalid
+ * item.
+ */
+ leaf = p->nodes[0];
+
+ if (find_higher) {
+ if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, p);
+ if (ret <= 0)
+ return ret;
+ if (!return_any)
+ return 1;
+ /*
+ * no higher item found, return the next
+ * lower instead
+ */
+ return_any = 0;
+ find_higher = 0;
+ btrfs_release_path(p);
+ goto again;
+ }
+ } else {
+ if (p->slots[0] == 0) {
+ ret = btrfs_prev_leaf(root, p);
+ if (ret < 0)
+ return ret;
+ if (!ret) {
+ leaf = p->nodes[0];
+ if (p->slots[0] == btrfs_header_nritems(leaf))
+ p->slots[0]--;
+ return 0;
+ }
+ if (!return_any)
+ return 1;
+ /*
+ * no lower item found, return the next
+ * higher instead
+ */
+ return_any = 0;
+ find_higher = 1;
+ btrfs_release_path(p);
+ goto again;
+ } else {
+ --p->slots[0];
+ }
+ }
+ return 0;
+}
+
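Editorial sketch (not part of the patch) of the calling convention just added: ask for the first item at or after 'key', falling back to the closest lower item when nothing higher exists. 'path' and 'found_key' are assumed to be set up by a hypothetical caller.

	ret = btrfs_search_slot_for_read(root, &key, path,
					 1 /* find_higher */, 1 /* return_any */);
	if (ret < 0)
		return ret;		/* search error */
	if (ret > 0)
		return 0;		/* tree is empty, nothing to visit */
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);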
+/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node points to 'key'.
* This is used after shifting pointers to the left, so it stops
* fixing up pointers when a given leaf/node is not in slot 0 of the
* higher levels
*
- * If this fails to write a tree block, it returns -1, but continues
- * fixing up the blocks in ram so the tree is consistent.
*/
-static int fixup_low_keys(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_disk_key *key, int level)
+static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, int level)
{
int i;
- int ret = 0;
struct extent_buffer *t;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1907,12 +3159,12 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
if (!path->nodes[i])
break;
t = path->nodes[i];
+ tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
break;
}
- return ret;
}
/*
@@ -1921,9 +3173,8 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *new_key)
+void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *eb;
@@ -1933,21 +3184,18 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
- if (comp_keys(&disk_key, new_key) >= 0)
- return -1;
+ BUG_ON(comp_keys(&disk_key, new_key) >= 0);
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
- if (comp_keys(&disk_key, new_key) <= 0)
- return -1;
+ BUG_ON(comp_keys(&disk_key, new_key) <= 0);
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(eb, &disk_key, slot);
btrfs_mark_buffer_dirty(eb);
if (slot == 0)
- fixup_low_keys(trans, root, path, &disk_key, 1);
- return 0;
+ fixup_low_keys(root, path, &disk_key, 1);
}
/*
@@ -1993,12 +3241,22 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
+ ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+ push_items);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(dst_nritems),
btrfs_node_key_ptr_offset(0),
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
+ /*
+ * don't call tree_mod_log_eb_move here, key removal was already
+ * fully logged by tree_mod_log_eb_copy above.
+ */
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
(src_nritems - push_items) *
@@ -2052,11 +3310,18 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
+ tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
+ ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+ src_nritems - push_items, push_items);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2113,13 +3378,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(c, root->root_key.objectid);
- write_extent_buffer(c, root->fs_info->fsid,
- (unsigned long)btrfs_header_fsid(c),
+ write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
- (unsigned long)btrfs_header_chunk_tree_uuid(c),
- BTRFS_UUID_SIZE);
+ btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
btrfs_set_node_key(c, &lower_key, 0);
btrfs_set_node_blockptr(c, 0, lower->start);
@@ -2130,10 +3393,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
- spin_lock(&root->node_lock);
old = root->node;
- root->node = c;
- spin_unlock(&root->node_lock);
+ tree_mod_log_set_root_pointer(root, c, 0);
+ rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
free_extent_buffer(old);
@@ -2141,7 +3403,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
extent_buffer_get(c);
path->nodes[level] = c;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
return 0;
}
@@ -2152,36 +3414,42 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
*
* slot and level indicate where you want the key to go, and
* blocknr is the block the key points to.
- *
- * returns zero on success and < 0 on any error
*/
-static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, struct btrfs_disk_key
- *key, u64 bytenr, int slot, int level)
+static void insert_ptr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, u64 bytenr,
+ int slot, int level)
{
struct extent_buffer *lower;
int nritems;
+ int ret;
BUG_ON(!path->nodes[level]);
btrfs_assert_tree_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
- if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
- BUG();
+ BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
if (slot != nritems) {
+ if (level)
+ tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+ slot, nritems - slot);
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
btrfs_node_key_ptr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
+ if (level) {
+ ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+ MOD_LOG_KEY_ADD, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
btrfs_set_node_key(lower, key, slot);
btrfs_set_node_blockptr(lower, slot, bytenr);
WARN_ON(trans->transid == 0);
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
btrfs_set_header_nritems(lower, nritems + 1);
btrfs_mark_buffer_dirty(lower);
- return 0;
}
/*
@@ -2202,13 +3470,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
int mid;
int ret;
- int wret;
u32 c_nritems;
c = path->nodes[level];
WARN_ON(btrfs_header_generation(c) != trans->transid);
if (c == root->node) {
- /* trying to split the root, lets make a new one */
+ /*
+ * trying to split the root, let's make a new one
+ *
+ * tree mod log: we don't log removal of the old root in
+ * insert_new_root, because that root buffer will be kept as a
+ * normal node. We are going to log removal of half of the
+ * elements below with tree_mod_log_eb_copy. We're holding a
+ * tree lock on the buffer, which is why we cannot race with
+ * other tree_mod_log users.
+ */
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
return ret;
@@ -2241,13 +3517,17 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(split, root->root_key.objectid);
write_extent_buffer(split, root->fs_info->fsid,
- (unsigned long)btrfs_header_fsid(split),
- BTRFS_FSID_SIZE);
+ btrfs_header_fsid(), BTRFS_FSID_SIZE);
write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
- (unsigned long)btrfs_header_chunk_tree_uuid(split),
+ btrfs_header_chunk_tree_uuid(split),
BTRFS_UUID_SIZE);
-
+ ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
+ mid, c_nritems - mid);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
copy_extent_buffer(split, c,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(mid),
@@ -2259,11 +3539,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
- wret = insert_ptr(trans, root, path, &disk_key, split->start,
- path->slots[level + 1] + 1,
- level + 1);
- if (wret)
- ret = wret;
+ insert_ptr(trans, root, path, &disk_key, split->start,
+ path->slots[level + 1] + 1, level + 1);
if (path->slots[level] >= mid) {
path->slots[level] -= mid;
@@ -2285,14 +3562,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
+ struct btrfs_item *start_item;
+ struct btrfs_item *end_item;
+ struct btrfs_map_token token;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- data_len = btrfs_item_end_nr(l, start);
- data_len = data_len - btrfs_item_offset_nr(l, end);
+ btrfs_init_map_token(&token);
+ start_item = btrfs_item_nr(start);
+ end_item = btrfs_item_nr(end);
+ data_len = btrfs_token_item_offset(l, start_item, &token) +
+ btrfs_token_item_size(l, start_item, &token);
+ data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -2310,8 +3594,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
int ret;
ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
if (ret < 0) {
- printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
- "used %d nritems %d\n",
+ btrfs_crit(root->fs_info,
+ "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
leaf_space_used(leaf, 0, nritems), nritems);
}
@@ -2332,6 +3616,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
+ struct btrfs_map_token token;
struct btrfs_disk_key disk_key;
int slot;
u32 i;
@@ -2343,6 +3628,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
u32 data_end;
u32 this_item_size;
+ btrfs_init_map_token(&token);
+
if (empty)
nr = 0;
else
@@ -2354,7 +3641,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
- item = btrfs_item_nr(left, i);
+ item = btrfs_item_nr(i);
if (!empty && push_items > 0) {
if (path->slots[0] > i)
@@ -2369,14 +3656,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (path->slots[0] == i)
push_space += data_size;
- if (!left->map_token) {
- map_extent_buffer(left, (unsigned long)item,
- sizeof(struct btrfs_item),
- &left->map_token, &left->kaddr,
- &left->map_start, &left->map_len,
- KM_USER1);
- }
-
this_item_size = btrfs_item_size(left, item);
if (this_item_size + sizeof(*item) + push_space > free_space)
break;
@@ -2387,16 +3666,11 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
break;
i--;
}
- if (left->map_token) {
- unmap_extent_buffer(left, left->map_token, KM_USER1);
- left->map_token = NULL;
- }
if (push_items == 0)
goto out_unlock;
- if (!empty && push_items == left_nritems)
- WARN_ON(1);
+ WARN_ON(!empty && push_items == left_nritems);
/* push left to right */
right_nritems = btrfs_header_nritems(right);
@@ -2431,22 +3705,11 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(root);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(right, i);
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
- push_space -= btrfs_item_size(right, item);
- btrfs_set_item_offset(right, item, push_space);
+ item = btrfs_item_nr(i);
+ push_space -= btrfs_token_item_size(right, item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
}
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
- }
left_nritems -= push_items;
btrfs_set_header_nritems(left, left_nritems);
@@ -2540,6 +3803,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
+ if (path->slots[0] == left_nritems && !empty) {
+ /* Key greater than all keys in the leaf, right neighbor has
+ * enough room for it and we're not emptying our leaf to delete
+ * it, therefore use right neighbor to insert the new item and
+ * no need to touch/dirty our left leaf. */
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ path->slots[1]++;
+ return 0;
+ }
+
return __push_leaf_right(trans, root, path, min_data_size, empty,
right, free_space, left_nritems, min_slot);
out_unlock:
@@ -2572,9 +3848,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
u32 old_left_nritems;
u32 nr;
int ret = 0;
- int wret;
u32 this_item_size;
u32 old_left_item_size;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
if (empty)
nr = min(right_nritems, max_slot);
@@ -2582,14 +3860,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
nr = min(right_nritems - 1, max_slot);
for (i = 0; i < nr; i++) {
- item = btrfs_item_nr(right, i);
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
+ item = btrfs_item_nr(i);
if (!empty && push_items > 0) {
if (path->slots[0] < i)
@@ -2612,17 +3883,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
push_space += this_item_size + sizeof(*item);
}
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
- }
-
if (push_items == 0) {
ret = 1;
goto out;
}
- if (!empty && push_items == btrfs_header_nritems(right))
- WARN_ON(1);
+ WARN_ON(!empty && push_items == btrfs_header_nritems(right));
/* push data from right to left */
copy_extent_buffer(left, right,
@@ -2645,31 +3910,19 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- item = btrfs_item_nr(left, i);
- if (!left->map_token) {
- map_extent_buffer(left, (unsigned long)item,
- sizeof(struct btrfs_item),
- &left->map_token, &left->kaddr,
- &left->map_start, &left->map_len,
- KM_USER1);
- }
+ item = btrfs_item_nr(i);
- ioff = btrfs_item_offset(left, item);
- btrfs_set_item_offset(left, item,
- ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
+ ioff = btrfs_token_item_offset(left, item, &token);
+ btrfs_set_token_item_offset(left, item,
+ ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
+ &token);
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
- if (left->map_token) {
- unmap_extent_buffer(left, left->map_token, KM_USER1);
- left->map_token = NULL;
- }
/* fixup right node */
- if (push_items > right_nritems) {
- printk(KERN_CRIT "push items %d nr %u\n", push_items,
+ if (push_items > right_nritems)
+ WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
right_nritems);
- WARN_ON(1);
- }
if (push_items < right_nritems) {
push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -2688,22 +3941,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(root);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(right, i);
-
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
+ item = btrfs_item_nr(i);
- push_space = push_space - btrfs_item_size(right, item);
- btrfs_set_item_offset(right, item, push_space);
- }
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
+ push_space = push_space - btrfs_token_item_size(right,
+ item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
}
btrfs_mark_buffer_dirty(left);
@@ -2713,9 +3955,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
clean_tree_block(trans, root, right);
btrfs_item_key(right, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path, &disk_key, 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(root, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -2786,7 +4026,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
path->nodes[1], slot - 1, &left);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
- ret = 1;
+ if (ret == -ENOSPC)
+ ret = 1;
goto out;
}
@@ -2808,22 +4049,21 @@ out:
/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
- *
- * returns 0 if all went well and < 0 on failure.
*/
-static noinline int copy_for_split(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *l,
- struct extent_buffer *right,
- int slot, int mid, int nritems)
+static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
{
int data_copy_size;
int rt_data_off;
int i;
- int ret = 0;
- int wret;
struct btrfs_disk_key disk_key;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
@@ -2842,33 +4082,18 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
btrfs_item_end_nr(l, mid);
for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(right, i);
+ struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
-
- ioff = btrfs_item_offset(right, item);
- btrfs_set_item_offset(right, item, ioff + rt_data_off);
- }
-
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
+ ioff = btrfs_token_item_offset(right, item, &token);
+ btrfs_set_token_item_offset(right, item,
+ ioff + rt_data_off, &token);
}
btrfs_set_header_nritems(l, mid);
- ret = 0;
btrfs_item_key(right, &disk_key, 0);
- wret = insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
btrfs_mark_buffer_dirty(right);
btrfs_mark_buffer_dirty(l);
@@ -2886,8 +4111,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
}
BUG_ON(path->slots[0] < 0);
-
- return ret;
}
/*
@@ -2909,14 +4132,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
int progress = 0;
int slot;
u32 nritems;
+ int space_needed = data_size;
slot = path->slots[0];
+ if (slot < btrfs_header_nritems(path->nodes[0]))
+ space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
/*
* try to push all the items after our slot into the
* right leaf
*/
- ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
+ ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
if (ret < 0)
return ret;
@@ -2936,7 +4162,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
/* try to push all the items before our slot into the next leaf */
slot = path->slots[0];
- ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
+ ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
if (ret < 0)
return ret;
@@ -2979,14 +4205,19 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
return -EOVERFLOW;
/* first try to make some room by pushing left and right */
- if (data_size) {
- wret = push_leaf_right(trans, root, path, data_size,
- data_size, 0, 0);
+ if (data_size && path->nodes[1]) {
+ int space_needed = data_size;
+
+ if (slot < btrfs_header_nritems(l))
+ space_needed -= btrfs_leaf_free_space(root, l);
+
+ wret = push_leaf_right(trans, root, path, space_needed,
+ space_needed, 0, 0);
if (wret < 0)
return wret;
if (wret) {
- wret = push_leaf_left(trans, root, path, data_size,
- data_size, 0, (u32)-1);
+ wret = push_leaf_left(trans, root, path, space_needed,
+ space_needed, 0, (u32)-1);
if (wret < 0)
return wret;
}
@@ -3040,7 +4271,7 @@ again:
data_size > BTRFS_LEAF_DATA_SIZE(root)) {
if (data_size && !tried_avoid_double)
goto push_for_double;
- split = 2 ;
+ split = 2;
}
}
}
@@ -3066,22 +4297,17 @@ again:
btrfs_set_header_owner(right, root->root_key.objectid);
btrfs_set_header_level(right, 0);
write_extent_buffer(right, root->fs_info->fsid,
- (unsigned long)btrfs_header_fsid(right),
- BTRFS_FSID_SIZE);
+ btrfs_header_fsid(), BTRFS_FSID_SIZE);
write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
- (unsigned long)btrfs_header_chunk_tree_uuid(right),
+ btrfs_header_chunk_tree_uuid(right),
BTRFS_UUID_SIZE);
if (split == 0) {
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
- wret = insert_ptr(trans, root, path,
- &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
-
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3089,29 +4315,20 @@ again:
path->slots[1] += 1;
} else {
btrfs_set_header_nritems(right, 0);
- wret = insert_ptr(trans, root, path,
- &disk_key,
- right->start,
+ insert_ptr(trans, root, path, &disk_key, right->start,
path->slots[1], 1);
- if (wret)
- ret = wret;
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
path->slots[0] = 0;
- if (path->slots[1] == 0) {
- wret = fixup_low_keys(trans, root,
- path, &disk_key, 1);
- if (wret)
- ret = wret;
- }
+ if (path->slots[1] == 0)
+ fixup_low_keys(root, path, &disk_key, 1);
}
btrfs_mark_buffer_dirty(right);
return ret;
}
- ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
- BUG_ON(ret);
+ copy_for_split(trans, root, path, l, right, slot, mid, nritems);
if (split == 2) {
BUG_ON(num_doubles != 0);
@@ -3119,7 +4336,7 @@ again:
goto again;
}
- return ret;
+ return 0;
push_for_double:
push_for_double_split(trans, root, path, data_size);
@@ -3155,7 +4372,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item);
extent_len = btrfs_file_extent_num_bytes(leaf, fi);
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
path->keep_locks = 1;
path->search_for_split = 1;
@@ -3215,7 +4432,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(path);
- item = btrfs_item_nr(leaf, path->slots[0]);
+ item = btrfs_item_nr(path->slots[0]);
orig_offset = btrfs_item_offset(leaf, item);
item_size = btrfs_item_size(leaf, item);
@@ -3238,7 +4455,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(leaf, &disk_key, slot);
- new_item = btrfs_item_nr(leaf, slot);
+ new_item = btrfs_item_nr(slot);
btrfs_set_item_offset(leaf, new_item, orig_offset);
btrfs_set_item_size(leaf, new_item, item_size - split_offset);
@@ -3321,11 +4538,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
- BUG_ON(ret);
-
+ setup_items_for_insert(root, path, new_key, &item_size,
+ item_size, item_size +
+ sizeof(struct btrfs_item), 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -3340,12 +4555,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
+ u32 new_size, int from_end)
{
- int ret = 0;
int slot;
struct extent_buffer *leaf;
struct btrfs_item *item;
@@ -3355,13 +4567,16 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
unsigned int old_size;
unsigned int size_diff;
int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
slot = path->slots[0];
old_size = btrfs_item_size_nr(leaf, slot);
if (old_size == new_size)
- return 0;
+ return;
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(root, leaf);
@@ -3379,23 +4594,11 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
/* first correct the data pointers */
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(leaf, i);
-
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
+ item = btrfs_item_nr(i);
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff + size_diff);
- }
-
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + size_diff, &token);
}
/* shift the data */
@@ -3436,29 +4639,25 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
btrfs_set_item_key(leaf, &disk_key, slot);
if (slot == 0)
- fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(root, path, &disk_key, 1);
}
- item = btrfs_item_nr(leaf, slot);
+ item = btrfs_item_nr(slot);
btrfs_set_item_size(leaf, item, new_size);
btrfs_mark_buffer_dirty(leaf);
- ret = 0;
if (btrfs_leaf_free_space(root, leaf) < 0) {
btrfs_print_leaf(root, leaf);
BUG();
}
- return ret;
}
/*
- * make the item pointed to by the path bigger, data_size is the new size.
+ * make the item pointed to by the path bigger, data_size is the added size.
*/
-int btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- u32 data_size)
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size)
{
- int ret = 0;
int slot;
struct extent_buffer *leaf;
struct btrfs_item *item;
@@ -3467,6 +4666,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
unsigned int old_data;
unsigned int old_size;
int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
@@ -3483,7 +4685,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
BUG_ON(slot < 0);
if (slot >= nritems) {
btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d too large, nritems %d\n",
+ btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
slot, nritems);
BUG_ON(1);
}
@@ -3494,22 +4696,11 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
/* first correct the data pointers */
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(leaf, i);
-
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - data_size);
- }
+ item = btrfs_item_nr(i);
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - data_size, &token);
}
/* shift the data */
@@ -3519,168 +4710,14 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
data_end = old_data;
old_size = btrfs_item_size_nr(leaf, slot);
- item = btrfs_item_nr(leaf, slot);
+ item = btrfs_item_nr(slot);
btrfs_set_item_size(leaf, item, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
- ret = 0;
if (btrfs_leaf_free_space(root, leaf) < 0) {
btrfs_print_leaf(root, leaf);
BUG();
}
- return ret;
-}
-
-/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
- * Returns the number of keys that were inserted.
- */
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
-{
- struct extent_buffer *leaf;
- struct btrfs_item *item;
- int ret = 0;
- int slot;
- int i;
- u32 nritems;
- u32 total_data = 0;
- u32 total_size = 0;
- unsigned int data_end;
- struct btrfs_disk_key disk_key;
- struct btrfs_key found_key;
-
- for (i = 0; i < nr; i++) {
- if (total_size + data_size[i] + sizeof(struct btrfs_item) >
- BTRFS_LEAF_DATA_SIZE(root)) {
- break;
- nr = i;
- }
- total_data += data_size[i];
- total_size += data_size[i] + sizeof(struct btrfs_item);
- }
- BUG_ON(nr == 0);
-
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
- if (ret == 0)
- return -EEXIST;
- if (ret < 0)
- goto out;
-
- leaf = path->nodes[0];
-
- nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(root, leaf);
-
- if (btrfs_leaf_free_space(root, leaf) < total_size) {
- for (i = nr; i >= 0; i--) {
- total_data -= data_size[i];
- total_size -= data_size[i] + sizeof(struct btrfs_item);
- if (total_size < btrfs_leaf_free_space(root, leaf))
- break;
- }
- nr = i;
- }
-
- slot = path->slots[0];
- BUG_ON(slot < 0);
-
- if (slot != nritems) {
- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
-
- item = btrfs_item_nr(leaf, slot);
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
- /* figure out how many keys we can insert in here */
- total_data = data_size[0];
- for (i = 1; i < nr; i++) {
- if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
- break;
- total_data += data_size[i];
- }
- nr = i;
-
- if (old_data < data_end) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
- slot, old_data, data_end);
- BUG_ON(1);
- }
- /*
- * item0..itemN ... dataN.offset..dataN.size .. data0.size
- */
- /* first correct the data pointers */
- WARN_ON(leaf->map_token);
- for (i = slot; i < nritems; i++) {
- u32 ioff;
-
- item = btrfs_item_nr(leaf, i);
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
-
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - total_data);
- }
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
- }
-
- /* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
- btrfs_item_nr_offset(slot),
- (nritems - slot) * sizeof(struct btrfs_item));
-
- /* shift the data */
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end - total_data, btrfs_leaf_data(leaf) +
- data_end, old_data - data_end);
- data_end = old_data;
- } else {
- /*
- * this sucks but it has to be done, if we are inserting at
- * the end of the leaf only insert 1 of the items, since we
- * have no way of knowing whats on the next leaf and we'd have
- * to drop our current locks to figure it out
- */
- nr = 1;
- }
-
- /* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
- btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
- data_end -= data_size[i];
- btrfs_set_item_size(leaf, item, data_size[i]);
- }
- btrfs_set_header_nritems(leaf, nritems + nr);
- btrfs_mark_buffer_dirty(leaf);
-
- ret = 0;
- if (slot == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- ret = fixup_low_keys(trans, root, path, &disk_key, 1);
- }
-
- if (btrfs_leaf_free_space(root, leaf) < 0) {
- btrfs_print_leaf(root, leaf);
- BUG();
- }
-out:
- if (!ret)
- ret = nr;
- return ret;
}
/*
@@ -3688,20 +4725,20 @@ out:
* to save stack depth by doing the bulk of the work in a function
* that doesn't call btrfs_search_slot
*/
-static noinline_for_stack int
-setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
{
struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
struct btrfs_disk_key disk_key;
- int ret;
struct extent_buffer *leaf;
int slot;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
slot = path->slots[0];
@@ -3711,7 +4748,7 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,
if (btrfs_leaf_free_space(root, leaf) < total_size) {
btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "not enough freespace need %u have %d\n",
+ btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
total_size, btrfs_leaf_free_space(root, leaf));
BUG();
}
@@ -3721,7 +4758,7 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,
if (old_data < data_end) {
btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
+ btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
slot, old_data, data_end);
BUG_ON(1);
}
@@ -3729,27 +4766,14 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- WARN_ON(leaf->map_token);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(leaf, i);
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
-
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff - total_data);
+ item = btrfs_item_nr(i);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - total_data, &token);
}
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
- }
-
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
btrfs_item_nr_offset(slot),
@@ -3766,19 +4790,18 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,
for (i = 0; i < nr; i++) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ item = btrfs_item_nr(slot + i);
+ btrfs_set_token_item_offset(leaf, item,
+ data_end - data_size[i], &token);
data_end -= data_size[i];
- btrfs_set_item_size(leaf, item, data_size[i]);
+ btrfs_set_token_item_size(leaf, item, data_size[i], &token);
}
btrfs_set_header_nritems(leaf, nritems + nr);
- ret = 0;
if (slot == 0) {
- struct btrfs_disk_key disk_key;
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ fixup_low_keys(root, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(leaf);
@@ -3787,7 +4810,6 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_print_leaf(root, leaf);
BUG();
}
- return ret;
}
/*
@@ -3814,16 +4836,14 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
if (ret == 0)
return -EEXIST;
if (ret < 0)
- goto out;
+ return ret;
slot = path->slots[0];
BUG_ON(slot < 0);
- ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+ setup_items_for_insert(root, path, cpu_key, data_size,
total_data, total_size, nr);
-
-out:
- return ret;
+ return 0;
}
/*
@@ -3840,7 +4860,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
unsigned long ptr;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
if (!ret) {
leaf = path->nodes[0];
@@ -3858,22 +4879,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* the tree should have been previously balanced so the deletion does not
* empty a node.
*/
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot)
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+ int level, int slot)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
- int ret = 0;
- int wret;
+ int ret;
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
+ if (level)
+ tree_mod_log_eb_move(root->fs_info, parent, slot,
+ slot + 1, nritems - slot - 1);
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(slot),
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
+ } else if (level) {
+ ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+ MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ BUG_ON(ret < 0);
}
+
nritems--;
btrfs_set_header_nritems(parent, nritems);
if (nritems == 0 && parent == root->node) {
@@ -3884,12 +4912,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(root, path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
- return ret;
}
/*
@@ -3902,17 +4927,13 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* The path must have already been setup for deleting the leaf, including
* all the proper balancing. path->nodes[1] must be locked.
*/
-static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *leaf)
+static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf)
{
- int ret;
-
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- ret = del_ptr(trans, root, path, 1, path->slots[1]);
- if (ret)
- return ret;
+ del_ptr(root, path, 1, path->slots[1]);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -3922,8 +4943,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
+ extent_buffer_get(leaf);
btrfs_free_tree_block(trans, root, leaf, 0, 1);
- return 0;
+ free_extent_buffer_stale(leaf);
}
/*
* delete the item at the leaf level in path. If that empties
@@ -3940,6 +4962,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int wret;
int i;
u32 nritems;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
leaf = path->nodes[0];
last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
@@ -3960,21 +4985,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(leaf, i);
- if (!leaf->map_token) {
- map_extent_buffer(leaf, (unsigned long)item,
- sizeof(struct btrfs_item),
- &leaf->map_token, &leaf->kaddr,
- &leaf->map_start, &leaf->map_len,
- KM_USER1);
- }
- ioff = btrfs_item_offset(leaf, item);
- btrfs_set_item_offset(leaf, item, ioff + dsize);
- }
-
- if (leaf->map_token) {
- unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
- leaf->map_token = NULL;
+ item = btrfs_item_nr(i);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + dsize, &token);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -3992,8 +5006,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
} else {
btrfs_set_path_blocking(path);
clean_tree_block(trans, root, leaf);
- ret = btrfs_del_leaf(trans, root, path, leaf);
- BUG_ON(ret);
+ btrfs_del_leaf(trans, root, path, leaf);
}
} else {
int used = leaf_space_used(leaf, 0, nritems);
@@ -4001,10 +5014,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- wret = fixup_low_keys(trans, root, path,
- &disk_key, 1);
- if (wret)
- ret = wret;
+ fixup_low_keys(root, path, &disk_key, 1);
}
/* delete the leaf if it is mostly empty */
@@ -4032,9 +5042,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (btrfs_header_nritems(leaf) == 0) {
path->slots[1] = slot;
- ret = btrfs_del_leaf(trans, root, path, leaf);
- BUG_ON(ret);
+ btrfs_del_leaf(trans, root, path, leaf);
free_extent_buffer(leaf);
+ ret = 0;
} else {
/* if we're still in the path, make sure
* we're dirty. Otherwise, one of the
@@ -4068,30 +5078,44 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
- if (key.offset > 0)
+ if (key.offset > 0) {
key.offset--;
- else if (key.type > 0)
+ } else if (key.type > 0) {
key.type--;
- else if (key.objectid > 0)
+ key.offset = (u64)-1;
+ } else if (key.objectid > 0) {
key.objectid--;
- else
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+ } else {
return 1;
+ }
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
btrfs_item_key(path->nodes[0], &found_key, 0);
ret = comp_keys(&found_key, &key);
- if (ret < 0)
+ /*
+ * We might have had an item with the previous key in the tree right
+ * before we released our path. And after we released our path, that
+ * item might have been pushed to the first slot (0) of the leaf we
+ * were holding due to a tree balance. Alternatively, an item with the
+ * previous key can exist as the only element of a leaf (big fat item).
+ * Therefore account for these 2 cases, so that our callers (like
+ * btrfs_previous_item) don't miss an existing item with a key matching
+ * the previous key we computed above.
+ */
+ if (ret <= 0)
return 0;
return 1;
}
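Editorial aside (not part of the patch): a worked example of the key decrement added above, with illustrative values. Starting from (256, BTRFS_DIR_ITEM_KEY, 0), the offset is already zero, so the type branch runs:

	struct btrfs_key key = {
		.objectid = 256,
		.type = BTRFS_DIR_ITEM_KEY,
		.offset = 0,
	};
	key.type--;		/* one key type lower */
	key.offset = (u64)-1;	/* largest offset that lower type can carry */
	/* the result is the largest key that sorts strictly before every
	 * (256, BTRFS_DIR_ITEM_KEY, *) key, which is what the search needs */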
/*
* A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id. This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
*
* This does not cow, but it does stuff the starting key it finds back
* into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4111,8 +5135,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
* was nothing in the tree that matched the search criteria.
*/
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
- struct btrfs_key *max_key,
- struct btrfs_path *path, int cache_only,
+ struct btrfs_path *path,
u64 min_trans)
{
struct extent_buffer *cur;
@@ -4125,11 +5148,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
WARN_ON(!path->keep_locks);
again:
- cur = btrfs_lock_root_node(root);
+ cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
WARN_ON(path->nodes[level]);
path->nodes[level] = cur;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_READ_LOCK;
if (btrfs_header_generation(cur) < min_trans) {
ret = 1;
@@ -4152,43 +5175,18 @@ again:
if (sret && slot > 0)
slot--;
/*
- * check this node pointer against the cache_only and
- * min_trans parameters. If it isn't in cache or is too
- * old, skip to the next one.
+ * check this node pointer against the min_trans parameter.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
- u64 blockptr;
u64 gen;
- struct extent_buffer *tmp;
- struct btrfs_disk_key disk_key;
- blockptr = btrfs_node_blockptr(cur, slot);
gen = btrfs_node_ptr_generation(cur, slot);
if (gen < min_trans) {
slot++;
continue;
}
- if (!cache_only)
- break;
-
- if (max_key) {
- btrfs_node_key(cur, &disk_key, slot);
- if (comp_keys(&disk_key, max_key) >= 0) {
- ret = 1;
- goto out;
- }
- }
-
- tmp = btrfs_find_tree_block(root, blockptr,
- btrfs_level_size(root, level - 1));
-
- if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
- free_extent_buffer(tmp);
- break;
- }
- if (tmp)
- free_extent_buffer(tmp);
- slot++;
+ break;
}
find_next_key:
/*
@@ -4199,9 +5197,9 @@ find_next_key:
path->slots[level] = slot;
btrfs_set_path_blocking(path);
sret = btrfs_find_next_key(root, path, min_key, level,
- cache_only, min_trans);
+ min_trans);
if (sret == 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
} else {
goto out;
@@ -4212,18 +5210,19 @@ find_next_key:
path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
- unlock_up(path, level, 1);
+ unlock_up(path, level, 1, 0, NULL);
goto out;
}
btrfs_set_path_blocking(path);
cur = read_node_slot(root, cur, slot);
+ BUG_ON(!cur); /* -ENOMEM */
- btrfs_tree_lock(cur);
+ btrfs_tree_read_lock(cur);
- path->locks[level - 1] = 1;
+ path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
- unlock_up(path, level, 1);
- btrfs_clear_path_blocking(path, NULL);
+ unlock_up(path, level, 1, 0, NULL);
+ btrfs_clear_path_blocking(path, NULL, 0);
}
out:
if (ret == 0)
@@ -4232,11 +5231,362 @@ out:
return ret;
}
+static void tree_move_down(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int root_level)
+{
+ BUG_ON(*level == 0);
+ path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
+ path->slots[*level]);
+ path->slots[*level - 1] = 0;
+ (*level)--;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int root_level)
+{
+ int ret = 0;
+ int nritems;
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+
+ path->slots[*level]++;
+
+ while (path->slots[*level] >= nritems) {
+ if (*level == root_level)
+ return -1;
+
+ /* move upnext */
+ path->slots[*level] = 0;
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ (*level)++;
+ path->slots[*level]++;
+
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next, 0 if it moved only next or down,
+ * and < 0 if the end of the tree (the root level) was reached.
+ */
+static int tree_advance(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int root_level,
+ int allow_down,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ if (*level == 0 || !allow_down) {
+ ret = tree_move_next_or_upnext(root, path, level, root_level);
+ } else {
+ tree_move_down(root, path, level, root_level);
+ ret = 0;
+ }
+ if (ret >= 0) {
+ if (*level == 0)
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ else
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ }
+ return ret;
+}
+
+static int tree_compare_item(struct btrfs_root *left_root,
+ struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ char *tmp_buf)
+{
+ int cmp;
+ int len1, len2;
+ unsigned long off1, off2;
+
+ len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+ len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+ if (len1 != len2)
+ return 1;
+
+ off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+ off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+ right_path->slots[0]);
+
+ read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+ cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+ if (cmp)
+ return 1;
+ return 0;
+}
+
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ btrfs_changed_cb_t changed_cb, void *ctx)
+{
+ int ret;
+ int cmp;
+ struct btrfs_path *left_path = NULL;
+ struct btrfs_path *right_path = NULL;
+ struct btrfs_key left_key;
+ struct btrfs_key right_key;
+ char *tmp_buf = NULL;
+ int left_root_level;
+ int right_root_level;
+ int left_level;
+ int right_level;
+ int left_end_reached;
+ int right_end_reached;
+ int advance_left;
+ int advance_right;
+ u64 left_blockptr;
+ u64 right_blockptr;
+ u64 left_gen;
+ u64 right_gen;
+
+ left_path = btrfs_alloc_path();
+ if (!left_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ right_path = btrfs_alloc_path();
+ if (!right_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+ if (!tmp_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ left_path->search_commit_root = 1;
+ left_path->skip_locking = 1;
+ right_path->search_commit_root = 1;
+ right_path->skip_locking = 1;
+
+ /*
+ * Strategy: Go to the first items of both trees. Then do
+ *
+ * If both trees are at level 0
+ * Compare keys of current items
+ * If left < right treat left item as new, advance left tree
+ * and repeat
+ * If left > right treat right item as deleted, advance right tree
+ * and repeat
+ * If left == right do deep compare of items, treat as changed if
+ * needed, advance both trees and repeat
+ * If both trees are at the same level but not at level 0
+ * Compare keys of current nodes/leafs
+ * If left < right advance left tree and repeat
+ * If left > right advance right tree and repeat
+ * If left == right compare blockptrs of the next nodes/leafs
+ * If they match advance both trees but stay at the same level
+ * and repeat
+ * If they don't match advance both trees while allowing to go
+ * deeper and repeat
+ * If tree levels are different
+ * Advance the tree that needs it and repeat
+ *
+ * Advancing a tree means:
+ * If we are at level 0, try to go to the next slot. If that's not
+ * possible, go one level up and repeat. Stop when we find a level
+ * where we could go to the next slot. We may at this point be on a
+ * node or a leaf.
+ *
+ * If we are not at level 0 and not on shared tree blocks, go one
+ * level deeper.
+ *
+ * If we are not at level 0 and on shared tree blocks, go one slot to
+ * the right if possible or go up and right.
+ */
+
+ down_read(&left_root->fs_info->commit_root_sem);
+ left_level = btrfs_header_level(left_root->commit_root);
+ left_root_level = left_level;
+ left_path->nodes[left_level] = left_root->commit_root;
+ extent_buffer_get(left_path->nodes[left_level]);
+
+ right_level = btrfs_header_level(right_root->commit_root);
+ right_root_level = right_level;
+ right_path->nodes[right_level] = right_root->commit_root;
+ extent_buffer_get(right_path->nodes[right_level]);
+ up_read(&left_root->fs_info->commit_root_sem);
+
+ if (left_level == 0)
+ btrfs_item_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ else
+ btrfs_node_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ if (right_level == 0)
+ btrfs_item_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+ else
+ btrfs_node_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+
+ left_end_reached = right_end_reached = 0;
+ advance_left = advance_right = 0;
+
+ while (1) {
+ if (advance_left && !left_end_reached) {
+ ret = tree_advance(left_root, left_path, &left_level,
+ left_root_level,
+ advance_left != ADVANCE_ONLY_NEXT,
+ &left_key);
+ if (ret < 0)
+ left_end_reached = ADVANCE;
+ advance_left = 0;
+ }
+ if (advance_right && !right_end_reached) {
+ ret = tree_advance(right_root, right_path, &right_level,
+ right_root_level,
+ advance_right != ADVANCE_ONLY_NEXT,
+ &right_key);
+ if (ret < 0)
+ right_end_reached = ADVANCE;
+ advance_right = 0;
+ }
+
+ if (left_end_reached && right_end_reached) {
+ ret = 0;
+ goto out;
+ } else if (left_end_reached) {
+ if (right_level == 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_right = ADVANCE;
+ continue;
+ } else if (right_end_reached) {
+ if (left_level == 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_left = ADVANCE;
+ continue;
+ }
+
+ if (left_level == 0 && right_level == 0) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_right = ADVANCE;
+ } else {
+ enum btrfs_compare_tree_result cmp;
+
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+ ret = tree_compare_item(left_root, left_path,
+ right_path, tmp_buf);
+ if (ret)
+ cmp = BTRFS_COMPARE_TREE_CHANGED;
+ else
+ cmp = BTRFS_COMPARE_TREE_SAME;
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key, cmp, ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ } else if (left_level == right_level) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ advance_right = ADVANCE;
+ } else {
+ left_blockptr = btrfs_node_blockptr(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_blockptr = btrfs_node_blockptr(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ left_gen = btrfs_node_ptr_generation(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_gen = btrfs_node_ptr_generation(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr &&
+ left_gen == right_gen) {
+ /*
+ * As we're on a shared block, don't
+ * allow to go deeper.
+ */
+ advance_left = ADVANCE_ONLY_NEXT;
+ advance_right = ADVANCE_ONLY_NEXT;
+ } else {
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ }
+ } else if (left_level < right_level) {
+ advance_right = ADVANCE;
+ } else {
+ advance_left = ADVANCE;
+ }
+ }
+
+out:
+ btrfs_free_path(left_path);
+ btrfs_free_path(right_path);
+ kfree(tmp_buf);
+ return ret;
+}
+
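For orientation, here is a minimal sketch of how a caller might drive btrfs_compare_trees(). The callback parameter list is inferred from the call sites above; the compare_ctx structure, count_changes() and the two root pointers are hypothetical names, not part of this patch.

struct compare_ctx {
        u64 new_items;
        u64 deleted_items;
        u64 changed_items;
};

static int count_changes(struct btrfs_root *left_root,
                         struct btrfs_root *right_root,
                         struct btrfs_path *left_path,
                         struct btrfs_path *right_path,
                         struct btrfs_key *key,
                         enum btrfs_compare_tree_result result,
                         void *ctx)
{
        struct compare_ctx *c = ctx;

        /* tally the kind of difference reported for 'key' */
        if (result == BTRFS_COMPARE_TREE_NEW)
                c->new_items++;
        else if (result == BTRFS_COMPARE_TREE_DELETED)
                c->deleted_items++;
        else if (result == BTRFS_COMPARE_TREE_CHANGED)
                c->changed_items++;

        return 0;       /* a negative return would abort the walk */
}

        /* e.g. when diffing a snapshot against its parent subvolume: */
        struct compare_ctx c = {};
        ret = btrfs_compare_trees(parent_root, snap_root, count_changes, &c);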
/*
* this is similar to btrfs_next_leaf, but does not try to preserve
* and fixup the path. It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameters.
*
* 0 is returned if another key is found, < 0 if there are any errors
* and 1 is returned if there are no higher keys in the tree
@@ -4245,8 +5595,7 @@ out:
* calling this function.
*/
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *key, int level,
- int cache_only, u64 min_trans)
+ struct btrfs_key *key, int level, u64 min_trans)
{
int slot;
struct extent_buffer *c;
@@ -4279,7 +5628,7 @@ next:
btrfs_node_key_to_cpu(c, &cur_key, slot);
orig_lowest = path->lowest_level;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &cur_key, path,
0, 0);
@@ -4297,21 +5646,8 @@ next:
if (level == 0)
btrfs_item_key_to_cpu(c, key, slot);
else {
- u64 blockptr = btrfs_node_blockptr(c, slot);
u64 gen = btrfs_node_ptr_generation(c, slot);
- if (cache_only) {
- struct extent_buffer *cur;
- cur = btrfs_find_tree_block(root, blockptr,
- btrfs_level_size(root, level - 1));
- if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
- slot++;
- if (cur)
- free_extent_buffer(cur);
- goto next;
- }
- free_extent_buffer(cur);
- }
if (gen < min_trans) {
slot++;
goto next;
@@ -4330,6 +5666,12 @@ next:
*/
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
+ return btrfs_next_old_leaf(root, path, 0);
+}
+
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq)
+{
int slot;
int level;
struct extent_buffer *c;
@@ -4338,32 +5680,26 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
u32 nritems;
int ret;
int old_spinning = path->leave_spinning;
- int force_blocking = 0;
+ int next_rw_lock = 0;
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
return 1;
- /*
- * we take the blocks in an order that upsets lockdep. Using
- * blocking mode is the only way around it.
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- force_blocking = 1;
-#endif
-
btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
level = 1;
next = NULL;
- btrfs_release_path(root, path);
+ next_rw_lock = 0;
+ btrfs_release_path(path);
path->keep_locks = 1;
+ path->leave_spinning = 1;
- if (!force_blocking)
- path->leave_spinning = 1;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (time_seq)
+ ret = btrfs_search_old_slot(root, &key, path, time_seq);
+ else
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->keep_locks = 0;
if (ret < 0)
@@ -4382,6 +5718,24 @@ again:
ret = 0;
goto done;
}
+ /*
+ * The check above misses one case:
+ * - after releasing the path above, someone removed the item that
+ *   used to be at the very end of the block, and balancing between
+ *   leaves moved in another one with a bigger key.offset to replace it.
+ *
+ * That item should be returned as well, or we can get leaf corruption
+ * later (esp. in __btrfs_drop_extents()).
+ *
+ * A bit more explanation of this check: with ret > 0 the key wasn't
+ * found, the path points to the slot where it would be inserted, so
+ * the item at path->slots[0] must be the bigger one.
+ */
+ if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+ ret = 0;
+ goto done;
+ }
while (level < BTRFS_MAX_LEVEL) {
if (!path->nodes[level]) {
@@ -4401,31 +5755,44 @@ again:
}
if (next) {
- btrfs_tree_unlock(next);
+ btrfs_tree_unlock_rw(next, next_rw_lock);
free_extent_buffer(next);
}
next = c;
+ next_rw_lock = path->locks[level];
ret = read_block_for_search(NULL, root, path, &next, level,
- slot, &key);
+ slot, &key, 0);
if (ret == -EAGAIN)
goto again;
if (ret < 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto done;
}
if (!path->skip_locking) {
- ret = btrfs_try_spin_lock(next);
+ ret = btrfs_try_tree_read_lock(next);
+ if (!ret && time_seq) {
+ /*
+ * If we don't get the lock, we may be racing
+ * with push_leaf_left, which holds that lock
+ * while itself waiting for the leaf we currently
+ * have locked. To resolve this, we give up
+ * our lock and cycle.
+ */
+ free_extent_buffer(next);
+ btrfs_release_path(path);
+ cond_resched();
+ goto again;
+ }
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_lock(next);
- if (!force_blocking)
- btrfs_clear_path_blocking(path, next);
+ btrfs_tree_read_lock(next);
+ btrfs_clear_path_blocking(path, next,
+ BTRFS_READ_LOCK);
}
- if (force_blocking)
- btrfs_set_lock_blocking(next);
+ next_rw_lock = BTRFS_READ_LOCK;
}
break;
}
@@ -4434,43 +5801,40 @@ again:
level--;
c = path->nodes[level];
if (path->locks[level])
- btrfs_tree_unlock(c);
+ btrfs_tree_unlock_rw(c, path->locks[level]);
free_extent_buffer(c);
path->nodes[level] = next;
path->slots[level] = 0;
if (!path->skip_locking)
- path->locks[level] = 1;
-
+ path->locks[level] = next_rw_lock;
if (!level)
break;
ret = read_block_for_search(NULL, root, path, &next, level,
- 0, &key);
+ 0, &key, 0);
if (ret == -EAGAIN)
goto again;
if (ret < 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto done;
}
if (!path->skip_locking) {
- btrfs_assert_tree_locked(path->nodes[level]);
- ret = btrfs_try_spin_lock(next);
+ ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_lock(next);
- if (!force_blocking)
- btrfs_clear_path_blocking(path, next);
+ btrfs_tree_read_lock(next);
+ btrfs_clear_path_blocking(path, next,
+ BTRFS_READ_LOCK);
}
- if (force_blocking)
- btrfs_set_lock_blocking(next);
+ next_rw_lock = BTRFS_READ_LOCK;
}
}
ret = 0;
done:
- unlock_up(path, 0, 1);
+ unlock_up(path, 0, 1, 0, NULL);
path->leave_spinning = old_spinning;
if (!old_spinning)
btrfs_set_path_blocking(path);
@@ -4520,3 +5884,46 @@ int btrfs_previous_item(struct btrfs_root *root,
}
return 1;
}
+
+/*
+ * search the extent tree to find a previous Metadata/Data extent item with
+ * an objectid no smaller than min_objectid.
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_extent_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+
+ while (1) {
+ if (path->slots[0] == 0) {
+ btrfs_set_path_blocking(path);
+ ret = btrfs_prev_leaf(root, path);
+ if (ret != 0)
+ return ret;
+ } else {
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (nritems == 0)
+ return 1;
+ if (path->slots[0] == nritems)
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid < min_objectid)
+ break;
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ found_key.type == BTRFS_METADATA_ITEM_KEY)
+ return 0;
+ if (found_key.objectid == min_objectid &&
+ found_key.type < BTRFS_EXTENT_ITEM_KEY)
+ break;
+ }
+ return 1;
+}
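A minimal usage sketch for the helper above (the variables logical, extent_root and path are assumed to be set up by the caller and are not part of this patch): position the path just past a logical address in the extent tree, then step back to the Metadata/Data extent item at or before it.

        struct btrfs_key key;
        int ret;

        key.objectid = logical;                 /* hypothetical search address */
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret > 0)    /* no exact hit: walk back to the previous extent item */
                ret = btrfs_previous_extent_item(extent_root, path, 0);
        if (ret == 0)
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);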
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7f78cc78fdd..be91397f4e9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -19,16 +19,21 @@
#ifndef __BTRFS_CTREE__
#define __BTRFS_CTREE__
-#include <linux/version.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/fs.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
#include <linux/completion.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/kobject.h>
+#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
+#include <linux/pagemap.h>
+#include <linux/btrfs.h>
+#include <linux/workqueue.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -40,9 +45,18 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
+extern struct kmem_cache *btrfs_free_space_cachep;
struct btrfs_ordered_sum;
-#define BTRFS_MAGIC "_BHRfS_M"
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define STATIC noinline
+#else
+#define STATIC static noinline
+#endif
+
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
+
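The new constant is just the eight ASCII bytes of the old string magic read as a little-endian 64-bit value. A quick userspace sanity check, assuming a little-endian host (this snippet is illustration only, not part of the patch):

        #include <assert.h>
        #include <string.h>

        int main(void)
        {
                unsigned long long magic = 0x4D5F53665248425FULL;

                /* in-memory bytes are 5F 42 48 52 66 53 5F 4D: "_BHRfS_M" */
                assert(memcmp(&magic, "_BHRfS_M", 8) == 0);
                return 0;
        }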
+#define BTRFS_MAX_MIRRORS 3
#define BTRFS_MAX_LEVEL 8
@@ -82,6 +96,15 @@ struct btrfs_ordered_sum;
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
+/* for storing items that use the BTRFS_UUID_KEY* types */
+#define BTRFS_UUID_TREE_OBJECTID 9ULL
+
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
/* orphan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -103,6 +126,12 @@ struct btrfs_ordered_sum;
/* For storing free space cache */
#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+/*
+ * The inode number assigned to the special inode for storing
+ * free ino cache
+ */
+#define BTRFS_FREE_INO_OBJECTID -12ULL
+
/* dummy objectid represents multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
@@ -124,12 +153,27 @@ struct btrfs_ordered_sum;
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+#define BTRFS_DEV_REPLACE_DEVID 0ULL
+
+/*
+ * the max metadata block size. This limit is somewhat artificial,
+ * but the memmove costs go through the roof for larger blocks.
+ */
+#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
+
/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
*/
#define BTRFS_NAME_LEN 255
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
/* 32 bytes in various csum fields */
#define BTRFS_CSUM_SIZE 32
@@ -141,6 +185,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS (1 << 30)
+
#define BTRFS_FT_UNKNOWN 0
#define BTRFS_FT_REG_FILE 1
#define BTRFS_FT_DIR 2
@@ -152,6 +199,11 @@ static int btrfs_csum_sizes[] = { 4, 0 };
#define BTRFS_FT_XATTR 8
#define BTRFS_FT_MAX 9
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
+#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
@@ -185,7 +237,6 @@ struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
};
-#define BTRFS_UUID_SIZE 16
struct btrfs_dev_item {
/* the internal btrfs device id */
__le64 devid;
@@ -292,14 +343,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
sizeof(struct btrfs_stripe) * (num_stripes - 1);
}
-#define BTRFS_FSID_SIZE 16
#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
/*
* File system states
*/
+#define BTRFS_FS_STATE_ERROR 0
+#define BTRFS_FS_STATE_REMOUNTING 1
+#define BTRFS_FS_STATE_TRANS_ABORTED 2
+#define BTRFS_FS_STATE_DEV_REPLACING 3
+/* Super block flags */
/* Errors detected */
#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
@@ -353,6 +408,47 @@ struct btrfs_header {
#define BTRFS_LABEL_SIZE 256
/*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+ __le64 tree_root;
+ __le64 tree_root_gen;
+
+ __le64 chunk_root;
+ __le64 chunk_root_gen;
+
+ __le64 extent_root;
+ __le64 extent_root_gen;
+
+ __le64 fs_root;
+ __le64 fs_root_gen;
+
+ __le64 dev_root;
+ __le64 dev_root_gen;
+
+ __le64 csum_root;
+ __le64 csum_root_gen;
+
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 num_devices;
+ /* future */
+ __le64 unused_64[4];
+
+ u8 tree_root_level;
+ u8 chunk_root_level;
+ u8 extent_root_level;
+ u8 fs_root_level;
+ u8 dev_root_level;
+ u8 csum_root_level;
+ /* future and to align */
+ u8 unused_8[10];
+} __attribute__ ((__packed__));
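Because the struct is packed, each backup copy is exactly 19 * 8 + 16 = 168 bytes, so the four copies added to the super block below consume 672 bytes. A compile-time sketch of that invariant (using C11 _Static_assert for illustration; the kernel itself would express this with BUILD_BUG_ON):

        _Static_assert(sizeof(struct btrfs_root_backup) == 168,
                       "btrfs_root_backup on-disk layout changed");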
+
+/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
*/
@@ -394,10 +490,12 @@ struct btrfs_super_block {
char label[BTRFS_LABEL_SIZE];
__le64 cache_generation;
+ __le64 uuid_tree_generation;
/* future expansion */
- __le64 reserved[31];
+ __le64 reserved[30];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+ struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
} __attribute__ ((__packed__));
/*
@@ -408,14 +506,46 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
+/*
+ * some patches floated around with a second compression method
+ * lets save that incompat here for when they do get in
+ * Note we don't actually support it, we're just reserving the
+ * number
+ */
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
+
+/*
+ * older kernels tried to do bigger metadata blocks, but the
+ * code was pretty buggy. Let's not let them try anymore.
+ */
+#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
+
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
+#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
+#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES)
+
+#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
+ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
/*
* A leaf is full of items. offset and size tell us where to find
@@ -480,6 +610,7 @@ struct btrfs_path {
unsigned int skip_locking:1;
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
+ unsigned int need_commit_sem:1;
};
/*
@@ -508,6 +639,12 @@ struct btrfs_extent_item_v0 {
/* use full backrefs for extent pointers in the block */
#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
+/*
+ * this flag is only used internally by scrub and may be changed at any time
+ * it is only declared here to avoid collisions
+ */
+#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
+
struct btrfs_tree_block_info {
struct btrfs_disk_key key;
u8 level;
@@ -556,6 +693,14 @@ struct btrfs_inode_ref {
/* name goes here */
} __attribute__ ((__packed__));
+struct btrfs_inode_extref {
+ __le64 parent_objectid;
+ __le64 index;
+ __le16 name_len;
+ __u8 name[0];
+ /* name goes here */
+} __attribute__ ((__packed__));
+
struct btrfs_timespec {
__le64 sec;
__le32 nsec;
@@ -612,6 +757,12 @@ struct btrfs_dir_item {
#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
+/*
+ * Internal in-memory flag that a subvolume has been marked for deletion but
+ * still visible as a directory
+ */
+#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48)
+
struct btrfs_root_item {
struct btrfs_inode_item inode;
__le64 generation;
@@ -625,6 +776,36 @@ struct btrfs_root_item {
struct btrfs_disk_key drop_progress;
u8 drop_level;
u8 level;
+
+ /*
+ * The following fields appear after subvol_uuids+subvol_times
+ * were introduced.
+ */
+
+ /*
+ * This generation number is used to test if the new fields are valid
+ * and up to date while reading the root item. Every time the root item
+ * is written out, the "generation" field is copied into this field. If
+ * anyone ever mounted the fs with an older kernel, we will have
+ * mismatching generation values here and thus must invalidate the
+ * new fields. See btrfs_update_root and btrfs_find_last_root for
+ * details.
+ * The offset of generation_v2 is also used as the start for the memset
+ * when invalidating the fields.
+ */
+ __le64 generation_v2;
+ u8 uuid[BTRFS_UUID_SIZE];
+ u8 parent_uuid[BTRFS_UUID_SIZE];
+ u8 received_uuid[BTRFS_UUID_SIZE];
+ __le64 ctransid; /* updated when an inode changes */
+ __le64 otransid; /* trans when created */
+ __le64 stransid; /* trans when sent. non-zero for received subvol */
+ __le64 rtransid; /* trans when received. non-zero for received subvol */
+ struct btrfs_timespec ctime;
+ struct btrfs_timespec otime;
+ struct btrfs_timespec stime;
+ struct btrfs_timespec rtime;
+ __le64 reserved[8]; /* for future */
} __attribute__ ((__packed__));
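A minimal sketch of the validity test described in the comment above (the helper name is made up for illustration; the real logic lives alongside btrfs_update_root/btrfs_find_last_root):

        static void invalidate_new_fields_if_stale(struct btrfs_root_item *item)
        {
                size_t off = offsetof(struct btrfs_root_item, generation_v2);

                /* written by an old kernel: the copied generation no longer matches */
                if (le64_to_cpu(item->generation_v2) != le64_to_cpu(item->generation))
                        /* wipe everything from generation_v2 to the end of the item */
                        memset((char *)item + off, 0, sizeof(*item) - off);
        }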
/*
@@ -636,6 +817,57 @@ struct btrfs_root_ref {
__le16 name_len;
} __attribute__ ((__packed__));
+struct btrfs_disk_balance_args {
+ /*
+ * profiles to operate on, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 profiles;
+
+ /* usage filter */
+ __le64 usage;
+
+ /* devid filter */
+ __le64 devid;
+
+ /* devid subset filter [pstart..pend) */
+ __le64 pstart;
+ __le64 pend;
+
+ /* btrfs virtual address space subset filter [vstart..vend) */
+ __le64 vstart;
+ __le64 vend;
+
+ /*
+ * profile to convert to, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 target;
+
+ /* BTRFS_BALANCE_ARGS_* */
+ __le64 flags;
+
+ /* BTRFS_BALANCE_ARGS_LIMIT value */
+ __le64 limit;
+
+ __le64 unused[7];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+ /* BTRFS_BALANCE_* */
+ __le64 flags;
+
+ struct btrfs_disk_balance_args data;
+ struct btrfs_disk_balance_args meta;
+ struct btrfs_disk_balance_args sys;
+
+ __le64 unused[4];
+} __attribute__ ((__packed__));
+
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -694,15 +926,130 @@ struct btrfs_csum_item {
u8 csum;
} __attribute__ ((__packed__));
+struct btrfs_dev_stats_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
+
+struct btrfs_dev_replace {
+ u64 replace_state; /* see #define above */
+ u64 time_started; /* seconds since 1-Jan-1970 */
+ u64 time_stopped; /* seconds since 1-Jan-1970 */
+ atomic64_t num_write_errors;
+ atomic64_t num_uncorrectable_read_errors;
+
+ u64 cursor_left;
+ u64 committed_cursor_left;
+ u64 cursor_left_last_write_of_item;
+ u64 cursor_right;
+
+ u64 cont_reading_from_srcdev_mode; /* see #define above */
+
+ int is_valid;
+ int item_needs_writeback;
+ struct btrfs_device *srcdev;
+ struct btrfs_device *tgtdev;
+
+ pid_t lock_owner;
+ atomic_t nesting_level;
+ struct mutex lock_finishing_cancel_unmount;
+ struct mutex lock_management_lock;
+ struct mutex lock;
+
+ struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 src_devid;
+ __le64 cursor_left;
+ __le64 cursor_right;
+ __le64 cont_reading_from_srcdev_mode;
+
+ __le64 replace_state;
+ __le64 time_started;
+ __le64 time_stopped;
+ __le64 num_write_errors;
+ __le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
/* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
-#define BTRFS_NR_RAID_TYPES 5
+#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
+#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
+ BTRFS_SPACE_INFO_GLOBAL_RSV)
+
+enum btrfs_raid_types {
+ BTRFS_RAID_RAID10,
+ BTRFS_RAID_RAID1,
+ BTRFS_RAID_DUP,
+ BTRFS_RAID_RAID0,
+ BTRFS_RAID_SINGLE,
+ BTRFS_RAID_RAID5,
+ BTRFS_RAID_RAID6,
+ BTRFS_NR_RAID_TYPES
+};
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
+ BTRFS_BLOCK_GROUP_SYSTEM | \
+ BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID1 | \
+ BTRFS_BLOCK_GROUP_RAID5 | \
+ BTRFS_BLOCK_GROUP_RAID6 | \
+ BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available. This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk). The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between the two formats in the future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
+
+/*
+ * A fake block group type that is used to communicate global block reserve
+ * size to userspace via the SPACE_INFO ioctl.
+ */
+#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
+
+#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)
+
+static inline u64 chunk_to_extended(u64 flags)
+{
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
+ flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ return flags;
+}
+static inline u64 extended_to_chunk(u64 flags)
+{
+ return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+}
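A small worked example of the two helpers above: a chunk with no profile bit set (i.e. "single") gains the in-memory SINGLE bit, a chunk with a real profile passes through untouched, and extended_to_chunk() simply masks the SINGLE bit back out.

        u64 ext;

        ext = chunk_to_extended(BTRFS_BLOCK_GROUP_DATA);
        /* ext == BTRFS_BLOCK_GROUP_DATA | BTRFS_AVAIL_ALLOC_BIT_SINGLE */

        ext = chunk_to_extended(BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1);
        /* ext is unchanged: a real profile bit is already present */

        /* round trip back to the on-disk form */
        WARN_ON(extended_to_chunk(chunk_to_extended(0)) != 0);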
struct btrfs_block_group_item {
__le64 used;
@@ -710,60 +1057,147 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+/*
+ * is subvolume quota turned on?
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
+/*
+ * RESCAN is set during the initialization phase
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1)
+/*
+ * Some qgroup entries are known to be out of date,
+ * either because the configuration has changed in a way that
+ * makes a rescan necessary, or because the fs has been mounted
+ * with a non-qgroup-aware version.
+ * Turning quota off and on again makes it inconsistent, too.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
+
+#define BTRFS_QGROUP_STATUS_VERSION 1
+
+struct btrfs_qgroup_status_item {
+ __le64 version;
+ /*
+ * the generation is updated during every commit. As older
+ * versions of btrfs are not aware of qgroups, it will be
+ * possible to detect inconsistencies by checking the
+ * generation at mount time
+ */
+ __le64 generation;
+
+ /* flag definitions see above */
+ __le64 flags;
+
+ /*
+ * only used during scanning to record the progress
+ * of the scan. It contains a logical address
+ */
+ __le64 rescan;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_info_item {
+ __le64 generation;
+ __le64 rfer;
+ __le64 rfer_cmpr;
+ __le64 excl;
+ __le64 excl_cmpr;
+} __attribute__ ((__packed__));
+
+/* flags definition for qgroup limits */
+#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
+#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
+#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
+#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
+#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
+
+struct btrfs_qgroup_limit_item {
+ /*
+ * only updated when any of the other values change
+ */
+ __le64 flags;
+ __le64 max_rfer;
+ __le64 max_excl;
+ __le64 rsv_rfer;
+ __le64 rsv_excl;
+} __attribute__ ((__packed__));
+
+/* For raid type sysfs entries */
+struct raid_kobject {
+ int raid_type;
+ struct kobject kobj;
+};
+
struct btrfs_space_info {
- u64 flags;
+ spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
this doesn't take mirrors into account */
u64 bytes_used; /* total bytes used,
- this does't take mirrors into account */
+ this doesn't take mirrors into account */
u64 bytes_pinned; /* total bytes pinned, will be freed when the
transaction finishes */
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
- u64 bytes_readonly; /* total bytes that are read only */
-
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
+ u64 bytes_readonly; /* total bytes that are read only */
+
+ unsigned int full:1; /* indicates that we cannot allocate any more
+ chunks for this space */
+ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
+
+ unsigned int flush:1; /* set if we are trying to make space */
+
+ unsigned int force_alloc; /* set if we need to force a chunk
+ alloc for this space */
+
u64 disk_used; /* total bytes used on disk */
u64 disk_total; /* total bytes on disk, takes mirrors into
account */
+ u64 flags;
+
/*
- * we bump reservation progress every time we decrement
- * bytes_reserved. This way people waiting for reservations
- * know something good has happened and they can check
- * for progress. The number here isn't to be trusted, it
- * just shows reclaim activity
+ * bytes_pinned is kept in line with what is actually pinned, as in
+ * we've called update_block_group and dropped the bytes_used counter
+ * and increased the bytes_pinned counter. However this means that
+ * bytes_pinned does not reflect the bytes that will be pinned once the
+ * delayed refs are flushed; instead, this counter is incremented every
+ * time we call btrfs_free_extent, so it is a real-time count of what will
+ * be freed once the transaction is committed. It will be zeroed every
+ * time the transaction commits.
*/
- unsigned long reservation_progress;
-
- int full; /* indicates that we cannot allocate any more
- chunks for this space */
- int force_alloc; /* set if we need to force a chunk alloc for
- this space */
+ struct percpu_counter total_bytes_pinned;
struct list_head list;
+ struct rw_semaphore groups_sem;
/* for block groups in our same type */
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
- spinlock_t lock;
- struct rw_semaphore groups_sem;
- atomic_t caching_threads;
+ wait_queue_head_t wait;
+
+ struct kobject kobj;
+ struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};
+#define BTRFS_BLOCK_RSV_GLOBAL 1
+#define BTRFS_BLOCK_RSV_DELALLOC 2
+#define BTRFS_BLOCK_RSV_TRANS 3
+#define BTRFS_BLOCK_RSV_CHUNK 4
+#define BTRFS_BLOCK_RSV_DELOPS 5
+#define BTRFS_BLOCK_RSV_EMPTY 6
+#define BTRFS_BLOCK_RSV_TEMP 7
+
struct btrfs_block_rsv {
u64 size;
u64 reserved;
- u64 freed[2];
struct btrfs_space_info *space_info;
- struct list_head list;
spinlock_t lock;
- atomic_t usage;
- unsigned int priority:8;
- unsigned int durable:1;
- unsigned int refill_used:1;
- unsigned int full:1;
+ unsigned short full;
+ unsigned short type;
+ unsigned short failfast;
};
/*
@@ -782,9 +1216,6 @@ struct btrfs_free_cluster {
/* first extent starting offset */
u64 window_start;
- /* if this cluster simply points at a bitmap in the block group */
- bool points_to_bitmap;
-
struct btrfs_block_group_cache *block_group;
/*
* when a cluster is allocated from a block group, we put the
@@ -797,7 +1228,9 @@ struct btrfs_free_cluster {
enum btrfs_caching_type {
BTRFS_CACHE_NO = 0,
BTRFS_CACHE_STARTED = 1,
- BTRFS_CACHE_FINISHED = 2,
+ BTRFS_CACHE_FAST = 2,
+ BTRFS_CACHE_FINISHED = 3,
+ BTRFS_CACHE_ERROR = 4,
};
enum btrfs_disk_cache_state {
@@ -812,6 +1245,7 @@ struct btrfs_caching_control {
struct list_head list;
struct mutex mutex;
wait_queue_head_t wait;
+ struct btrfs_work work;
struct btrfs_block_group_cache *block_group;
u64 progress;
atomic_t count;
@@ -825,13 +1259,22 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
- u64 reserved_pinned;
+ u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
u64 sectorsize;
- int extents_thresh;
- int free_extents;
- int total_bitmaps;
+ u64 cache_generation;
+
+ /*
+ * It is only used for the delayed data space allocation, because
+ * only the data space allocation and the related metadata update
+ * can be done across transactions.
+ */
+ struct rw_semaphore data_rwsem;
+
+ /* for raid56, this is a full stripe, without parity */
+ unsigned long full_stripe_len;
+
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
@@ -846,9 +1289,7 @@ struct btrfs_block_group_cache {
struct btrfs_space_info *space_info;
/* free space cache stuff */
- spinlock_t tree_lock;
- struct rb_root free_space_offset;
- u64 free_space;
+ struct btrfs_free_space_ctl *free_space_ctl;
/* block group cache stuff */
struct rb_node cache_node;
@@ -863,11 +1304,47 @@ struct btrfs_block_group_cache {
* Today it will only have one thing on it, but that may change
*/
struct list_head cluster_list;
+
+ /* For delayed block group creation */
+ struct list_head new_bg_list;
+};
+
+/* delayed seq elem */
+struct seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+enum btrfs_orphan_cleanup_state {
+ ORPHAN_CLEANUP_STARTED = 1,
+ ORPHAN_CLEANUP_DONE = 2,
};
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+ struct list_head hash_list;
+ wait_queue_head_t wait;
+ spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+ struct list_head stripe_cache;
+ spinlock_t cache_lock;
+ int cache_size;
+ struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
+void btrfs_init_async_reclaim_work(struct work_struct *work);
+
+/* fs_info */
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
@@ -877,6 +1354,8 @@ struct btrfs_fs_info {
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
struct btrfs_root *csum_root;
+ struct btrfs_root *quota_root;
+ struct btrfs_root *uuid_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -886,15 +1365,23 @@ struct btrfs_fs_info {
/* block group cache stuff */
spinlock_t block_group_cache_lock;
+ u64 first_logical_byte;
struct rb_root block_group_cache_tree;
+ /* keep track of unallocated space */
+ spinlock_t free_chunk_lock;
+ u64 free_chunk_space;
+
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
- /* block reservation for extent, checksum and root tree */
+ /*
+ * block reservation for extent, checksum, root tree and
+ * delayed dir index item
+ */
struct btrfs_block_rsv global_block_rsv;
/* block reservation for delay allocation */
struct btrfs_block_rsv delalloc_block_rsv;
@@ -902,26 +1389,40 @@ struct btrfs_fs_info {
struct btrfs_block_rsv trans_block_rsv;
/* block reservation for chunk tree */
struct btrfs_block_rsv chunk_block_rsv;
+ /* block reservation for delayed operations */
+ struct btrfs_block_rsv delayed_block_rsv;
struct btrfs_block_rsv empty_block_rsv;
- /* list of block reservations that cross multiple transactions */
- struct list_head durable_block_rsv_list;
-
- struct mutex durable_block_rsv_mutex;
-
u64 generation;
u64 last_trans_committed;
+ u64 avg_delayed_ref_runtime;
/*
* this is updated to the current trans every time a full commit
* is required instead of the faster short fsync log commits
*/
u64 last_trans_log_full_commit;
- u64 open_ioctl_trans;
- unsigned long mount_opt:20;
+ unsigned long mount_opt;
unsigned long compress_type:4;
+ int commit_interval;
+ /*
+ * It is an advisory number; the read side is safe even if it sees a
+ * wrong value, because we will write out the data into a regular
+ * extent. The write side (mount/remount) is under the ->s_umount lock,
+ * so it is also safe.
+ */
u64 max_inline;
+ /*
+ * Protected by ->chunk_mutex and sb->s_umount.
+ *
+ * The reason we use two locks to protect it is that only the mount and
+ * remount operations can change it, and those operations run under
+ * sb->s_umount, while the read side (chunk allocation) cannot acquire
+ * sb->s_umount or a deadlock would happen. So we use two locks to
+ * protect it: the write side must acquire both locks, and the read
+ * side only needs to acquire one of them.
+ */
u64 alloc_start;
struct btrfs_transaction *running_transaction;
wait_queue_head_t transaction_throttle;
@@ -929,18 +1430,35 @@ struct btrfs_fs_info {
wait_queue_head_t transaction_blocked_wait;
wait_queue_head_t async_submit_wait;
- struct btrfs_super_block super_copy;
- struct btrfs_super_block super_for_commit;
+ /*
+ * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+ * when they are updated.
+ *
+ * Because we never clear the flags, we don't need the lock on the
+ * read side.
+ *
+ * We also don't need the lock when we mount the fs, because there
+ * is no other task that could be updating the flags at that point.
+ */
+ spinlock_t super_lock;
+ struct btrfs_super_block *super_copy;
+ struct btrfs_super_block *super_for_commit;
struct block_device *__bdev;
struct super_block *sb;
struct inode *btree_inode;
struct backing_dev_info bdi;
- struct mutex trans_mutex;
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex volume_mutex;
+
+ /* this is used during read/modify/write to make sure
+ * no two ios are trying to mod the same stripe at the same
+ * time
+ */
+ struct btrfs_stripe_hash_table *stripe_hash_table;
+
/*
* this protects the ordered operations list only while we are
* processing all of the entries on it. This way we make
@@ -949,52 +1467,67 @@ struct btrfs_fs_info {
* before jumping into the main commit.
*/
struct mutex ordered_operations_mutex;
- struct rw_semaphore extent_commit_sem;
+
+ /*
+ * Same as ordered_operations_mutex except this is for ordered extents
+ * and not the operations.
+ */
+ struct mutex ordered_extent_flush_mutex;
+
+ struct rw_semaphore commit_root_sem;
struct rw_semaphore cleanup_work_sem;
struct rw_semaphore subvol_sem;
struct srcu_struct subvol_srcu;
+ spinlock_t trans_lock;
+ /*
+ * the reloc mutex goes with the trans lock, it is taken
+ * during commit to protect us from the relocation code
+ */
+ struct mutex reloc_mutex;
+
struct list_head trans_list;
- struct list_head hashers;
struct list_head dead_roots;
struct list_head caching_block_groups;
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
+ /* this protects tree_mod_seq_list */
+ spinlock_t tree_mod_seq_lock;
+ atomic64_t tree_mod_seq;
+ struct list_head tree_mod_seq_list;
+
+ /* this protects tree_mod_log */
+ rwlock_t tree_mod_log_lock;
+ struct rb_root tree_mod_log;
+
atomic_t nr_async_submits;
atomic_t async_submit_draining;
atomic_t nr_async_bios;
atomic_t async_delalloc_pages;
+ atomic_t open_ioctl_trans;
/*
- * this is used by the balancing code to wait for all the pending
- * ordered extents
+ * this is used to protect the following list -- ordered_roots.
*/
- spinlock_t ordered_extent_lock;
+ spinlock_t ordered_root_lock;
/*
- * all of the data=ordered extents pending writeback
+ * all fs/file tree roots in which there are data=ordered extents
+ * pending writeback are added into this list.
+ *
* these can span multiple transactions and basically include
* every dirty data page that isn't from nodatacow
*/
- struct list_head ordered_extents;
+ struct list_head ordered_roots;
- /*
- * all of the inodes that have delalloc bytes. It is possible for
- * this list to be empty even when there is still dirty data=ordered
- * extents waiting to finish IO.
- */
- struct list_head delalloc_inodes;
-
- /*
- * special rename and truncate targets that must be on disk before
- * we're allowed to commit. This is basically the ext3 style
- * data=ordered list.
- */
- struct list_head ordered_operations;
+ struct mutex delalloc_root_mutex;
+ spinlock_t delalloc_root_lock;
+ /* all fs/file tree roots that have delalloc inodes. */
+ struct list_head delalloc_roots;
/*
* there is a pool of worker threads for checksumming during writes
@@ -1006,38 +1539,50 @@ struct btrfs_fs_info {
* A third pool does submit_bio to avoid deadlocking with the other
* two
*/
- struct btrfs_workers generic_worker;
- struct btrfs_workers workers;
- struct btrfs_workers delalloc_workers;
- struct btrfs_workers endio_workers;
- struct btrfs_workers endio_meta_workers;
- struct btrfs_workers endio_meta_write_workers;
- struct btrfs_workers endio_write_workers;
- struct btrfs_workers endio_freespace_worker;
- struct btrfs_workers submit_workers;
+ struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *delalloc_workers;
+ struct btrfs_workqueue *flush_workers;
+ struct btrfs_workqueue *endio_workers;
+ struct btrfs_workqueue *endio_meta_workers;
+ struct btrfs_workqueue *endio_raid56_workers;
+ struct btrfs_workqueue *rmw_workers;
+ struct btrfs_workqueue *endio_meta_write_workers;
+ struct btrfs_workqueue *endio_write_workers;
+ struct btrfs_workqueue *endio_freespace_worker;
+ struct btrfs_workqueue *submit_workers;
+ struct btrfs_workqueue *caching_workers;
+ struct btrfs_workqueue *readahead_workers;
+
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
* for the sys_munmap function call path
*/
- struct btrfs_workers fixup_workers;
+ struct btrfs_workqueue *fixup_workers;
+ struct btrfs_workqueue *delayed_workers;
+
+ /* the extent workers do delayed refs on the extent allocation tree */
+ struct btrfs_workqueue *extent_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
int thread_pool_size;
struct kobject super_kobj;
+ struct kobject *space_info_kobj;
+ struct kobject *device_dir_kobj;
struct completion kobj_unregister;
int do_barriers;
int closing;
int log_root_recovering;
- int enospc_unlink;
u64 total_pinned;
- /* protected by the delalloc lock, used to keep from writing
- * metadata until there is a nice batch
- */
- u64 dirty_metadata_bytes;
+ /* used to keep from writing metadata until there is a nice batch */
+ struct percpu_counter dirty_metadata_bytes;
+ struct percpu_counter delalloc_bytes;
+ s32 dirty_metadata_batch;
+ s32 delalloc_batch;
+
struct list_head dirty_cowonly_roots;
struct btrfs_fs_devices *fs_devices;
@@ -1049,11 +1594,9 @@ struct btrfs_fs_info {
*/
struct list_head space_info;
- struct reloc_control *reloc_ctl;
+ struct btrfs_space_info *data_sinfo;
- spinlock_t delalloc_lock;
- spinlock_t new_trans_lock;
- u64 delalloc_bytes;
+ struct reloc_control *reloc_ctl;
/* data_alloc_cluster is only used in ssd mode */
struct btrfs_free_cluster data_alloc_cluster;
@@ -1061,39 +1604,164 @@ struct btrfs_fs_info {
/* all metadata allocations go through this cluster */
struct btrfs_free_cluster meta_alloc_cluster;
- spinlock_t ref_cache_lock;
- u64 total_ref_cache_size;
+ /* auto defrag inodes go here */
+ spinlock_t defrag_inodes_lock;
+ struct rb_root defrag_inodes;
+ atomic_t defrag_running;
+ /* Used to protect avail_{data, metadata, system}_alloc_bits */
+ seqlock_t profiles_lock;
+ /*
+ * these three are in extended format (availability of single
+ * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+ * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
u64 avail_data_alloc_bits;
u64 avail_metadata_alloc_bits;
u64 avail_system_alloc_bits;
- u64 data_alloc_profile;
- u64 metadata_alloc_profile;
- u64 system_alloc_profile;
+
+ /* restriper state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_running;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
unsigned data_chunk_allocations;
unsigned metadata_ratio;
void *bdev_holder;
+ /* private scrub information */
+ struct mutex scrub_lock;
+ atomic_t scrubs_running;
+ atomic_t scrub_pause_req;
+ atomic_t scrubs_paused;
+ atomic_t scrub_cancel_req;
+ wait_queue_head_t scrub_pause_wait;
+ int scrub_workers_refcnt;
+ struct btrfs_workqueue *scrub_workers;
+ struct btrfs_workqueue *scrub_wr_completion_workers;
+ struct btrfs_workqueue *scrub_nocow_workers;
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ u32 check_integrity_print_mask;
+#endif
+ /*
+ * quota information
+ */
+ unsigned int quota_enabled:1;
+
+ /*
+ * quota_enabled only changes state after a commit. This holds the
+ * next state.
+ */
+ unsigned int pending_quota_state:1;
+
+ /* is qgroup tracking in a consistent state? */
+ u64 qgroup_flags;
+
+ /* holds configuration and tracking. Protected by qgroup_lock */
+ struct rb_root qgroup_tree;
+ struct rb_root qgroup_op_tree;
+ spinlock_t qgroup_lock;
+ spinlock_t qgroup_op_lock;
+ atomic_t qgroup_op_seq;
+
+ /*
+ * used to avoid frequently calling ulist_alloc()/ulist_free()
+ * when doing qgroup accounting, it must be protected by qgroup_lock.
+ */
+ struct ulist *qgroup_ulist;
+
+ /* protect user change for quota operations */
+ struct mutex qgroup_ioctl_lock;
+
+ /* list of dirty qgroups to be written at next commit */
+ struct list_head dirty_qgroups;
+
+ /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+ u64 qgroup_seq;
+
+ /* qgroup rescan items */
+ struct mutex qgroup_rescan_lock; /* protects the progress item */
+ struct btrfs_key qgroup_rescan_progress;
+ struct btrfs_workqueue *qgroup_rescan_workers;
+ struct completion qgroup_rescan_completion;
+ struct btrfs_work qgroup_rescan_work;
+
/* filesystem state */
- u64 fs_state;
+ unsigned long fs_state;
+
+ struct btrfs_delayed_root *delayed_root;
+
+ /* readahead tree */
+ spinlock_t reada_lock;
+ struct radix_tree_root reada_tree;
+
+ /* Extent buffer radix tree */
+ spinlock_t buffer_lock;
+ struct radix_tree_root buffer_radix;
+
+ /* next backup root to be overwritten */
+ int backup_root_index;
+
+ int num_tolerated_disk_barrier_failures;
+
+ /* device replace state */
+ struct btrfs_dev_replace dev_replace;
+
+ atomic_t mutually_exclusive_operation_running;
+
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
+
+ struct semaphore uuid_tree_rescan_sem;
+ unsigned int update_uuid_tree_gen:1;
+
+ /* Used to reclaim the metadata space in the background. */
+ struct work_struct async_reclaim_work;
+};
+
+struct btrfs_subvolume_writers {
+ struct percpu_counter counter;
+ wait_queue_head_t wait;
};
/*
+ * The state of btrfs root
+ */
+/*
+ * btrfs_record_root_in_trans is a multi-step process,
+ * and it can race with the balancing code. But the
+ * race window is very small, and it only exists the first
+ * time the root is added to each transaction. So IN_TRANS_SETUP
+ * is used to tell us when more checks are required.
+ */
+#define BTRFS_ROOT_IN_TRANS_SETUP 0
+#define BTRFS_ROOT_REF_COWS 1
+#define BTRFS_ROOT_TRACK_DIRTY 2
+#define BTRFS_ROOT_IN_RADIX 3
+#define BTRFS_ROOT_DUMMY_ROOT 4
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5
+#define BTRFS_ROOT_DEFRAG_RUNNING 6
+#define BTRFS_ROOT_FORCE_COW 7
+#define BTRFS_ROOT_MULTI_LOG_TASKS 8
+
+/*
* in ram representation of the tree. extent_root is used for all allocations
* and for the extent tree extent_root root.
*/
struct btrfs_root {
struct extent_buffer *node;
- /* the node lock is held while changing the node pointer */
- spinlock_t node_lock;
-
struct extent_buffer *commit_root;
struct btrfs_root *log_root;
struct btrfs_root *reloc_root;
+ unsigned long state;
struct btrfs_root_item root_item;
struct btrfs_key root_key;
struct btrfs_fs_info *fs_info;
@@ -1106,16 +1774,28 @@ struct btrfs_root {
spinlock_t accounting_lock;
struct btrfs_block_rsv *block_rsv;
+ /* free ino cache stuff */
+ struct btrfs_free_space_ctl *free_ino_ctl;
+ enum btrfs_caching_type cached;
+ spinlock_t cache_lock;
+ wait_queue_head_t cache_wait;
+ struct btrfs_free_space_ctl *free_ino_pinned;
+ u64 cache_progress;
+ struct inode *cache_inode;
+
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
+ struct list_head log_ctxs[2];
atomic_t log_writers;
atomic_t log_commit[2];
- unsigned long log_transid;
- unsigned long last_log_commit;
- unsigned long log_batch;
+ atomic_t log_batch;
+ int log_transid;
+ /* Updated no matter whether the commit succeeds or not */
+ int log_transid;
+ /* Only updated when the commit succeeds. */
+ int last_log_commit;
pid_t log_start_pid;
- bool log_multiple_pids;
u64 objectid;
u64 last_trans;
@@ -1134,26 +1814,27 @@ struct btrfs_root {
u32 type;
u64 highest_objectid;
- int ref_cows;
- int track_dirty;
- int in_radix;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ u64 alloc_bytenr;
+#endif
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
- int defrag_running;
char *name;
- int in_sysfs;
/* the dirty list is only used by non-reference counted roots */
struct list_head dirty_list;
struct list_head root_list;
+ spinlock_t log_extents_lock[2];
+ struct list_head logged_list[2];
+
spinlock_t orphan_lock;
- struct list_head orphan_list;
+ atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
- int orphan_item_inserted;
int orphan_cleanup_state;
spinlock_t inode_lock;
@@ -1161,12 +1842,87 @@ struct btrfs_root {
struct rb_root inode_tree;
/*
+ * radix tree that keeps track of delayed nodes of every inode,
+ * protected by inode_lock
+ */
+ struct radix_tree_root delayed_nodes_tree;
+ /*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
*/
- struct super_block anon_super;
+ dev_t anon_dev;
+
+ spinlock_t root_item_lock;
+ atomic_t refs;
+
+ struct mutex delalloc_mutex;
+ spinlock_t delalloc_lock;
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
+ struct list_head delalloc_inodes;
+ struct list_head delalloc_root;
+ u64 nr_delalloc_inodes;
+
+ struct mutex ordered_extent_mutex;
+ /*
+ * this is used by the balancing code to wait for all the pending
+ * ordered extents
+ */
+ spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
+ struct list_head ordered_extents;
+ struct list_head ordered_root;
+ u64 nr_ordered_extents;
+
+ /*
+ * Number of currently running SEND ioctls to prevent
+	 * manipulation of the read-only status via SUBVOL_SETFLAGS
+ */
+ int send_in_progress;
+ struct btrfs_subvolume_writers *subv_writers;
+ atomic_t will_be_snapshoted;
+};
+
+struct btrfs_ioctl_defrag_range_args {
+ /* start of the defrag operation */
+ __u64 start;
+
+ /* number of bytes to defrag, use (u64)-1 to say all */
+ __u64 len;
+
+ /*
+ * flags for the operation, which can include turning
+ * on compression for this one defrag
+ */
+ __u64 flags;
+
+ /*
+ * any extent bigger than this will be considered
+	 * already defragged. Use 0 to take the kernel default.
+ * Use 1 to say every single extent must be rewritten
+ */
+ __u32 extent_thresh;
+
+ /*
+ * which compression method to use if turning on compression
+ * for this defrag operation. If unspecified, zlib will
+ * be used
+ */
+ __u32 compress_type;
+
+ /* spare for later */
+ __u32 unused[4];
};
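struct btrfs_ioctl_defrag_range_args mirrors the argument block userspace hands to the range-defrag ioctl. A hedged userspace sketch; BTRFS_IOC_DEFRAG_RANGE, BTRFS_DEFRAG_RANGE_COMPRESS and BTRFS_COMPRESS_ZLIB are assumed to come from the btrfs uapi headers and are not defined in this hunk:

/* Userspace sketch: defragment the first 1 GiB of an open file, compressing it. */
#include <sys/ioctl.h>
#include <string.h>

int example_defrag_range(int fd)
{
	struct btrfs_ioctl_defrag_range_args args;

	memset(&args, 0, sizeof(args));
	args.start = 0;
	args.len = 1024ULL * 1024 * 1024;	/* (u64)-1 would mean the whole file */
	args.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	args.extent_thresh = 0;			/* 0 = kernel default threshold */
	args.compress_type = BTRFS_COMPRESS_ZLIB;

	return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args);
}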
+
/*
* inode items have the data typically returned from stat and store other
* info about object characteristics. There is one for every file and dir in
@@ -1174,6 +1930,7 @@ struct btrfs_root {
*/
#define BTRFS_INODE_ITEM_KEY 1
#define BTRFS_INODE_REF_KEY 12
+#define BTRFS_INODE_EXTREF_KEY 13
#define BTRFS_XATTR_ITEM_KEY 24
#define BTRFS_ORPHAN_ITEM_KEY 48
/* reserve 2-15 close to the inode for later flexibility */
@@ -1222,6 +1979,12 @@ struct btrfs_root {
*/
#define BTRFS_EXTENT_ITEM_KEY 168
+/*
+ * The same as BTRFS_EXTENT_ITEM_KEY, except that for metadata we already
+ * know the length, so we save the level in key->offset instead of the length.
+ */
+#define BTRFS_METADATA_ITEM_KEY 169
+
#define BTRFS_TREE_BLOCK_REF_KEY 176
#define BTRFS_EXTENT_DATA_REF_KEY 178
@@ -1243,11 +2006,67 @@ struct btrfs_root {
#define BTRFS_CHUNK_ITEM_KEY 228
/*
+ * Records the overall state of the qgroups.
+ * There's only one instance of this key present,
+ * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+ */
+#define BTRFS_QGROUP_STATUS_KEY 240
+/*
+ * Records the currently used space of the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_INFO_KEY 242
+/*
+ * Contains the user configured limits for the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_LIMIT_KEY 244
+/*
+ * Records the child-parent relationship of qgroups. For
+ * each relation, 2 keys are present:
+ * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+ * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+ */
+#define BTRFS_QGROUP_RELATION_KEY 246
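The comments above spell out the (objectid, type, offset) layout of each qgroup key. A small hedged sketch of building such a key for a lookup, assuming the usual struct btrfs_key with objectid/type/offset members and the btrfs_search_slot() declared later in this header:

/* Sketch: look up the info item of one qgroup in the quota tree. */
static int example_qgroup_info_lookup(struct btrfs_root *quota_root,
				      struct btrfs_path *path, u64 qgroupid)
{
	struct btrfs_key key;

	key.objectid = 0;		/* always 0, per the comment above */
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;		/* the qgroup being looked up */

	return btrfs_search_slot(NULL, quota_root, &key, path, 0, 0);
}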
+
+#define BTRFS_BALANCE_ITEM_KEY 248
+
+/*
+ * Persistently stores the IO stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY 249
+
+/*
+ * Persistently stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY 250
+
+/*
+ * Stores items that allow UUIDs to be quickly mapped to something else.
+ * These items are part of the filesystem UUID tree.
+ * The key is built like this:
+ * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
+ */
+#if BTRFS_UUID_SIZE != 16
+#error "UUID items require BTRFS_UUID_SIZE == 16!"
+#endif
+#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */
+#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to
+ * received subvols */
+
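The UUID tree keys above split a 16-byte UUID across the objectid and offset fields. A hedged sketch of that split, assuming get_unaligned_le64() as the uuid-tree code uses it:

/* Sketch: build the key for the subvolume-UUID mapping item of "uuid" (16 bytes). */
static void example_uuid_key(const u8 *uuid, struct btrfs_key *key)
{
	key->objectid = get_unaligned_le64(uuid);		/* upper 64 bits */
	key->type = BTRFS_UUID_KEY_SUBVOL;
	key->offset = get_unaligned_le64(uuid + sizeof(u64));	/* lower 64 bits */
}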
+/*
* string items are for debugging. They just store a short string of
* data in the FS
*/
#define BTRFS_STRING_ITEM_KEY 253
+/*
+ * Flags for mount options.
+ *
+ * Note: don't forget to add new options to btrfs_show_options()
+ */
#define BTRFS_MOUNT_NODATASUM (1 << 0)
#define BTRFS_MOUNT_NODATACOW (1 << 1)
#define BTRFS_MOUNT_NOBARRIER (1 << 2)
@@ -1264,11 +2083,37 @@ struct btrfs_root {
#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
+#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
+#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
+#define BTRFS_MOUNT_RECOVERY (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
+#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
+#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
+#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
+
+#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
BTRFS_MOUNT_##opt)
+#define btrfs_set_and_info(root, opt, fmt, args...) \
+{ \
+ if (!btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_set_opt(root->fs_info->mount_opt, opt); \
+}
+
+#define btrfs_clear_and_info(root, opt, fmt, args...) \
+{ \
+ if (btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_clear_opt(root->fs_info->mount_opt, opt); \
+}
+
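btrfs_set_and_info()/btrfs_clear_and_info() wrap the plain bit macros above and log once when an option actually changes state. A hedged sketch of how they might be used together with btrfs_test_opt(); example_toggle_nodatacow is purely illustrative:

/* Sketch: toggling and testing a mount option on a given root. */
static void example_toggle_nodatacow(struct btrfs_root *root, bool enable)
{
	if (enable)
		btrfs_set_and_info(root, NODATACOW, "setting nodatacow");
	else
		btrfs_clear_and_info(root, NODATACOW, "clearing nodatacow");

	if (btrfs_test_opt(root, NODATACOW))
		pr_debug("nodatacow is active\n");
}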
/*
* Inode flags
*/
@@ -1283,6 +2128,20 @@ struct btrfs_root {
#define BTRFS_INODE_NODUMP (1 << 8)
#define BTRFS_INODE_NOATIME (1 << 9)
#define BTRFS_INODE_DIRSYNC (1 << 10)
+#define BTRFS_INODE_COMPRESS (1 << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
+
+struct btrfs_map_token {
+ struct extent_buffer *eb;
+ char *kaddr;
+ unsigned long offset;
+};
+
+static inline void btrfs_init_map_token(struct btrfs_map_token *token)
+{
+ token->kaddr = NULL;
+}
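A btrfs_map_token caches the most recently mapped page of an extent buffer so that a burst of field accesses does not remap it each time. A minimal sketch, assuming the token-based accessors generated further down (btrfs_token_inode_size() and friends) exist for struct btrfs_inode_item:

/* Sketch: read several fields of one on-disk inode item through one token. */
static u64 example_read_inode_item(struct extent_buffer *leaf,
				   struct btrfs_inode_item *ii)
{
	struct btrfs_map_token token;
	u64 size, gen;

	btrfs_init_map_token(&token);
	size = btrfs_token_inode_size(leaf, ii, &token);
	gen = btrfs_token_inode_generation(leaf, ii, &token);

	return size + gen;	/* placeholder use of the two values */
}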
/* some macros to generate set/get funcs for the struct fields. This
 * assumes there is a lefoo_to_cpu for every type, so let's make a simple
@@ -1304,26 +2163,67 @@ struct btrfs_root {
offsetof(type, member), \
sizeof(((type *)0)->member)))
-#ifndef BTRFS_SETGET_FUNCS
+#define DECLARE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, \
+ struct btrfs_map_token *token); \
+void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val, \
+ struct btrfs_map_token *token); \
+static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off) \
+{ \
+ return btrfs_get_token_##bits(eb, ptr, off, NULL); \
+} \
+static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val) \
+{ \
+ btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
+}
+
+DECLARE_BTRFS_SETGET_BITS(8)
+DECLARE_BTRFS_SETGET_BITS(16)
+DECLARE_BTRFS_SETGET_BITS(32)
+DECLARE_BTRFS_SETGET_BITS(64)
+
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
-u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
-void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
-#endif
+static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ return btrfs_get_##bits(eb, s, offsetof(type, member)); \
+} \
+static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
+ u##bits val) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ btrfs_set_##bits(eb, s, offsetof(type, member), val); \
+} \
+static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
+ struct btrfs_map_token *token) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
+} \
+static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
+ type *s, u##bits val, \
+ struct btrfs_map_token *token) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
+}
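For orientation, each invocation of BTRFS_SETGET_FUNCS() above fans out into four accessors. This is a sketch of the signatures that, for example, BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64) effectively provides, not a literal expansion:

static inline u64 btrfs_inode_size(struct extent_buffer *eb,
				   struct btrfs_inode_item *s);
static inline void btrfs_set_inode_size(struct extent_buffer *eb,
					struct btrfs_inode_item *s, u64 val);
static inline u64 btrfs_token_inode_size(struct extent_buffer *eb,
					 struct btrfs_inode_item *s,
					 struct btrfs_map_token *token);
static inline void btrfs_set_token_inode_size(struct extent_buffer *eb,
					      struct btrfs_inode_item *s,
					      u64 val,
					      struct btrfs_map_token *token);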
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(struct extent_buffer *eb) \
{ \
- type *p = kmap_atomic(eb->first_page, KM_USER0); \
+ type *p = page_address(eb->pages[0]); \
u##bits res = le##bits##_to_cpu(p->member); \
- kunmap_atomic(p, KM_USER0); \
return res; \
} \
static inline void btrfs_set_##name(struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = kmap_atomic(eb->first_page, KM_USER0); \
+ type *p = page_address(eb->pages[0]); \
p->member = cpu_to_le##bits(val); \
- kunmap_atomic(p, KM_USER0); \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
@@ -1371,14 +2271,14 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
generation, 64);
-static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
+static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
{
- return (char *)d + offsetof(struct btrfs_dev_item, uuid);
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
}
-static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
+static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
{
- return (char *)d + offsetof(struct btrfs_dev_item, fsid);
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
}
BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
@@ -1436,26 +2336,12 @@ static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
}
-static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
- struct btrfs_chunk *c, int nr,
- u64 val)
-{
- btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
-}
-
static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
struct btrfs_chunk *c, int nr)
{
return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
}
-static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
- struct btrfs_chunk *c, int nr,
- u64 val)
-{
- btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
-}
-
/* struct btrfs_block_group_item */
BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
used, 64);
@@ -1475,6 +2361,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+ parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+ name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
/* struct btrfs_inode_item */
BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -1488,6 +2381,23 @@ BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+ sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+ transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
+ nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+ block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
static inline struct btrfs_timespec *
btrfs_inode_atime(struct btrfs_inode_item *inode_item)
@@ -1513,16 +2423,10 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
return (struct btrfs_timespec *)ptr;
}
-static inline struct btrfs_timespec *
-btrfs_inode_otime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, otime);
- return (struct btrfs_timespec *)ptr;
-}
-
BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
/* struct btrfs_dev_extent */
BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
@@ -1533,10 +2437,10 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
chunk_offset, 64);
BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
-static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
{
unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
- return (u8 *)((unsigned long)dev + ptr);
+ return (unsigned long)dev + ptr;
}
BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
@@ -1604,6 +2508,10 @@ BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
/* struct btrfs_node */
BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr,
+ blockptr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
+ generation, 64);
static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
{
@@ -1660,6 +2568,8 @@ static inline void btrfs_set_node_key(struct extent_buffer *eb,
/* struct btrfs_item */
BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
static inline unsigned long btrfs_item_nr_offset(int nr)
{
@@ -1667,8 +2577,7 @@ static inline unsigned long btrfs_item_nr_offset(int nr)
sizeof(struct btrfs_item) * nr;
}
-static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
- int nr)
+static inline struct btrfs_item *btrfs_item_nr(int nr)
{
return (struct btrfs_item *)btrfs_item_nr_offset(nr);
}
@@ -1681,30 +2590,30 @@ static inline u32 btrfs_item_end(struct extent_buffer *eb,
static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
{
- return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
+ return btrfs_item_end(eb, btrfs_item_nr(nr));
}
static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
{
- return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
+ return btrfs_item_offset(eb, btrfs_item_nr(nr));
}
static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
{
- return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
+ return btrfs_item_size(eb, btrfs_item_nr(nr));
}
static inline void btrfs_item_key(struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
- struct btrfs_item *item = btrfs_item_nr(eb, nr);
+ struct btrfs_item *item = btrfs_item_nr(nr);
read_eb_member(eb, item, struct btrfs_item, key, disk_key);
}
static inline void btrfs_set_item_key(struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
- struct btrfs_item *item = btrfs_item_nr(eb, nr);
+ struct btrfs_item *item = btrfs_item_nr(nr);
write_eb_member(eb, item, struct btrfs_item, key, disk_key);
}
@@ -1722,6 +2631,13 @@ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item,
+ data_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
+ name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
+ transid, 64);
static inline void btrfs_dir_item_key(struct extent_buffer *eb,
struct btrfs_dir_item *item,
@@ -1824,6 +2740,12 @@ BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
+ nritems, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
{
@@ -1859,43 +2781,14 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
btrfs_set_header_flags(eb, flags);
}
-static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_fsid(void)
{
- unsigned long ptr = offsetof(struct btrfs_header, fsid);
- return (u8 *)ptr;
+ return offsetof(struct btrfs_header, fsid);
}
-static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
{
- unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
- return (u8 *)ptr;
-}
-
-static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
-{
- unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
- return (u8 *)ptr;
-}
-
-static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
-{
- unsigned long ptr = offsetof(struct btrfs_header, csum);
- return (u8 *)ptr;
-}
-
-static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
-{
- return NULL;
-}
-
-static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
-{
- return NULL;
-}
-
-static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
-{
- return NULL;
+ return offsetof(struct btrfs_header, chunk_tree_uuid);
}
static inline int btrfs_is_leaf(struct extent_buffer *eb)
@@ -1921,14 +2814,158 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+ generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
+ ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
+ otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
+ stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
+ rtransid, 64);
static inline bool btrfs_root_readonly(struct btrfs_root *root)
{
- return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
-/* struct btrfs_super_block */
+static inline bool btrfs_root_dead(struct btrfs_root *root)
+{
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+ tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+ tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+ tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+ chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+ chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+ extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+ fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+ fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+ fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+ dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+ dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+ dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+ csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+ csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+ csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+ num_devices, 64);
+
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
+}
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
+}
+
+/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -1976,11 +3013,16 @@ BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
csum_type, 16);
BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
cache_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
+BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
+ uuid_tree_generation, 64);
static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
{
- int t = btrfs_super_csum_type(s);
- BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
+ u16 t = btrfs_super_csum_type(s);
+ /*
+ * csum type is validated at mount time
+ */
return btrfs_csum_sizes[t];
}
@@ -1991,6 +3033,18 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
/* struct btrfs_file_extent_item */
BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
+ struct btrfs_file_extent_item, disk_bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
+ struct btrfs_file_extent_item, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
+ struct btrfs_file_extent_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
+ struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
+ struct btrfs_file_extent_item, disk_num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
+ struct btrfs_file_extent_item, compression, 8);
static inline unsigned long
btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
@@ -2024,15 +3078,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
other_encoding, 16);
-/* this returns the number of file bytes represented by the inline item.
- * If an item is compressed, this is the uncompressed size
- */
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
- struct btrfs_file_extent_item *e)
-{
- return btrfs_file_extent_ram_bytes(eb, e);
-}
-
/*
* this returns the number of bytes used by the item on disk, minus the
* size of any extent headers. If a file is compressed on disk, this is
@@ -2046,25 +3091,145 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
return btrfs_item_size(eb, e) - offset;
}
-static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+ int slot,
+ struct btrfs_file_extent_item *fi)
{
- return sb->s_fs_info;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+ /*
+ * return the space used on disk if this item isn't
+ * compressed or encoded
+ */
+ if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 &&
+ btrfs_token_file_extent_encryption(eb, fi, &token) == 0 &&
+ btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) {
+ return btrfs_file_extent_inline_item_len(eb,
+ btrfs_item_nr(slot));
+ }
+
+ /* otherwise use the ram bytes field */
+ return btrfs_token_file_extent_ram_bytes(eb, fi, &token);
}
-static inline int btrfs_set_root_name(struct btrfs_root *root,
- const char *name, int len)
+
+/* btrfs_dev_stats_item */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index)
{
- /* if we already have a name just free it */
- kfree(root->name);
+ u64 val;
- root->name = kmalloc(len+1, GFP_KERNEL);
- if (!root->name)
- return -ENOMEM;
+ read_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+ return val;
+}
- memcpy(root->name, name, len);
- root->name[len] = '\0';
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index, u64 val)
+{
+ write_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+}
- return 0;
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+ version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
+ rescan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+ rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+ excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+ struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+ rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+ struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+ excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+ struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+ max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+ max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+ rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+ rsv_excl, 64);
+
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+ replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+ time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+ time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+ num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+ cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+ cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item,
+ cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+ struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+ struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+ struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+ struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item,
+ num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+ struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+ struct btrfs_dev_replace_item, cursor_right, 64);
+
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
+{
+ return sb->s_fs_info;
}
static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
@@ -2083,39 +3248,63 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
((unsigned long)(btrfs_leaf_data(leaf) + \
btrfs_item_offset_nr(leaf, slot)))
-static inline struct dentry *fdentry(struct file *file)
-{
- return file->f_path.dentry;
-}
-
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+ return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
/* extent-tree.c */
+static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
+ unsigned num_items)
+{
+ return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ 2 * num_items;
+}
+
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+ unsigned num_items)
+{
+ return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ num_items;
+}
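Both helpers above charge one full tree path per item: one leaf plus one node for each of the remaining BTRFS_MAX_LEVEL - 1 levels, with the transaction variant doubling that. A worked example, assuming 16 KiB leaves and nodes and BTRFS_MAX_LEVEL == 8:

/*
 * per-item path:  16384 + 16384 * (8 - 1) = 131072 bytes (128 KiB)
 *
 * btrfs_calc_trans_metadata_size(root, 1) -> 131072 * 2 = 262144 (256 KiB)
 * btrfs_calc_trunc_metadata_size(root, 1) -> 131072            (128 KiB)
 */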
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags);
+ u64 offset, int metadata, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+ struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
u64 bytenr);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
-u64 btrfs_find_block_group(struct btrfs_root *root,
- u64 search_start, u64 search_hint, int owner);
+int get_block_group_index(struct btrfs_block_group_cache *cache);
struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
@@ -2125,10 +3314,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
u64 parent, int last_ref);
-struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- int level);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
@@ -2137,34 +3322,34 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 min_alloc_size,
- u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data);
+int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+ u64 min_alloc_size, u64 empty_size, u64 hint_byte,
+ struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
- int is_data);
+ int level, int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset);
-
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int no_quota);
+
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+ int delalloc);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len);
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset);
+ u64 root_objectid, u64 owner, u64 offset, int no_quota);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -2178,59 +3363,82 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
-u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+	/* If we are in the transaction, we can't flush anything. */
+ BTRFS_RESERVE_NO_FLUSH,
+ /*
+	 * Flushing delalloc may cause a deadlock somewhere; in that
+	 * case, use FLUSH_LIMIT
+ */
+ BTRFS_RESERVE_FLUSH_LIMIT,
+ BTRFS_RESERVE_FLUSH_ALL,
+};
+
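The flush levels above are consumed by the block-reserve helpers declared below; btrfs_block_rsv_add() now takes the flush mode as its last argument. A hedged sketch of picking the level from context (example_reserve is illustrative only):

/* Sketch: reserve metadata space, flushing only as hard as the context allows. */
static int example_reserve(struct btrfs_root *root,
			   struct btrfs_block_rsv *rsv,
			   u64 num_bytes, bool in_transaction)
{
	enum btrfs_reserve_flush_enum flush;

	/* inside a running transaction we must not flush at all */
	flush = in_transaction ? BTRFS_RESERVE_NO_FLUSH : BTRFS_RESERVE_FLUSH_ALL;

	return btrfs_block_rsv_add(root, rsv, num_bytes, flush);
}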
int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
-int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- int num_items);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int nitems,
+ u64 *qgroup_reserved, bool use_global_rsv);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 qgroup_reserved);
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes);
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int min_factor);
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_check(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor);
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
-int btrfs_set_block_group_rw(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache);
+void btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_root *root,
u64 start, u64 end);
int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes);
+ u64 num_bytes, u64 *actual_bytes);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 type);
-
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2238,18 +3446,34 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *new_key);
+int btrfs_previous_extent_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid);
+void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
- int cache_only, u64 min_trans);
+ u64 min_trans);
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
- struct btrfs_key *max_key,
- struct btrfs_path *path, int cache_only,
+ struct btrfs_path *path,
u64 min_trans);
+enum btrfs_compare_tree_result {
+ BTRFS_COMPARE_TREE_NEW,
+ BTRFS_COMPARE_TREE_DELETED,
+ BTRFS_COMPARE_TREE_CHANGED,
+ BTRFS_COMPARE_TREE_SAME,
+};
+typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ struct btrfs_key *key,
+ enum btrfs_compare_tree_result result,
+ void *ctx);
+int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ btrfs_changed_cb_t cb, void *ctx);
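btrfs_compare_trees() walks two trees in lockstep and calls the btrfs_changed_cb_t callback once per compared key. A minimal hedged sketch of a callback that merely counts differing keys; the unsigned long counter passed via ctx is an illustration, not an existing API:

/* Sketch: count how many keys differ between two roots. */
static int example_changed_cb(struct btrfs_root *left_root,
			      struct btrfs_root *right_root,
			      struct btrfs_path *left_path,
			      struct btrfs_path *right_path,
			      struct btrfs_key *key,
			      enum btrfs_compare_tree_result result,
			      void *ctx)
{
	unsigned long *nr_changes = ctx;

	if (result != BTRFS_COMPARE_TREE_SAME)
		(*nr_changes)++;
	return 0;	/* returning non-zero would abort the walk */
}

/* usage: err = btrfs_compare_trees(left_root, right_root, example_changed_cb, &count); */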
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
@@ -2260,12 +3484,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 new_root_objectid);
int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf);
-int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, u32 data_size);
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size);
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
+ u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2275,17 +3497,26 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *new_key);
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key);
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_path *p, int
ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int find_higher, int return_any);
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, int cache_only, u64 *last_ret,
+ int start_slot, u64 *last_ret,
struct btrfs_key *progress);
-void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
+void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p,
+ struct extent_buffer *held, int held_rw);
void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2297,13 +3528,11 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, void *data, u32 data_size);
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2320,13 +3549,71 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq);
+static inline int btrfs_next_old_item(struct btrfs_root *root,
+ struct btrfs_path *p, u64 time_seq)
+{
+ ++p->slots[0];
+ if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+ return btrfs_next_old_leaf(root, p, time_seq);
+ return 0;
+}
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+ return btrfs_next_old_item(root, p, 0);
+}
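btrfs_next_item() bumps path->slots[0] and transparently hops to the next leaf when the current one runs out, which is the usual way to walk a key range. A hedged sketch of the search-then-iterate pattern, with error handling trimmed; btrfs_item_key_to_cpu() is assumed from elsewhere in this header:

/* Sketch: visit every item sharing the objectid of first_key. */
static int example_iterate(struct btrfs_root *root, struct btrfs_key *first_key)
{
	struct btrfs_path *path;
	struct btrfs_key found;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, first_key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
		ret = btrfs_next_leaf(root, path);	/* key lies past this leaf */
	else
		ret = 0;

	while (ret == 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
		if (found.objectid != first_key->objectid)
			break;			/* walked past the range of interest */
		/* ... process the item at path->slots[0] here ... */
		ret = btrfs_next_item(root, path);	/* > 0 means end of tree */
	}
out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}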
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ int update_ref, int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * Get synced with close_ctree()
+ */
+ smp_mb();
+ return fs_info->closing;
+}
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+ return (root->fs_info->sb->s_flags & MS_RDONLY ||
+ btrfs_fs_closing(root->fs_info));
+}
+
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+ kfree(fs_info->balance_ctl);
+ kfree(fs_info->delayed_root);
+ kfree(fs_info->extent_root);
+ kfree(fs_info->tree_root);
+ kfree(fs_info->chunk_root);
+ kfree(fs_info->dev_root);
+ kfree(fs_info->csum_root);
+ kfree(fs_info->quota_root);
+ kfree(fs_info->uuid_root);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kfree(fs_info);
+}
+
+/* tree mod log functions from ctree.c */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
struct btrfs_path *path,
@@ -2344,21 +3631,37 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_root_item
*item);
-int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_key *key, struct btrfs_root_item
- *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
- btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_search_root(struct btrfs_root *root, u64 search_start,
- u64 *found_objectid);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_root_item *item);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key);
int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
-int btrfs_set_root_node(struct btrfs_root_item *item,
- struct extent_buffer *node);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node);
+void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+
+/* uuid-tree.c */
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
+ struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
+ int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
+ u64));
+
/* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const char *name, int name_len);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *name,
- int name_len, u64 dir,
+ int name_len, struct inode *dir,
struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2375,9 +3678,6 @@ struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len);
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2392,6 +3692,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod);
+int verify_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_dir_item *dir_item);
/* orphan.c */
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -2400,12 +3703,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
-/* inode-map.c */
-int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
- struct btrfs_root *fs_root,
- u64 dirid, u64 *objectid);
-int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
-
/* inode-item.c */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2415,12 +3712,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index);
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int mod);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -2428,13 +3719,28 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+ u64 ref_objectid, const char *name,
+ int name_len,
+ struct btrfs_inode_extref **extref_ret);
+
/* file-item.c */
+struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u64 logical_offset, u32 *dst);
+ struct btrfs_dio_private *dip, struct bio *bio,
+ u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2450,18 +3756,34 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum *sums);
int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig);
-int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
- u64 start, unsigned long len);
-struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, int cow);
-int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- u64 isize);
-int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
- u64 end, struct list_head *list);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit);
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em);
+
/* inode.c */
+struct btrfs_delalloc_work {
+ struct inode *inode;
+ int wait;
+ int delay_iput;
+ struct completion completion;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+ int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create);
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes);
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -2470,6 +3792,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
#define PageChecked PageFsMisc
#endif
+/* This forces readahead on a given range of bytes in an inode */
+static inline void btrfs_force_ra(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *file,
+ pgoff_t offset, unsigned long req_size)
+{
+ page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+}
+
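For illustration only, a minimal sketch of how the readahead helper above might be driven; the caller, file and page range here are assumptions, not part of the patch:

	/* Hypothetical caller: warm the page cache before scanning file data. */
	static void example_warm_cache(struct file *file, pgoff_t index,
				       unsigned long nr_pages)
	{
		btrfs_force_ra(file->f_mapping, &file->f_ra, file, index, nr_pages);
	}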
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -2483,33 +3813,29 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 new_size,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
- int sync);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
-int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- u64 new_dirid, u64 alloc_hint);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio, unsigned long bio_flags);
-
-unsigned long btrfs_force_ra(struct address_space *mapping,
- struct file_ra_state *ra, struct file *file,
- pgoff_t offset, pgoff_t last_index);
+ struct btrfs_root *parent_root,
+ u64 new_dirid);
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags);
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
-void btrfs_put_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
@@ -2518,26 +3844,20 @@ void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
-int btrfs_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to);
struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
- size_t page_offset, u64 start, u64 end,
+ size_t pg_offset, u64 start, u64 end,
int create);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
-int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
-void btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
- u64 *bytes_to_reserve);
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending);
+int btrfs_orphan_cleanup(struct btrfs_root *root);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_cont_expand(struct inode *inode, loff_t size);
-int btrfs_invalidate_inodes(struct btrfs_root *root);
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
+void btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_root *root);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -2553,30 +3873,54 @@ extern const struct dentry_operations btrfs_dentry_operations;
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
void btrfs_update_iflags(struct inode *inode);
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
+int btrfs_is_empty_uuid(u8 *uuid);
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space);
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs);
+
/* file.c */
-int btrfs_sync_file(struct file *file, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
- int skip_pinned);
-int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned);
extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
- u64 start, u64 end, u64 *hint_byte, int drop_cache);
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache,
+ int replace_extent,
+ u32 extent_item_size,
+ int *key_inserted);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+ struct page **pages, size_t num_pages,
+ loff_t pos, size_t write_bytes,
+ struct extent_state **cached);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int cache_only);
+ struct btrfs_root *root);
/* sysfs.c */
int btrfs_init_sysfs(void);
void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
-int btrfs_sysfs_add_root(struct btrfs_root *root);
-void btrfs_sysfs_del_root(struct btrfs_root *root);
-void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
+int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
/* xattr.c */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -2584,24 +3928,152 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
+
+#ifdef CONFIG_PRINTK
+__printf(2, 3)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+#else
+static inline __printf(2, 3)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+}
+#endif
+
+#define btrfs_emerg(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+
+#ifdef DEBUG
+#define btrfs_debug(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#else
+#define btrfs_debug(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#endif
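A hedged sketch of how these fs_info-aware logging wrappers are meant to be called; the messages and the ret variable are illustrative only:

	/* Hypothetical call sites for the logging helpers above. */
	btrfs_info(fs_info, "disk space caching is enabled");
	btrfs_warn(fs_info, "failed to reclaim space: %d", ret);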
+
+#ifdef CONFIG_BTRFS_ASSERT
+
+static inline void assfail(char *expr, char *file, int line)
+{
+ pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
+ expr, file, line);
+ BUG();
+}
+
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+#else
+#define ASSERT(expr) ((void)0)
+#endif
+
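As a non-authoritative example of the ASSERT helper above (the invariant shown is hypothetical):

	/* Hypothetical invariant; compiles away when CONFIG_BTRFS_ASSERT is off. */
	ASSERT(num_bytes >= root->sectorsize);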
+#define btrfs_assert()
+__printf(5, 6)
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno);
+ unsigned int line, int errno, const char *fmt, ...);
+
+
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *function,
+ unsigned int line, int errno);
+
+#define btrfs_set_fs_incompat(__fs_info, opt) \
+ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info, "setting %llu feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_fs_incompat(fs_info, opt) \
+ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_incompat_flags(disk_super) & flag);
+}
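A minimal sketch of how the incompat-flag helpers above might be used together; EXTENDED_IREF is only an example feature name, and the surrounding caller is assumed:

	/* Hypothetical: record that an extended inode ref has been created. */
	btrfs_set_fs_incompat(fs_info, EXTENDED_IREF);

	/* Readers can then test the flag cheaply. */
	use_extrefs = btrfs_fs_incompat(fs_info, EXTENDED_IREF);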
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
+#define btrfs_abort_transaction(trans, root, errno) \
+do { \
+ __btrfs_abort_transaction(trans, root, __func__, \
+ __LINE__, errno); \
+} while (0)
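A hedged sketch of the call pattern the comment above describes; the btrfs_update_inode call is only an example of a failure site:

	/* Hypothetical caller: abort right where the failure is detected. */
	ret = btrfs_update_inode(trans, root, inode);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}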
#define btrfs_std_error(fs_info, errno) \
do { \
if ((errno)) \
- __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+ __btrfs_std_error((fs_info), __func__, \
+ __LINE__, (errno), NULL); \
+} while (0)
+
+#define btrfs_error(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_std_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
+} while (0)
+
+__printf(5, 6)
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ BUG(); \
} while (0)
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
-#else
-#define btrfs_check_acl NULL
-#endif
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
+int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir);
-int btrfs_acl_chmod(struct inode *inode);
+#else
+#define btrfs_get_acl NULL
+#define btrfs_set_acl NULL
+static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
+{
+ return 0;
+}
+#endif
/* relocation.c */
int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
@@ -2611,12 +4083,66 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_root *root);
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
-void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
- struct extent_buffer *cow);
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow);
void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
-void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
+
+/* scrub.c */
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace);
+void btrfs_scrub_pause(struct btrfs_root *root);
+void btrfs_scrub_continue(struct btrfs_root *root);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+ struct btrfs_device *dev);
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+ struct btrfs_scrub_progress *progress);
+
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+
+/* reada.c */
+struct reada_control {
+ struct btrfs_root *root; /* tree to prefetch */
+ struct btrfs_key key_start;
+ struct btrfs_key key_end; /* exclusive */
+ atomic_t elems;
+ struct kref refcnt;
+ wait_queue_head_t wait;
+};
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+ struct btrfs_key *start, struct btrfs_key *end);
+int btrfs_reada_wait(void *handle);
+void btrfs_reada_detach(void *handle);
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err);
+
+static inline int is_fstree(u64 rootid)
+{
+ if (rootid == BTRFS_FS_TREE_OBJECTID ||
+ (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+ return 0;
+}
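For illustration, a hedged example of using is_fstree(); the qgroup-style caller around it is an assumption:

	/* Hypothetical: skip accounting for roots that are not filesystem trees. */
	if (!is_fstree(root->root_key.objectid))
		return 0;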
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+ return signal_pending(current);
+}
+
+/* Sanity test specific functions */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode);
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
+#endif
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
new file mode 100644
index 00000000000..da775bfdebc
--- /dev/null
+++ b/fs/btrfs/delayed-inode.c
@@ -0,0 +1,1980 @@
+/*
+ * Copyright (C) 2011 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "delayed-inode.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "ctree.h"
+
+#define BTRFS_DELAYED_WRITEBACK 512
+#define BTRFS_DELAYED_BACKGROUND 128
+#define BTRFS_DELAYED_BATCH 16
+
+static struct kmem_cache *delayed_node_cache;
+
+int __init btrfs_delayed_inode_init(void)
+{
+ delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
+ sizeof(struct btrfs_delayed_node),
+ 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!delayed_node_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_delayed_inode_exit(void)
+{
+ if (delayed_node_cache)
+ kmem_cache_destroy(delayed_node_cache);
+}
+
+static inline void btrfs_init_delayed_node(
+ struct btrfs_delayed_node *delayed_node,
+ struct btrfs_root *root, u64 inode_id)
+{
+ delayed_node->root = root;
+ delayed_node->inode_id = inode_id;
+ atomic_set(&delayed_node->refs, 0);
+ delayed_node->count = 0;
+ delayed_node->flags = 0;
+ delayed_node->ins_root = RB_ROOT;
+ delayed_node->del_root = RB_ROOT;
+ mutex_init(&delayed_node->mutex);
+ delayed_node->index_cnt = 0;
+ INIT_LIST_HEAD(&delayed_node->n_list);
+ INIT_LIST_HEAD(&delayed_node->p_list);
+ delayed_node->bytes_reserved = 0;
+ memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
+}
+
+static inline int btrfs_is_continuous_delayed_item(
+ struct btrfs_delayed_item *item1,
+ struct btrfs_delayed_item *item2)
+{
+ if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
+ item1->key.objectid == item2->key.objectid &&
+ item1->key.type == item2->key.type &&
+ item1->key.offset + 1 == item2->key.offset)
+ return 1;
+ return 0;
+}
+
+static inline struct btrfs_delayed_root *btrfs_get_delayed_root(
+ struct btrfs_root *root)
+{
+ return root->fs_info->delayed_root;
+}
+
+static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
+{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+ struct btrfs_root *root = btrfs_inode->root;
+ u64 ino = btrfs_ino(inode);
+ struct btrfs_delayed_node *node;
+
+ node = ACCESS_ONCE(btrfs_inode->delayed_node);
+ if (node) {
+ atomic_inc(&node->refs);
+ return node;
+ }
+
+ spin_lock(&root->inode_lock);
+ node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ if (node) {
+ if (btrfs_inode->delayed_node) {
+ atomic_inc(&node->refs); /* can be accessed */
+ BUG_ON(btrfs_inode->delayed_node != node);
+ spin_unlock(&root->inode_lock);
+ return node;
+ }
+ btrfs_inode->delayed_node = node;
+ /* can be accessed and cached in the inode */
+ atomic_add(2, &node->refs);
+ spin_unlock(&root->inode_lock);
+ return node;
+ }
+ spin_unlock(&root->inode_lock);
+
+ return NULL;
+}
+
+/* Will return either the node or ERR_PTR(-ENOMEM) */
+static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
+ struct inode *inode)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+ struct btrfs_root *root = btrfs_inode->root;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+again:
+ node = btrfs_get_delayed_node(inode);
+ if (node)
+ return node;
+
+ node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
+
+ /* cached in the btrfs inode and can be accessed */
+ atomic_add(2, &node->refs);
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret) {
+ kmem_cache_free(delayed_node_cache, node);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&root->inode_lock);
+ ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+ if (ret == -EEXIST) {
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
+ radix_tree_preload_end();
+ goto again;
+ }
+ btrfs_inode->delayed_node = node;
+ spin_unlock(&root->inode_lock);
+ radix_tree_preload_end();
+
+ return node;
+}
+
+/*
+ * Call it when holding delayed_node->mutex
+ *
+ * If mod = 1, add this node into the prepared list.
+ */
+static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
+ struct btrfs_delayed_node *node,
+ int mod)
+{
+ spin_lock(&root->lock);
+ if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ if (!list_empty(&node->p_list))
+ list_move_tail(&node->p_list, &root->prepare_list);
+ else if (mod)
+ list_add_tail(&node->p_list, &root->prepare_list);
+ } else {
+ list_add_tail(&node->n_list, &root->node_list);
+ list_add_tail(&node->p_list, &root->prepare_list);
+ atomic_inc(&node->refs); /* inserted into list */
+ root->nodes++;
+ set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
+ }
+ spin_unlock(&root->lock);
+}
+
+/* Call it when holding delayed_node->mutex */
+static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
+ struct btrfs_delayed_node *node)
+{
+ spin_lock(&root->lock);
+ if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ root->nodes--;
+ atomic_dec(&node->refs); /* not in the list */
+ list_del_init(&node->n_list);
+ if (!list_empty(&node->p_list))
+ list_del_init(&node->p_list);
+ clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
+ }
+ spin_unlock(&root->lock);
+}
+
+static struct btrfs_delayed_node *btrfs_first_delayed_node(
+ struct btrfs_delayed_root *delayed_root)
+{
+ struct list_head *p;
+ struct btrfs_delayed_node *node = NULL;
+
+ spin_lock(&delayed_root->lock);
+ if (list_empty(&delayed_root->node_list))
+ goto out;
+
+ p = delayed_root->node_list.next;
+ node = list_entry(p, struct btrfs_delayed_node, n_list);
+ atomic_inc(&node->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return node;
+}
+
+static struct btrfs_delayed_node *btrfs_next_delayed_node(
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct list_head *p;
+ struct btrfs_delayed_node *next = NULL;
+
+ delayed_root = node->root->fs_info->delayed_root;
+ spin_lock(&delayed_root->lock);
+ if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ /* not in the list */
+ if (list_empty(&delayed_root->node_list))
+ goto out;
+ p = delayed_root->node_list.next;
+ } else if (list_is_last(&node->n_list, &delayed_root->node_list))
+ goto out;
+ else
+ p = node->n_list.next;
+
+ next = list_entry(p, struct btrfs_delayed_node, n_list);
+ atomic_inc(&next->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return next;
+}
+
+static void __btrfs_release_delayed_node(
+ struct btrfs_delayed_node *delayed_node,
+ int mod)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ if (!delayed_node)
+ return;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+
+ mutex_lock(&delayed_node->mutex);
+ if (delayed_node->count)
+ btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
+ else
+ btrfs_dequeue_delayed_node(delayed_root, delayed_node);
+ mutex_unlock(&delayed_node->mutex);
+
+ if (atomic_dec_and_test(&delayed_node->refs)) {
+ bool free = false;
+ struct btrfs_root *root = delayed_node->root;
+ spin_lock(&root->inode_lock);
+ if (atomic_read(&delayed_node->refs) == 0) {
+ radix_tree_delete(&root->delayed_nodes_tree,
+ delayed_node->inode_id);
+ free = true;
+ }
+ spin_unlock(&root->inode_lock);
+ if (free)
+ kmem_cache_free(delayed_node_cache, delayed_node);
+ }
+}
+
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+{
+ __btrfs_release_delayed_node(node, 0);
+}
+
+static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
+ struct btrfs_delayed_root *delayed_root)
+{
+ struct list_head *p;
+ struct btrfs_delayed_node *node = NULL;
+
+ spin_lock(&delayed_root->lock);
+ if (list_empty(&delayed_root->prepare_list))
+ goto out;
+
+ p = delayed_root->prepare_list.next;
+ list_del_init(p);
+ node = list_entry(p, struct btrfs_delayed_node, p_list);
+ atomic_inc(&node->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return node;
+}
+
+static inline void btrfs_release_prepared_delayed_node(
+ struct btrfs_delayed_node *node)
+{
+ __btrfs_release_delayed_node(node, 1);
+}
+
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+{
+ struct btrfs_delayed_item *item;
+ item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
+ if (item) {
+ item->data_len = data_len;
+ item->ins_or_del = 0;
+ item->bytes_reserved = 0;
+ item->delayed_node = NULL;
+ atomic_set(&item->refs, 1);
+ }
+ return item;
+}
+
+/*
+ * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * @root: the rb-tree root that holds the delayed items
+ * @key: the key to look up
+ * @prev: used to store the prev item if the right item isn't found
+ * @next: used to store the next item if the right item isn't found
+ *
+ * Note: if the exact item isn't found, NULL is returned and the previous and
+ * next items are stored in @prev and @next.
+ */
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
+ struct rb_root *root,
+ struct btrfs_key *key,
+ struct btrfs_delayed_item **prev,
+ struct btrfs_delayed_item **next)
+{
+ struct rb_node *node, *prev_node = NULL;
+ struct btrfs_delayed_item *delayed_item = NULL;
+ int ret = 0;
+
+ node = root->rb_node;
+
+ while (node) {
+ delayed_item = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ prev_node = node;
+ ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
+ if (ret < 0)
+ node = node->rb_right;
+ else if (ret > 0)
+ node = node->rb_left;
+ else
+ return delayed_item;
+ }
+
+ if (prev) {
+ if (!prev_node)
+ *prev = NULL;
+ else if (ret < 0)
+ *prev = delayed_item;
+ else if ((node = rb_prev(prev_node)) != NULL) {
+ *prev = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ } else
+ *prev = NULL;
+ }
+
+ if (next) {
+ if (!prev_node)
+ *next = NULL;
+ else if (ret > 0)
+ *next = delayed_item;
+ else if ((node = rb_next(prev_node)) != NULL) {
+ *next = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ } else
+ *next = NULL;
+ }
+ return NULL;
+}
+
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
+ struct btrfs_delayed_node *delayed_node,
+ struct btrfs_key *key)
+{
+ struct btrfs_delayed_item *item;
+
+ item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+ NULL, NULL);
+ return item;
+}
+
+static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
+ struct btrfs_delayed_item *ins,
+ int action)
+{
+ struct rb_node **p, *node;
+ struct rb_node *parent_node = NULL;
+ struct rb_root *root;
+ struct btrfs_delayed_item *item;
+ int cmp;
+
+ if (action == BTRFS_DELAYED_INSERTION_ITEM)
+ root = &delayed_node->ins_root;
+ else if (action == BTRFS_DELAYED_DELETION_ITEM)
+ root = &delayed_node->del_root;
+ else
+ BUG();
+ p = &root->rb_node;
+ node = &ins->rb_node;
+
+ while (*p) {
+ parent_node = *p;
+ item = rb_entry(parent_node, struct btrfs_delayed_item,
+ rb_node);
+
+ cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
+ if (cmp < 0)
+ p = &(*p)->rb_right;
+ else if (cmp > 0)
+ p = &(*p)->rb_left;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ ins->delayed_node = delayed_node;
+ ins->ins_or_del = action;
+
+ if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
+ action == BTRFS_DELAYED_INSERTION_ITEM &&
+ ins->key.offset >= delayed_node->index_cnt)
+ delayed_node->index_cnt = ins->key.offset + 1;
+
+ delayed_node->count++;
+ atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
+ return 0;
+}
+
+static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
+ struct btrfs_delayed_item *item)
+{
+ return __btrfs_add_delayed_item(node, item,
+ BTRFS_DELAYED_INSERTION_ITEM);
+}
+
+static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
+ struct btrfs_delayed_item *item)
+{
+ return __btrfs_add_delayed_item(node, item,
+ BTRFS_DELAYED_DELETION_ITEM);
+}
+
+static void finish_one_item(struct btrfs_delayed_root *delayed_root)
+{
+ int seq = atomic_inc_return(&delayed_root->items_seq);
+ if ((atomic_dec_return(&delayed_root->items) <
+ BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
+ waitqueue_active(&delayed_root->wait))
+ wake_up(&delayed_root->wait);
+}
+
+static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
+{
+ struct rb_root *root;
+ struct btrfs_delayed_root *delayed_root;
+
+ delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
+
+ BUG_ON(!delayed_root);
+ BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
+ delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
+
+ if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ root = &delayed_item->delayed_node->ins_root;
+ else
+ root = &delayed_item->delayed_node->del_root;
+
+ rb_erase(&delayed_item->rb_node, root);
+ delayed_item->delayed_node->count--;
+
+ finish_one_item(delayed_root);
+}
+
+static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
+{
+ if (item) {
+ __btrfs_remove_delayed_item(item);
+ if (atomic_dec_and_test(&item->refs))
+ kfree(item);
+ }
+}
+
+static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
+ struct btrfs_delayed_node *delayed_node)
+{
+ struct rb_node *p;
+ struct btrfs_delayed_item *item = NULL;
+
+ p = rb_first(&delayed_node->ins_root);
+ if (p)
+ item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+ return item;
+}
+
+static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
+ struct btrfs_delayed_node *delayed_node)
+{
+ struct rb_node *p;
+ struct btrfs_delayed_item *item = NULL;
+
+ p = rb_first(&delayed_node->del_root);
+ if (p)
+ item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+ return item;
+}
+
+static struct btrfs_delayed_item *__btrfs_next_delayed_item(
+ struct btrfs_delayed_item *item)
+{
+ struct rb_node *p;
+ struct btrfs_delayed_item *next = NULL;
+
+ p = rb_next(&item->rb_node);
+ if (p)
+ next = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+ return next;
+}
+
+static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_block_rsv *src_rsv;
+ struct btrfs_block_rsv *dst_rsv;
+ u64 num_bytes;
+ int ret;
+
+ if (!trans->bytes_reserved)
+ return 0;
+
+ src_rsv = trans->block_rsv;
+ dst_rsv = &root->fs_info->delayed_block_rsv;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid,
+ num_bytes, 1);
+ item->bytes_reserved = num_bytes;
+ }
+
+ return ret;
+}
+
+static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_block_rsv *rsv;
+
+ if (!item->bytes_reserved)
+ return;
+
+ rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid, item->bytes_reserved,
+ 0);
+ btrfs_block_rsv_release(root, rsv,
+ item->bytes_reserved);
+}
+
+static int btrfs_delayed_inode_reserve_metadata(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_block_rsv *src_rsv;
+ struct btrfs_block_rsv *dst_rsv;
+ u64 num_bytes;
+ int ret;
+ bool release = false;
+
+ src_rsv = trans->block_rsv;
+ dst_rsv = &root->fs_info->delayed_block_rsv;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+ /*
+ * btrfs_dirty_inode will update the inode under btrfs_join_transaction
+ * which doesn't reserve space for speed. This is a problem since we
+ * still need to reserve space for this update, so try to reserve the
+ * space.
+ *
+ * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
+ * we're accounted for.
+ */
+ if (!src_rsv || (!trans->bytes_reserved &&
+ src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
+ /*
+ * Since we're under a transaction reserve_metadata_bytes could
+ * try to commit the transaction which will make it return
+ * EAGAIN to make us stop the transaction we have, so return
+ * ENOSPC instead so that btrfs_dirty_inode knows what to do.
+ */
+ if (ret == -EAGAIN)
+ ret = -ENOSPC;
+ if (!ret) {
+ node->bytes_reserved = num_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "delayed_inode",
+ btrfs_ino(inode),
+ num_bytes, 1);
+ }
+ return ret;
+ } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
+ spin_unlock(&BTRFS_I(inode)->lock);
+ release = true;
+ goto migrate;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ /* Ok we didn't have space pre-reserved. This shouldn't happen
+ * too often but it can happen if we do delalloc to an existing
+ * inode which gets dirtied because of the time update, and then
+ * isn't touched again until after the transaction commits and
+ * then we try to write out the data. First try to be nice and
+ * reserve something strictly for us. If not be a pain and try
+ * to steal from the delalloc block rsv.
+ */
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ goto out;
+
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ if (!WARN_ON(ret))
+ goto out;
+
+ /*
+ * Ok this is a problem, let's just steal from the global rsv
+ * since this really shouldn't happen that often.
+ */
+ ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+ dst_rsv, num_bytes);
+ goto out;
+ }
+
+migrate:
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+
+out:
+ /*
+ * Migrate only takes a reservation, it doesn't touch the size of the
+ * block_rsv. This is to simplify people who don't normally have things
+ * migrated from their block rsv. If they go to release their
+ * reservation, that will decrease the size as well, so if migrate
+ * reduced size we'd end up with a negative size. But for the
+ * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+ * but we could in fact do this reserve/migrate dance several times
+ * between the time we did the original reservation and we'd clean it
+ * up. So to take care of this, release the space for the meta
+ * reservation here. I think it may be time for a documentation page on
+	 * how block rsvs work.
+ */
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ btrfs_ino(inode), num_bytes, 1);
+ node->bytes_reserved = num_bytes;
+ }
+
+ if (release) {
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), num_bytes, 0);
+ btrfs_block_rsv_release(root, src_rsv, num_bytes);
+ }
+
+ return ret;
+}
+
+static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_block_rsv *rsv;
+
+ if (!node->bytes_reserved)
+ return;
+
+ rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ node->inode_id, node->bytes_reserved, 0);
+ btrfs_block_rsv_release(root, rsv,
+ node->bytes_reserved);
+ node->bytes_reserved = 0;
+}
+
+/*
+ * This helper will insert some continuous items into the same leaf according
+ * to the free space of the leaf.
+ */
+static int btrfs_batch_insert_items(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_delayed_item *curr, *next;
+ int free_space;
+ int total_data_size = 0, total_size = 0;
+ struct extent_buffer *leaf;
+ char *data_ptr;
+ struct btrfs_key *keys;
+ u32 *data_size;
+ struct list_head head;
+ int slot;
+ int nitems;
+ int i;
+ int ret = 0;
+
+ BUG_ON(!path->nodes[0]);
+
+ leaf = path->nodes[0];
+ free_space = btrfs_leaf_free_space(root, leaf);
+ INIT_LIST_HEAD(&head);
+
+ next = item;
+ nitems = 0;
+
+ /*
+ * count the number of the continuous items that we can insert in batch
+ */
+ while (total_size + next->data_len + sizeof(struct btrfs_item) <=
+ free_space) {
+ total_data_size += next->data_len;
+ total_size += next->data_len + sizeof(struct btrfs_item);
+ list_add_tail(&next->tree_list, &head);
+ nitems++;
+
+ curr = next;
+ next = __btrfs_next_delayed_item(curr);
+ if (!next)
+ break;
+
+ if (!btrfs_is_continuous_delayed_item(curr, next))
+ break;
+ }
+
+ if (!nitems) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+	 * we need to allocate some memory space, but it might cause the task
+ * to sleep, so we set all locked nodes in the path to blocking locks
+ * first.
+ */
+ btrfs_set_path_blocking(path);
+
+ keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
+ if (!keys) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
+ if (!data_size) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* get keys of all the delayed items */
+ i = 0;
+ list_for_each_entry(next, &head, tree_list) {
+ keys[i] = next->key;
+ data_size[i] = next->data_len;
+ i++;
+ }
+
+	/* reset all the locked nodes in the path to spinning locks. */
+ btrfs_clear_path_blocking(path, NULL, 0);
+
+ /* insert the keys of the items */
+ setup_items_for_insert(root, path, keys, data_size,
+ total_data_size, total_size, nitems);
+
+ /* insert the dir index items */
+ slot = path->slots[0];
+ list_for_each_entry_safe(curr, next, &head, tree_list) {
+ data_ptr = btrfs_item_ptr(leaf, slot, char);
+ write_extent_buffer(leaf, &curr->data,
+ (unsigned long)data_ptr,
+ curr->data_len);
+ slot++;
+
+ btrfs_delayed_item_release_metadata(root, curr);
+
+ list_del(&curr->tree_list);
+ btrfs_release_delayed_item(curr);
+ }
+
+error:
+ kfree(data_size);
+ kfree(keys);
+out:
+ return ret;
+}
+
+/*
+ * This helper handles simple insertions that don't need to extend an existing
+ * item with new data, such as directory name index and inode item insertions.
+ */
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *delayed_item)
+{
+ struct extent_buffer *leaf;
+ char *ptr;
+ int ret;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
+ delayed_item->data_len);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+
+ leaf = path->nodes[0];
+
+ ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+
+ write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
+ delayed_item->data_len);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_delayed_item_release_metadata(root, delayed_item);
+ return 0;
+}
+
+/*
+ * we insert an item first, then if there are some continuous items, we try
+ * to insert those items into the same leaf.
+ */
+static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_item *curr, *prev;
+ int ret = 0;
+
+do_again:
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_insertion_item(node);
+ if (!curr)
+ goto insert_end;
+
+ ret = btrfs_insert_delayed_item(trans, root, path, curr);
+ if (ret < 0) {
+ btrfs_release_path(path);
+ goto insert_end;
+ }
+
+ prev = curr;
+ curr = __btrfs_next_delayed_item(prev);
+ if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
+ /* insert the continuous items into the same leaf */
+ path->slots[0]++;
+ btrfs_batch_insert_items(root, path, curr);
+ }
+ btrfs_release_delayed_item(prev);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ goto do_again;
+
+insert_end:
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_delayed_item *curr, *next;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct list_head head;
+ int nitems, i, last_item;
+ int ret = 0;
+
+ BUG_ON(!path->nodes[0]);
+
+ leaf = path->nodes[0];
+
+ i = path->slots[0];
+ last_item = btrfs_header_nritems(leaf) - 1;
+ if (i > last_item)
+ return -ENOENT; /* FIXME: Is errno suitable? */
+
+ next = item;
+ INIT_LIST_HEAD(&head);
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ nitems = 0;
+ /*
+ * count the number of the dir index items that we can delete in batch
+ */
+ while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
+ list_add_tail(&next->tree_list, &head);
+ nitems++;
+
+ curr = next;
+ next = __btrfs_next_delayed_item(curr);
+ if (!next)
+ break;
+
+ if (!btrfs_is_continuous_delayed_item(curr, next))
+ break;
+
+ i++;
+ if (i > last_item)
+ break;
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ }
+
+ if (!nitems)
+ return 0;
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
+ if (ret)
+ goto out;
+
+ list_for_each_entry_safe(curr, next, &head, tree_list) {
+ btrfs_delayed_item_release_metadata(root, curr);
+ list_del(&curr->tree_list);
+ btrfs_release_delayed_item(curr);
+ }
+
+out:
+ return ret;
+}
+
+static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_item *curr, *prev;
+ int ret = 0;
+
+do_again:
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_deletion_item(node);
+ if (!curr)
+ goto delete_fail;
+
+ ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
+ if (ret < 0)
+ goto delete_fail;
+ else if (ret > 0) {
+ /*
+ * can't find the item which the node points to, so this node
+ * is invalid, just drop it.
+ */
+ prev = curr;
+ curr = __btrfs_next_delayed_item(prev);
+ btrfs_release_delayed_item(prev);
+ ret = 0;
+ btrfs_release_path(path);
+ if (curr) {
+ mutex_unlock(&node->mutex);
+ goto do_again;
+ } else
+ goto delete_fail;
+ }
+
+ btrfs_batch_delete_items(trans, root, path, curr);
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ goto do_again;
+
+delete_fail:
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ if (delayed_node &&
+ test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ BUG_ON(!delayed_node->root);
+ clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+ }
+}
+
+static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ ASSERT(delayed_node->root);
+ clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+}
+
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_key key;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ int mod;
+ int ret;
+
+ key.objectid = node->inode_id;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+ mod = -1;
+ else
+ mod = 1;
+
+ ret = btrfs_lookup_inode(trans, root, path, &key, mod);
+ if (ret > 0) {
+ btrfs_release_path(path);
+ return -ENOENT;
+ } else if (ret < 0) {
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+ write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
+ sizeof(struct btrfs_inode_item));
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+ goto no_iref;
+
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(leaf))
+ goto search;
+again:
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != node->inode_id)
+ goto out;
+
+ if (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ goto out;
+
+ /*
+	 * Delayed iref deletion is for an inode that has only one link,
+ * so there is only one iref. The case that several irefs are
+ * in the same item doesn't exist.
+ */
+ btrfs_del_item(trans, root, path);
+out:
+ btrfs_release_delayed_iref(node);
+no_iref:
+ btrfs_release_path(path);
+err_out:
+ btrfs_delayed_inode_release_metadata(root, node);
+ btrfs_release_delayed_inode(node);
+
+ return ret;
+
+search:
+ btrfs_release_path(path);
+
+ btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.offset = -1;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto err_out;
+ ASSERT(ret);
+
+ ret = 0;
+ leaf = path->nodes[0];
+ path->slots[0]--;
+ goto again;
+}
+
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int ret;
+
+ mutex_lock(&node->mutex);
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
+ mutex_unlock(&node->mutex);
+ return 0;
+ }
+
+ ret = __btrfs_update_delayed_inode(trans, root, path, node);
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int ret;
+
+ ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+ if (ret)
+ return ret;
+
+ ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+ return ret;
+}
+
+/*
+ * Called when committing the transaction.
+ * Returns 0 on success.
+ * Returns < 0 on error; in that case the transaction is aborted and any
+ * outstanding delayed items are cleaned up.
+ */
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
+ int ret = 0;
+ bool count = (nr > 0);
+
+ if (trans->aborted)
+ return -EIO;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ curr_node = btrfs_first_delayed_node(delayed_root);
+ while (curr_node && (!count || (count && nr--))) {
+ ret = __btrfs_commit_inode_delayed_items(trans, path,
+ curr_node);
+ if (ret) {
+ btrfs_release_delayed_node(curr_node);
+ curr_node = NULL;
+ btrfs_abort_transaction(trans, root, ret);
+ break;
+ }
+
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ btrfs_release_delayed_node(prev_node);
+ }
+
+ if (curr_node)
+ btrfs_release_delayed_node(curr_node);
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+
+ return ret;
+}
+
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_run_delayed_items(trans, root, -1);
+}
+
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr)
+{
+ return __btrfs_run_delayed_items(trans, root, nr);
+}
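A hedged sketch of how a commit or flush path might drain delayed items through the wrappers above; the batch size and error handling are illustrative only:

	/* Hypothetical commit-time caller: flush everything that is queued. */
	ret = btrfs_run_delayed_items(trans, root);
	if (ret)
		return ret;

	/* Or drain only a bounded batch, e.g. from a background flusher. */
	ret = btrfs_run_delayed_items_nr(trans, root, 16);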
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ if (!delayed_node)
+ return 0;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!delayed_node->count) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ btrfs_release_delayed_node(delayed_node);
+ return -ENOMEM;
+ }
+ path->leave_spinning = 1;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+ ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+
+ btrfs_release_delayed_node(delayed_node);
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+
+ return ret;
+}
+
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ if (!delayed_node)
+ return 0;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+ trans = btrfs_join_transaction(delayed_node->root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto trans_out;
+ }
+ path->leave_spinning = 1;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
+ ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+ path, delayed_node);
+ else
+ ret = 0;
+ mutex_unlock(&delayed_node->mutex);
+
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+trans_out:
+ btrfs_end_transaction(trans, delayed_node->root);
+ btrfs_btree_balance_dirty(delayed_node->root);
+out:
+ btrfs_release_delayed_node(delayed_node);
+
+ return ret;
+}
+
+void btrfs_remove_delayed_node(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node);
+ if (!delayed_node)
+ return;
+
+ BTRFS_I(inode)->delayed_node = NULL;
+ btrfs_release_delayed_node(delayed_node);
+}
+
+struct btrfs_async_delayed_work {
+ struct btrfs_delayed_root *delayed_root;
+ int nr;
+ struct btrfs_work work;
+};
+
+static void btrfs_async_run_delayed_root(struct btrfs_work *work)
+{
+ struct btrfs_async_delayed_work *async_work;
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_delayed_node *delayed_node = NULL;
+ struct btrfs_root *root;
+ struct btrfs_block_rsv *block_rsv;
+ int total_done = 0;
+
+ async_work = container_of(work, struct btrfs_async_delayed_work, work);
+ delayed_root = async_work->delayed_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+
+again:
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
+ goto free_path;
+
+ delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+ if (!delayed_node)
+ goto free_path;
+
+ path->leave_spinning = 1;
+ root = delayed_node->root;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ goto release_path;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
+
+ __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+
+ trans->block_rsv = block_rsv;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty_nodelay(root);
+
+release_path:
+ btrfs_release_path(path);
+ total_done++;
+
+ btrfs_release_prepared_delayed_node(delayed_node);
+ if (async_work->nr == 0 || total_done < async_work->nr)
+ goto again;
+
+free_path:
+ btrfs_free_path(path);
+out:
+ wake_up(&delayed_root->wait);
+ kfree(async_work);
+}
+
+
+static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
+ struct btrfs_root *root, int nr)
+{
+ struct btrfs_async_delayed_work *async_work;
+
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ return 0;
+
+ async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
+ if (!async_work)
+ return -ENOMEM;
+
+ async_work->delayed_root = delayed_root;
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
+ NULL, NULL);
+ async_work->nr = nr;
+
+ btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
+ return 0;
+}
+
+void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+ delayed_root = btrfs_get_delayed_root(root);
+ WARN_ON(btrfs_first_delayed_node(delayed_root));
+}
+
+static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
+{
+ int val = atomic_read(&delayed_root->items_seq);
+
+ if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
+ return 1;
+
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ return 1;
+
+ return 0;
+}
+
+void btrfs_balance_delayed_items(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ return;
+
+ if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
+ int seq;
+ int ret;
+
+ seq = atomic_read(&delayed_root->items_seq);
+
+ ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
+ if (ret)
+ return;
+
+ wait_event_interruptible(delayed_root->wait,
+ could_end_wait(delayed_root, seq));
+ return;
+ }
+
+ btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
+}
+
+/* Will return 0 or -ENOMEM */
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ int name_len, struct inode *dir,
+ struct btrfs_disk_key *disk_key, u8 type,
+ u64 index)
+{
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_delayed_item *delayed_item;
+ struct btrfs_dir_item *dir_item;
+ int ret;
+
+ delayed_node = btrfs_get_or_create_delayed_node(dir);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+ if (!delayed_item) {
+ ret = -ENOMEM;
+ goto release_node;
+ }
+
+ delayed_item->key.objectid = btrfs_ino(dir);
+ btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
+ delayed_item->key.offset = index;
+
+ dir_item = (struct btrfs_dir_item *)delayed_item->data;
+ dir_item->location = *disk_key;
+ btrfs_set_stack_dir_transid(dir_item, trans->transid);
+ btrfs_set_stack_dir_data_len(dir_item, 0);
+ btrfs_set_stack_dir_name_len(dir_item, name_len);
+ btrfs_set_stack_dir_type(dir_item, type);
+ memcpy((char *)(dir_item + 1), name, name_len);
+
+ ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+ /*
+	 * we reserved enough space when we started the transaction, so a
+	 * metadata reservation failure is impossible here.
+ */
+ BUG_ON(ret);
+
+ mutex_lock(&delayed_node->mutex);
+ ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+ if (unlikely(ret)) {
+ btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) "
+ "into the insertion tree of the delayed node"
+ "(root id: %llu, inode id: %llu, errno: %d)",
+ name_len, name, delayed_node->root->objectid,
+ delayed_node->inode_id, ret);
+ BUG();
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+release_node:
+ btrfs_release_delayed_node(delayed_node);
+ return ret;
+}
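For illustration only, a sketch of how a create path might queue a delayed dir index with the function above; the name, index and inode_key variables are assumptions:

	/* Hypothetical caller inside inode creation/link. */
	struct btrfs_disk_key disk_key;

	btrfs_cpu_key_to_disk(&disk_key, &inode_key);
	ret = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir,
					     &disk_key, BTRFS_FT_REG_FILE, index);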
+
+static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
+ struct btrfs_delayed_node *node,
+ struct btrfs_key *key)
+{
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_lookup_delayed_insertion_item(node, key);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ return 1;
+ }
+
+ btrfs_delayed_item_release_metadata(root, item);
+ btrfs_release_delayed_item(item);
+ mutex_unlock(&node->mutex);
+ return 0;
+}
+
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *dir,
+ u64 index)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ struct btrfs_key item_key;
+ int ret;
+
+ node = btrfs_get_or_create_delayed_node(dir);
+ if (IS_ERR(node))
+ return PTR_ERR(node);
+
+ item_key.objectid = btrfs_ino(dir);
+ btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY);
+ item_key.offset = index;
+
+ ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
+ if (!ret)
+ goto end;
+
+ item = btrfs_alloc_delayed_item(0);
+ if (!item) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ item->key = item_key;
+
+ ret = btrfs_delayed_item_reserve_metadata(trans, root, item);
+ /*
+	 * we reserved enough space when we started the transaction, so a
+	 * metadata reservation failure is impossible here.
+ */
+ BUG_ON(ret);
+
+ mutex_lock(&node->mutex);
+ ret = __btrfs_add_delayed_deletion_item(node, item);
+ if (unlikely(ret)) {
+ btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) "
+ "into the deletion tree of the delayed node"
+ "(root id: %llu, inode id: %llu, errno: %d)",
+ index, node->root->objectid, node->inode_id,
+ ret);
+ BUG();
+ }
+ mutex_unlock(&node->mutex);
+end:
+ btrfs_release_delayed_node(node);
+ return ret;
+}
+
+int btrfs_inode_delayed_dir_index_count(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+
+ if (!delayed_node)
+ return -ENOENT;
+
+ /*
+ * Since we have held i_mutex of this directory, it is impossible that
+ * a new directory index is added into the delayed node and index_cnt
+ * is updated now. So we needn't lock the delayed node.
+ */
+ if (!delayed_node->index_cnt) {
+ btrfs_release_delayed_node(delayed_node);
+ return -EINVAL;
+ }
+
+ BTRFS_I(inode)->index_cnt = delayed_node->index_cnt;
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_delayed_item *item;
+
+ delayed_node = btrfs_get_delayed_node(inode);
+ if (!delayed_node)
+ return;
+
+ mutex_lock(&delayed_node->mutex);
+ item = __btrfs_first_delayed_insertion_item(delayed_node);
+ while (item) {
+ atomic_inc(&item->refs);
+ list_add_tail(&item->readdir_list, ins_list);
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(delayed_node);
+ while (item) {
+ atomic_inc(&item->refs);
+ list_add_tail(&item->readdir_list, del_list);
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&delayed_node->mutex);
+ /*
+	 * This delayed node is still cached in the btrfs inode, so refs
+	 * must be > 1 now, and we need not check whether it is about to
+	 * be freed.
+	 *
+	 * Besides that, this function is used for readdir; we do not
+	 * insert/delete delayed items during that time, so we also need
+	 * not requeue or dequeue this delayed node.
+ */
+ atomic_dec(&delayed_node->refs);
+}
+
+void btrfs_put_delayed_items(struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_item *curr, *next;
+
+ list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ list_del(&curr->readdir_list);
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+ }
+
+ list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+ list_del(&curr->readdir_list);
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+ }
+}
+
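btrfs_get_delayed_items() above pins each delayed item for readdir by bumping its refcount under the node mutex, and btrfs_put_delayed_items() later drops those references and frees an item once the last reference is gone. The following is a minimal userspace sketch of that take-a-reference / drop-and-maybe-free pattern using C11 atomics; struct demo_item, item_get() and item_put() are illustrative names, not kernel interfaces.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_item {
        atomic_int refs;        /* plays the role of delayed_item->refs */
        int payload;
};

static struct demo_item *item_new(int payload)
{
        struct demo_item *it = malloc(sizeof(*it));

        if (!it)
                return NULL;
        atomic_init(&it->refs, 1);      /* the creator holds one reference */
        it->payload = payload;
        return it;
}

/* Pin the item for a reader, like the atomic_inc() in the readdir snapshot. */
static void item_get(struct demo_item *it)
{
        atomic_fetch_add(&it->refs, 1);
}

/* Drop a reference; the last putter frees, like atomic_dec_and_test() + kfree(). */
static void item_put(struct demo_item *it)
{
        if (atomic_fetch_sub(&it->refs, 1) == 1)
                free(it);
}

int main(void)
{
        struct demo_item *it = item_new(42);

        if (!it)
                return 1;
        item_get(it);                   /* reader snapshot takes a reference */
        printf("payload %d\n", it->payload);
        item_put(it);                   /* reader done */
        item_put(it);                   /* owner drops the original reference */
        return 0;
}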
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+ u64 index)
+{
+ struct btrfs_delayed_item *curr, *next;
+ int ret;
+
+ if (list_empty(del_list))
+ return 0;
+
+ list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+ if (curr->key.offset > index)
+ break;
+
+ list_del(&curr->readdir_list);
+ ret = (curr->key.offset == index);
+
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+
+ if (ret)
+ return 1;
+ else
+ continue;
+ }
+ return 0;
+}
+
+/*
+ * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
+ */
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ struct list_head *ins_list)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_delayed_item *curr, *next;
+ struct btrfs_key location;
+ char *name;
+ int name_len;
+ int over = 0;
+ unsigned char d_type;
+
+ if (list_empty(ins_list))
+ return 0;
+
+ /*
+	 * The data of the delayed items cannot change, so we need not
+	 * lock them. And since we hold the i_mutex of this directory,
+	 * nobody can delete any directory index now.
+ */
+ list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ list_del(&curr->readdir_list);
+
+ if (curr->key.offset < ctx->pos) {
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+ continue;
+ }
+
+ ctx->pos = curr->key.offset;
+
+ di = (struct btrfs_dir_item *)curr->data;
+ name = (char *)(di + 1);
+ name_len = btrfs_stack_dir_name_len(di);
+
+ d_type = btrfs_filetype_table[di->type];
+ btrfs_disk_key_to_cpu(&location, &di->location);
+
+ over = !dir_emit(ctx, name, name_len,
+ location.objectid, d_type);
+
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+
+ if (over)
+ return 1;
+ }
+ return 0;
+}
+
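btrfs_readdir_delayed_dir_index() above walks a pre-sorted list of delayed insertion items, skips entries below the current directory position, advances ctx->pos as it emits, and stops as soon as the emitter reports it is full. Below is a minimal userspace sketch of that cursor-driven emit loop, assuming a plain sorted array of (offset, name) pairs instead of the kernel's delayed-item list; struct pending_entry and emit_entry() are illustrative names, not kernel APIs.

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-in for a delayed insertion item: index key + name. */
struct pending_entry {
        unsigned long long offset;      /* directory index (key.offset) */
        const char *name;
};

/* Stand-in for dir_emit(): returns 0 when the caller's buffer is full. */
static int emit_entry(const struct pending_entry *e)
{
        printf("index %llu -> %s\n", e->offset, e->name);
        return 1;
}

/*
 * Walk the sorted pending entries, skip anything below *pos, advance the
 * position cursor as entries are emitted, and stop early if the emitter
 * is full (mirrors the "over" handling above).
 */
static int emit_pending(const struct pending_entry *ents, size_t n,
                        unsigned long long *pos)
{
        size_t i;

        for (i = 0; i < n; i++) {
                if (ents[i].offset < *pos)
                        continue;       /* already emitted in an earlier call */
                *pos = ents[i].offset;
                if (!emit_entry(&ents[i]))
                        return 1;       /* buffer full, resume later from *pos */
        }
        return 0;
}

int main(void)
{
        const struct pending_entry ents[] = {
                { 2, "a" }, { 5, "b" }, { 9, "c" },
        };
        unsigned long long pos = 3;     /* resume past index 2 */

        return emit_pending(ents, sizeof(ents) / sizeof(ents[0]), &pos);
}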
+static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_inode_item *inode_item,
+ struct inode *inode)
+{
+ btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
+ btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
+ btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
+ btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
+ btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
+ btrfs_set_stack_inode_generation(inode_item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
+ btrfs_set_stack_inode_transid(inode_item, trans->transid);
+ btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
+ btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+ btrfs_set_stack_inode_block_group(inode_item, 0);
+
+ btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
+ inode->i_atime.tv_sec);
+ btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item),
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item),
+ inode->i_mtime.tv_sec);
+ btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item),
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item),
+ inode->i_ctime.tv_sec);
+ btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
+ inode->i_ctime.tv_nsec);
+}
+
+int btrfs_fill_inode(struct inode *inode, u32 *rdev)
+{
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_timespec *tspec;
+
+ delayed_node = btrfs_get_delayed_node(inode);
+ if (!delayed_node)
+ return -ENOENT;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return -ENOENT;
+ }
+
+ inode_item = &delayed_node->inode_item;
+
+ i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
+ i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
+ btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
+ inode->i_mode = btrfs_stack_inode_mode(inode_item);
+ set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
+ inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
+ BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
+ inode->i_version = btrfs_stack_inode_sequence(inode_item);
+ inode->i_rdev = 0;
+ *rdev = btrfs_stack_inode_rdev(inode_item);
+ BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+
+ tspec = btrfs_inode_atime(inode_item);
+ inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec);
+ inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+
+ tspec = btrfs_inode_mtime(inode_item);
+ inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec);
+ inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+
+ tspec = btrfs_inode_ctime(inode_item);
+ inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec);
+ inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+
+ inode->i_generation = BTRFS_I(inode)->generation;
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+ int ret = 0;
+
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ goto release_node;
+ }
+
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+ delayed_node);
+ if (ret)
+ goto release_node;
+
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
+ delayed_node->count++;
+ atomic_inc(&root->fs_info->delayed_root->items);
+release_node:
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return ret;
+}
+
+int btrfs_delayed_delete_inode_ref(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ /*
+	 * We don't reserve space for the inode ref deletion because:
+	 * - We ONLY do async inode ref deletion for an inode that has only
+	 *   one link (i_nlink == 1), which means there is only one inode ref.
+	 *   In most cases the inode ref and the inode item are in the
+	 *   same leaf, and we will deal with them at the same time.
+	 *   Since we are sure we will reserve space for the inode item,
+	 *   it is unnecessary to reserve space for the inode ref deletion.
+	 * - If the inode ref and the inode item are not in the same leaf,
+	 *   we still needn't worry about ENOSPC, because we reserve
+	 *   much more space for the inode update than it needs.
+	 * - At worst, we can steal some space from the global reservation.
+	 *   That is very rare.
+ */
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
+ goto release_node;
+
+ set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
+ delayed_node->count++;
+ atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items);
+release_node:
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
+static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_root *root = delayed_node->root;
+ struct btrfs_delayed_item *curr_item, *prev_item;
+
+ mutex_lock(&delayed_node->mutex);
+ curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
+ while (curr_item) {
+ btrfs_delayed_item_release_metadata(root, curr_item);
+ prev_item = curr_item;
+ curr_item = __btrfs_next_delayed_item(prev_item);
+ btrfs_release_delayed_item(prev_item);
+ }
+
+ curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
+ while (curr_item) {
+ btrfs_delayed_item_release_metadata(root, curr_item);
+ prev_item = curr_item;
+ curr_item = __btrfs_next_delayed_item(prev_item);
+ btrfs_release_delayed_item(prev_item);
+ }
+
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
+ btrfs_release_delayed_iref(delayed_node);
+
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ btrfs_delayed_inode_release_metadata(root, delayed_node);
+ btrfs_release_delayed_inode(delayed_node);
+ }
+ mutex_unlock(&delayed_node->mutex);
+}
+
+void btrfs_kill_delayed_inode_items(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ delayed_node = btrfs_get_delayed_node(inode);
+ if (!delayed_node)
+ return;
+
+ __btrfs_kill_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node);
+}
+
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
+{
+ u64 inode_id = 0;
+ struct btrfs_delayed_node *delayed_nodes[8];
+ int i, n;
+
+ while (1) {
+ spin_lock(&root->inode_lock);
+ n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+ (void **)delayed_nodes, inode_id,
+ ARRAY_SIZE(delayed_nodes));
+ if (!n) {
+ spin_unlock(&root->inode_lock);
+ break;
+ }
+
+ inode_id = delayed_nodes[n - 1]->inode_id + 1;
+
+ for (i = 0; i < n; i++)
+ atomic_inc(&delayed_nodes[i]->refs);
+ spin_unlock(&root->inode_lock);
+
+ for (i = 0; i < n; i++) {
+ __btrfs_kill_delayed_node(delayed_nodes[i]);
+ btrfs_release_delayed_node(delayed_nodes[i]);
+ }
+ }
+}
+
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_node *curr_node, *prev_node;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ curr_node = btrfs_first_delayed_node(delayed_root);
+ while (curr_node) {
+ __btrfs_kill_delayed_node(curr_node);
+
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ btrfs_release_delayed_node(prev_node);
+ }
+}
+
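btrfs_kill_all_delayed_nodes() above drains the per-root radix tree in fixed-size batches, grabbing a reference on each node found and restarting the lookup from the last inode id plus one so nothing is skipped or revisited. Here is a small userspace sketch of that resume-from-last-key batching, assuming a sorted id table stands in for the radix tree; gang_lookup() and the batch size of 8 are illustrative choices, not kernel APIs.

#include <stdio.h>
#include <stddef.h>

#define BATCH 8

/* Return up to 'max' ids >= 'start' from the sorted table, like a gang lookup. */
static size_t gang_lookup(const unsigned long long *tbl, size_t n,
                          unsigned long long start,
                          unsigned long long *out, size_t max)
{
        size_t i, found = 0;

        for (i = 0; i < n && found < max; i++) {
                if (tbl[i] >= start)
                        out[found++] = tbl[i];
        }
        return found;
}

int main(void)
{
        const unsigned long long ids[] = { 3, 7, 8, 15, 21, 22, 40, 41, 42, 99 };
        unsigned long long batch[BATCH];
        unsigned long long next = 0;
        size_t n, i;

        /* Drain everything in batches, resuming from the last id handled + 1. */
        while ((n = gang_lookup(ids, sizeof(ids) / sizeof(ids[0]),
                                next, batch, BATCH)) != 0) {
                for (i = 0; i < n; i++)
                        printf("handling id %llu\n", batch[i]);
                next = batch[n - 1] + 1;
        }
        return 0;
}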
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
new file mode 100644
index 00000000000..f70119f2542
--- /dev/null
+++ b/fs/btrfs/delayed-inode.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2011 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __DELAYED_TREE_OPERATION_H
+#define __DELAYED_TREE_OPERATION_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/atomic.h>
+
+#include "ctree.h"
+
+/* types of the delayed item */
+#define BTRFS_DELAYED_INSERTION_ITEM 1
+#define BTRFS_DELAYED_DELETION_ITEM 2
+
+struct btrfs_delayed_root {
+ spinlock_t lock;
+ struct list_head node_list;
+ /*
+	 * Used for delayed nodes which are waiting to be dealt with by the
+ * worker. If the delayed node is inserted into the work queue, we
+ * drop it from this list.
+ */
+ struct list_head prepare_list;
+ atomic_t items; /* for delayed items */
+ atomic_t items_seq; /* for delayed items */
+ int nodes; /* for delayed nodes */
+ wait_queue_head_t wait;
+};
+
+#define BTRFS_DELAYED_NODE_IN_LIST 0
+#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
+#define BTRFS_DELAYED_NODE_DEL_IREF 2
+
+struct btrfs_delayed_node {
+ u64 inode_id;
+ u64 bytes_reserved;
+ struct btrfs_root *root;
+ /* Used to add the node into the delayed root's node list. */
+ struct list_head n_list;
+ /*
+	 * Used to add the node into the prepare list; the nodes in this list
+	 * are waiting to be dealt with by the async worker.
+ */
+ struct list_head p_list;
+ struct rb_root ins_root;
+ struct rb_root del_root;
+ struct mutex mutex;
+ struct btrfs_inode_item inode_item;
+ atomic_t refs;
+ u64 index_cnt;
+ unsigned long flags;
+ int count;
+};
+
+struct btrfs_delayed_item {
+ struct rb_node rb_node;
+ struct btrfs_key key;
+ struct list_head tree_list; /* used for batch insert/delete items */
+ struct list_head readdir_list; /* used for readdir items */
+ u64 bytes_reserved;
+ struct btrfs_delayed_node *delayed_node;
+ atomic_t refs;
+ int ins_or_del;
+ u32 data_len;
+ char data[0];
+};
+
+static inline void btrfs_init_delayed_root(
+ struct btrfs_delayed_root *delayed_root)
+{
+ atomic_set(&delayed_root->items, 0);
+ atomic_set(&delayed_root->items_seq, 0);
+ delayed_root->nodes = 0;
+ spin_lock_init(&delayed_root->lock);
+ init_waitqueue_head(&delayed_root->wait);
+ INIT_LIST_HEAD(&delayed_root->node_list);
+ INIT_LIST_HEAD(&delayed_root->prepare_list);
+}
+
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ int name_len, struct inode *dir,
+ struct btrfs_disk_key *disk_key, u8 type,
+ u64 index);
+
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *dir,
+ u64 index);
+
+int btrfs_inode_delayed_dir_index_count(struct inode *inode);
+
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr);
+
+void btrfs_balance_delayed_items(struct btrfs_root *root);
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+/* Used for evicting the inode. */
+void btrfs_remove_delayed_node(struct inode *inode);
+void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
+
+
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode);
+int btrfs_fill_inode(struct inode *inode, u32 *rdev);
+int btrfs_delayed_delete_inode_ref(struct inode *inode);
+
+/* Used for dropping dead roots */
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
+
+/* Used for cleaning up the transaction */
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
+
+/* Used for readdir() */
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_put_delayed_items(struct list_head *ins_list,
+ struct list_head *del_list);
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+ u64 index);
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ struct list_head *ins_list);
+
+/* for init */
+int __init btrfs_delayed_inode_init(void);
+void btrfs_delayed_inode_exit(void);
+
+/* for debugging */
+void btrfs_assert_delayed_root_empty(struct btrfs_root *root);
+
+#endif
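struct btrfs_delayed_item above ends in a data[0] buffer: btrfs_insert_delayed_dir_index() allocates sizeof(*dir_item) + name_len in one shot and copies the name immediately after the btrfs_dir_item header, so the item, the stack dir item and the name share a single allocation. The following is a minimal userspace sketch of that trailing-buffer layout, using a C99 flexible array member; struct demo_delayed_item, struct demo_dir_item and alloc_item() are illustrative names, not kernel structures.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Fixed header, standing in for struct btrfs_dir_item. */
struct demo_dir_item {
        unsigned short name_len;
        unsigned char type;
};

/* Stand-in for struct btrfs_delayed_item: metadata plus a trailing buffer. */
struct demo_delayed_item {
        unsigned int data_len;
        char data[];            /* header + name live here, one allocation */
};

static struct demo_delayed_item *alloc_item(const char *name, unsigned char type)
{
        size_t name_len = strlen(name);
        size_t data_len = sizeof(struct demo_dir_item) + name_len;
        struct demo_delayed_item *item;
        struct demo_dir_item *di;

        /* One allocation holds the item, the dir-item header and the name. */
        item = malloc(sizeof(*item) + data_len);
        if (!item)
                return NULL;
        item->data_len = (unsigned int)data_len;

        di = (struct demo_dir_item *)item->data;
        di->name_len = (unsigned short)name_len;
        di->type = type;
        memcpy((char *)(di + 1), name, name_len);       /* name follows the header */
        return item;
}

int main(void)
{
        struct demo_delayed_item *item = alloc_item("hello", 1);
        struct demo_dir_item *di;

        if (!item)
                return 1;
        di = (struct demo_dir_item *)item->data;
        printf("%.*s\n", (int)di->name_len, (const char *)(di + 1));
        free(item);
        return 0;
}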
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e807b143b85..6d16bea94e1 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -23,6 +23,10 @@
#include "delayed-ref.h"
#include "transaction.h"
+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
* delayed back reference update tracking. For subvolume trees
* we queue up extent allocations and backref maintenance for
@@ -36,9 +40,9 @@
* compare two delayed tree backrefs with same bytenr and type
*/
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
- struct btrfs_delayed_tree_ref *ref1)
+ struct btrfs_delayed_tree_ref *ref1, int type)
{
- if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
+ if (type == BTRFS_TREE_BLOCK_REF_KEY) {
if (ref1->root < ref2->root)
return -1;
if (ref1->root > ref2->root)
@@ -85,7 +89,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
* type of the delayed backrefs and content of delayed backrefs.
*/
static int comp_entry(struct btrfs_delayed_ref_node *ref2,
- struct btrfs_delayed_ref_node *ref1)
+ struct btrfs_delayed_ref_node *ref1,
+ bool compare_seq)
{
if (ref1->bytenr < ref2->bytenr)
return -1;
@@ -101,10 +106,22 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
return -1;
if (ref1->type > ref2->type)
return 1;
+ if (ref1->no_quota > ref2->no_quota)
+ return 1;
+ if (ref1->no_quota < ref2->no_quota)
+ return -1;
+ /* merging of sequenced refs is not allowed */
+ if (compare_seq) {
+ if (ref1->seq < ref2->seq)
+ return -1;
+ if (ref1->seq > ref2->seq)
+ return 1;
+ }
if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
- btrfs_delayed_node_to_tree_ref(ref1));
+ btrfs_delayed_node_to_tree_ref(ref1),
+ ref1->type);
} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
@@ -134,7 +151,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
rb_node);
- cmp = comp_entry(entry, ins);
+ cmp = comp_entry(entry, ins, 1);
if (cmp < 0)
p = &(*p)->rb_left;
else if (cmp > 0)
@@ -148,40 +165,72 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
return NULL;
}
+/* insert a new ref to head ref rbtree */
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_head *entry;
+ struct btrfs_delayed_ref_head *ins;
+ u64 bytenr;
+
+ ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
+ bytenr = ins->node.bytenr;
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
+ href_node);
+
+ if (bytenr < entry->node.bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->node.bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
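htree_insert() above descends the head rbtree keyed by bytenr and either links the new head into place or returns the existing head with the same key so the caller can merge into it. A small userspace sketch of that insert-or-return-existing idiom follows, on a plain unbalanced binary search tree rather than the kernel rbtree (no rebalancing); struct head and tree_insert() here are illustrative names, not kernel APIs.

#include <stdio.h>
#include <stdlib.h>

struct head {
        unsigned long long bytenr;      /* the key, as in the head rbtree */
        struct head *left, *right;
};

/*
 * Insert 'node' keyed by bytenr.  Returns NULL on success, or the existing
 * entry with the same key so the caller can merge into it instead.
 */
static struct head *tree_insert(struct head **root, struct head *node)
{
        struct head **p = root;

        while (*p) {
                if (node->bytenr < (*p)->bytenr)
                        p = &(*p)->left;
                else if (node->bytenr > (*p)->bytenr)
                        p = &(*p)->right;
                else
                        return *p;      /* duplicate key: caller merges */
        }
        *p = node;
        return NULL;
}

int main(void)
{
        struct head *root = NULL;
        struct head a = { 4096, NULL, NULL }, b = { 8192, NULL, NULL };
        struct head dup = { 4096, NULL, NULL };

        tree_insert(&root, &a);
        tree_insert(&root, &b);
        if (tree_insert(&root, &dup))
                printf("bytenr %llu already present, merge instead\n", dup.bytenr);
        return 0;
}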
/*
* find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
*/
-static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
- u64 bytenr,
- struct btrfs_delayed_ref_node **last)
+static struct btrfs_delayed_ref_head *
+find_ref_head(struct rb_root *root, u64 bytenr,
+ int return_bigger)
{
- struct rb_node *n = root->rb_node;
- struct btrfs_delayed_ref_node *entry;
- int cmp;
+ struct rb_node *n;
+ struct btrfs_delayed_ref_head *entry;
+ n = root->rb_node;
+ entry = NULL;
while (n) {
- entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
- WARN_ON(!entry->in_tree);
- if (last)
- *last = entry;
-
- if (bytenr < entry->bytenr)
- cmp = -1;
- else if (bytenr > entry->bytenr)
- cmp = 1;
- else if (!btrfs_delayed_ref_is_head(entry))
- cmp = 1;
- else
- cmp = 0;
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
- if (cmp < 0)
+ if (bytenr < entry->node.bytenr)
n = n->rb_left;
- else if (cmp > 0)
+ else if (bytenr > entry->node.bytenr)
n = n->rb_right;
else
return entry;
}
+ if (entry && return_bigger) {
+ if (bytenr > entry->node.bytenr) {
+ n = rb_next(&entry->href_node);
+ if (!n)
+ n = rb_first(root);
+ entry = rb_entry(n, struct btrfs_delayed_ref_head,
+ href_node);
+ return entry;
+ }
+ return entry;
+ }
return NULL;
}
@@ -209,115 +258,190 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
- struct list_head *cluster, u64 start)
+static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *ref)
{
- int count = 0;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *node;
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_head *head;
-
- delayed_refs = &trans->transaction->delayed_refs;
- if (start == 0) {
- node = rb_first(&delayed_refs->root);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head = btrfs_delayed_node_to_head(ref);
+ rb_erase(&head->href_node, &delayed_refs->href_root);
} else {
- ref = NULL;
- find_ref_head(&delayed_refs->root, start, &ref);
- if (ref) {
- struct btrfs_delayed_ref_node *tmp;
-
- node = rb_prev(&ref->rb_node);
- while (node) {
- tmp = rb_entry(node,
- struct btrfs_delayed_ref_node,
- rb_node);
- if (tmp->bytenr < start)
- break;
- ref = tmp;
- node = rb_prev(&ref->rb_node);
- }
- node = &ref->rb_node;
- } else
- node = rb_first(&delayed_refs->root);
+ assert_spin_locked(&head->lock);
+ rb_erase(&ref->rb_node, &head->ref_root);
}
-again:
- while (node && count < 32) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- if (btrfs_delayed_ref_is_head(ref)) {
- head = btrfs_delayed_node_to_head(ref);
- if (list_empty(&head->cluster)) {
- list_add_tail(&head->cluster, cluster);
- delayed_refs->run_delayed_start =
- head->node.bytenr;
- count++;
-
- WARN_ON(delayed_refs->num_heads_ready == 0);
- delayed_refs->num_heads_ready--;
- } else if (count) {
- /* the goal of the clustering is to find extents
- * that are likely to end up in the same extent
- * leaf on disk. So, we don't want them spread
- * all over the tree. Stop now if we've hit
- * a head that was already in use
- */
- break;
+ ref->in_tree = 0;
+ btrfs_put_delayed_ref(ref);
+ atomic_dec(&delayed_refs->num_entries);
+ if (trans->delayed_ref_updates)
+ trans->delayed_ref_updates--;
+}
+
+static int merge_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *ref, u64 seq)
+{
+ struct rb_node *node;
+ int mod = 0;
+ int done = 0;
+
+ node = rb_next(&ref->rb_node);
+ while (!done && node) {
+ struct btrfs_delayed_ref_node *next;
+
+ next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
+ if (seq && next->seq >= seq)
+ break;
+ if (comp_entry(ref, next, 0))
+ continue;
+
+ if (ref->action == next->action) {
+ mod = next->ref_mod;
+ } else {
+ if (ref->ref_mod < next->ref_mod) {
+ struct btrfs_delayed_ref_node *tmp;
+
+ tmp = ref;
+ ref = next;
+ next = tmp;
+ done = 1;
}
+ mod = -next->ref_mod;
+ }
+
+ drop_delayed_ref(trans, delayed_refs, head, next);
+ ref->ref_mod += mod;
+ if (ref->ref_mod == 0) {
+ drop_delayed_ref(trans, delayed_refs, head, ref);
+ done = 1;
+ } else {
+ /*
+ * You can't have multiples of the same ref on a tree
+ * block.
+ */
+ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
}
- node = rb_next(node);
- }
- if (count) {
- return 0;
- } else if (start) {
- /*
- * we've gone to the end of the rbtree without finding any
- * clusters. start from the beginning and try again
- */
- start = 0;
- node = rb_first(&delayed_refs->root);
- goto again;
}
- return 1;
+ return done;
}
-/*
- * This checks to see if there are any delayed refs in the
- * btree for a given bytenr. It returns one if it finds any
- * and zero otherwise.
- *
- * If it only finds a head node, it returns 0.
- *
- * The idea is to use this when deciding if you can safely delete an
- * extent from the extent allocation tree. There may be a pending
- * ref in the rbtree that adds or removes references, so as long as this
- * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
- * allocation tree.
- */
-int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
{
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *prev_node;
- int ret = 0;
+ struct rb_node *node;
+ u64 seq = 0;
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
+ assert_spin_locked(&head->lock);
+ /*
+	 * We don't have too many refs to merge in the case of delayed data
+	 * refs.
+ */
+ if (head->is_data)
+ return;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ seq = elem->seq;
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
- if (ref) {
- prev_node = rb_prev(&ref->rb_node);
- if (!prev_node)
- goto out;
- ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
+ node = rb_first(&head->ref_root);
+ while (node) {
+ struct btrfs_delayed_ref_node *ref;
+
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
rb_node);
- if (ref->bytenr == bytenr)
+ /* We can't merge refs that are outside of our seq count */
+ if (seq && ref->seq >= seq)
+ break;
+ if (merge_ref(trans, delayed_refs, head, ref, seq))
+ node = rb_first(&head->ref_root);
+ else
+ node = rb_next(&ref->rb_node);
+ }
+}
+
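merge_ref() above combines two adjacent refs for the same extent: when the actions match, the modification counts simply add; when they differ, the smaller count is subtracted from the larger and the losing ref is dropped, and if the result reaches zero the two operations cancel completely and the surviving ref is dropped as well. Below is a tiny sketch of that arithmetic on signed counts, under the assumption that +N stands for an ADD ref and -N for a DROP ref; merge_mods() is an illustrative helper, not a kernel function.

#include <stdio.h>

/*
 * Represent an ADD ref as a positive mod and a DROP ref as a negative one;
 * merging is then plain addition, and a zero result means the two refs
 * cancel completely (both are dropped, as merge_ref() does above).
 */
static int merge_mods(int a, int b, int *cancelled)
{
        int sum = a + b;

        *cancelled = (sum == 0);
        return sum;
}

int main(void)
{
        int cancelled;
        int r;

        r = merge_mods(+2, +3, &cancelled);     /* two ADDs: mods accumulate */
        printf("add+add  -> %d (cancelled=%d)\n", r, cancelled);

        r = merge_mods(+2, -2, &cancelled);     /* ADD vs DROP of equal size */
        printf("add+drop -> %d (cancelled=%d)\n", r, cancelled);

        r = merge_mods(+1, -3, &cancelled);     /* DROP wins, carries the rest */
        printf("add+drop -> %d (cancelled=%d)\n", r, cancelled);
        return 0;
}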
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq)
+{
+ struct seq_list *elem;
+ int ret = 0;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ if (seq >= elem->seq) {
+ pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
+ (u32)(seq >> 32), (u32)seq,
+ (u32)(elem->seq >> 32), (u32)elem->seq,
+ delayed_refs);
ret = 1;
+ }
}
-out:
- spin_unlock(&delayed_refs->lock);
+
+ spin_unlock(&fs_info->tree_mod_seq_lock);
return ret;
}
+struct btrfs_delayed_ref_head *
+btrfs_select_ref_head(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *head;
+ u64 start;
+ bool loop = false;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+again:
+ start = delayed_refs->run_delayed_start;
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
+ if (!head && !loop) {
+ delayed_refs->run_delayed_start = 0;
+ start = 0;
+ loop = true;
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
+ if (!head)
+ return NULL;
+ } else if (!head && loop) {
+ return NULL;
+ }
+
+ while (head->processing) {
+ struct rb_node *node;
+
+ node = rb_next(&head->href_node);
+ if (!node) {
+ if (loop)
+ return NULL;
+ delayed_refs->run_delayed_start = 0;
+ start = 0;
+ loop = true;
+ goto again;
+ }
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ }
+
+ head->processing = 1;
+ WARN_ON(delayed_refs->num_heads_ready == 0);
+ delayed_refs->num_heads_ready--;
+ delayed_refs->run_delayed_start = head->node.bytenr +
+ head->node.num_bytes;
+ return head;
+}
+
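btrfs_select_ref_head() above starts scanning at run_delayed_start, skips heads that are already being processed, and wraps around to the beginning of the tree exactly once (the 'loop' flag) before giving up. Here is a userspace sketch of that single-wrap scan over a sorted array of heads; struct demo_head, find_from() and select_head() are illustrative names, not kernel APIs.

#include <stdio.h>
#include <stddef.h>

struct demo_head {
        unsigned long long bytenr;
        int processing;         /* already claimed by another worker */
};

/* First index with bytenr >= start, or n if none (stand-in for find_ref_head). */
static size_t find_from(const struct demo_head *h, size_t n,
                        unsigned long long start)
{
        size_t i;

        for (i = 0; i < n; i++)
                if (h[i].bytenr >= start)
                        return i;
        return n;
}

/*
 * Pick the next idle head at or after *start, wrapping to the beginning of
 * the array exactly once; returns its index, or n when everything is busy.
 */
static size_t select_head(struct demo_head *h, size_t n,
                          unsigned long long *start)
{
        int looped = 0;
        size_t i = find_from(h, n, *start);

        for (;;) {
                if (i == n) {
                        if (looped)
                                return n;       /* scanned everything once */
                        looped = 1;
                        i = 0;                  /* wrap to the beginning */
                        continue;
                }
                if (!h[i].processing) {
                        h[i].processing = 1;
                        *start = h[i].bytenr + 1;       /* resume point */
                        return i;
                }
                i++;
        }
}

int main(void)
{
        struct demo_head heads[] = {
                { 100, 1 }, { 200, 0 }, { 300, 1 },
        };
        unsigned long long start = 250;
        size_t idx = select_head(heads, 3, &start);

        if (idx < 3)
                printf("picked head at bytenr %llu\n", heads[idx].bytenr);
        return 0;
}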
/*
* helper function to update an extent delayed ref in the
* rbtree. existing and update must both have the same
@@ -329,6 +453,7 @@ out:
static noinline void
update_existing_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *existing,
struct btrfs_delayed_ref_node *update)
{
@@ -340,18 +465,11 @@ update_existing_ref(struct btrfs_trans_handle *trans,
* every changing the extent allocation tree.
*/
existing->ref_mod--;
- if (existing->ref_mod == 0) {
- rb_erase(&existing->rb_node,
- &delayed_refs->root);
- existing->in_tree = 0;
- btrfs_put_delayed_ref(existing);
- delayed_refs->num_entries--;
- if (trans->delayed_ref_updates)
- trans->delayed_ref_updates--;
- } else {
+ if (existing->ref_mod == 0)
+ drop_delayed_ref(trans, delayed_refs, head, existing);
+ else
WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
- }
} else {
WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -381,6 +499,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
ref = btrfs_delayed_node_to_head(update);
BUG_ON(existing_ref->is_data != ref->is_data);
+ spin_lock(&existing_ref->lock);
if (ref->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
@@ -414,13 +533,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
ref->extent_op->flags_to_set;
existing_ref->extent_op->update_flags = 1;
}
- kfree(ref->extent_op);
+ btrfs_free_delayed_extent_op(ref->extent_op);
}
}
/*
- * update the reference mod on the head to reflect this new operation
+ * update the reference mod on the head to reflect this new operation,
+ * we only need the lock for this case because we could be processing it
+ * currently; for refs we just added we know we are OK.
*/
existing->ref_mod += update->ref_mod;
+ spin_unlock(&existing_ref->lock);
}
/*
@@ -428,12 +550,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* this does all the dirty work in terms of maintaining the correct
* overall modification count.
*/
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref,
- u64 bytenr, u64 num_bytes,
- int action, int is_data)
+static noinline struct btrfs_delayed_ref_head *
+add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *ref, u64 bytenr,
+ u64 num_bytes, int action, int is_data)
{
- struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
int count_mod = 1;
@@ -475,47 +598,59 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
ref->action = 0;
ref->is_head = 1;
ref->in_tree = 1;
+ ref->seq = 0;
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
+ head_ref->ref_root = RB_ROOT;
+ head_ref->processing = 0;
- INIT_LIST_HEAD(&head_ref->cluster);
+ spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
- existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+ trace_add_delayed_ref_head(ref, head_ref, action);
+ existing = htree_insert(&delayed_refs->href_root,
+ &head_ref->href_node);
if (existing) {
- update_existing_head_ref(existing, ref);
+ update_existing_head_ref(&existing->node, ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+ head_ref = existing;
} else {
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
- delayed_refs->num_entries++;
+ atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
}
- return 0;
+ return head_ref;
}
/*
* helper to insert a delayed tree ref into the rbtree.
*/
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 ref_root, int level, int action)
+static noinline void
+add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_delayed_ref_node *ref, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root, int level,
+ int action, int no_quota)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
@@ -526,51 +661,61 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
+ ref->no_quota = no_quota;
+ ref->seq = seq;
full_ref = btrfs_delayed_node_to_tree_ref(ref);
- if (parent) {
- full_ref->parent = parent;
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
- } else {
- full_ref->root = ref_root;
+ else
ref->type = BTRFS_TREE_BLOCK_REF_KEY;
- }
full_ref->level = level;
- existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+ trace_add_delayed_tree_ref(ref, full_ref, action);
+ spin_lock(&head_ref->lock);
+ existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
if (existing) {
- update_existing_ref(trans, delayed_refs, existing, ref);
+ update_existing_ref(trans, delayed_refs, head_ref, existing,
+ ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
} else {
- delayed_refs->num_entries++;
+ atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
}
- return 0;
+ spin_unlock(&head_ref->lock);
}
/*
* helper to insert a delayed data ref into the rbtree.
*/
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 ref_root, u64 owner, u64 offset,
- int action)
+static noinline void
+add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_delayed_ref_node *ref, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
+ u64 offset, int action, int no_quota)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
delayed_refs = &trans->transaction->delayed_refs;
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
+
/* first set the basic ref node struct up */
atomic_set(&ref->refs, 1);
ref->bytenr = bytenr;
@@ -579,32 +724,37 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
+ ref->no_quota = no_quota;
+ ref->seq = seq;
full_ref = btrfs_delayed_node_to_data_ref(ref);
- if (parent) {
- full_ref->parent = parent;
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
ref->type = BTRFS_SHARED_DATA_REF_KEY;
- } else {
- full_ref->root = ref_root;
+ else
ref->type = BTRFS_EXTENT_DATA_REF_KEY;
- }
+
full_ref->objectid = owner;
full_ref->offset = offset;
- existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+ trace_add_delayed_data_ref(ref, full_ref, action);
+ spin_lock(&head_ref->lock);
+ existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
if (existing) {
- update_existing_ref(trans, delayed_refs, existing, ref);
+ update_existing_ref(trans, delayed_refs, head_ref, existing,
+ ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
*/
- kfree(ref);
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
} else {
- delayed_refs->num_entries++;
+ atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
}
- return 0;
+ spin_unlock(&head_ref->lock);
}
/*
@@ -612,24 +762,28 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
* to make sure the delayed ref is eventually processed before this
* transaction commits.
*/
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
+
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
BUG_ON(extent_op && extent_op->is_data);
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
- kfree(ref);
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
return -ENOMEM;
}
@@ -642,39 +796,43 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
- action, 0);
- BUG_ON(ret);
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ bytenr, num_bytes, action, 0);
- ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, level, action);
- BUG_ON(ret);
+ add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
+ num_bytes, parent, ref_root, level, action,
+ no_quota);
spin_unlock(&delayed_refs->lock);
+
return 0;
}
/*
* add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
*/
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
+
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
BUG_ON(extent_op && !extent_op->is_data);
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
- kfree(ref);
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
return -ENOMEM;
}
@@ -687,26 +845,26 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
- action, 1);
- BUG_ON(ret);
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ bytenr, num_bytes, action, 1);
- ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, owner, offset, action);
- BUG_ON(ret);
+ add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
+ num_bytes, parent, ref_root, owner, offset,
+ action, no_quota);
spin_unlock(&delayed_refs->lock);
+
return 0;
}
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
return -ENOMEM;
@@ -715,10 +873,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
- BUG_ON(ret);
spin_unlock(&delayed_refs->lock);
return 0;
@@ -732,88 +889,56 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
{
- struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
- if (ref)
- return btrfs_delayed_node_to_head(ref);
- return NULL;
+ return find_ref_head(&delayed_refs->href_root, bytenr, 0);
}
-/*
- * add a delayed ref to the tree. This does all of the accounting required
- * to make sure the delayed ref is eventually processed before this
- * transaction commits.
- *
- * The main point of this call is to add and remove a backreference in a single
- * shot, taking the lock only once, and only searching for the head node once.
- *
- * It is the same as doing a ref add and delete in two separate calls.
- */
-#if 0
-int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 orig_parent,
- u64 parent, u64 orig_ref_root, u64 ref_root,
- u64 orig_ref_generation, u64 ref_generation,
- u64 owner_objectid, int pin)
+void btrfs_delayed_ref_exit(void)
{
- struct btrfs_delayed_ref *ref;
- struct btrfs_delayed_ref *old_ref;
- struct btrfs_delayed_ref_head *head_ref;
- struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
-
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
- if (!old_ref) {
- kfree(ref);
- return -ENOMEM;
- }
-
- /*
- * the parent = 0 case comes from cases where we don't actually
- * know the parent yet. It will get updated later via a add/drop
- * pair.
- */
- if (parent == 0)
- parent = bytenr;
- if (orig_parent == 0)
- orig_parent = bytenr;
+ if (btrfs_delayed_ref_head_cachep)
+ kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
+ if (btrfs_delayed_tree_ref_cachep)
+ kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
+ if (btrfs_delayed_data_ref_cachep)
+ kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+ if (btrfs_delayed_extent_op_cachep)
+ kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
+}
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
- if (!head_ref) {
- kfree(ref);
- kfree(old_ref);
- return -ENOMEM;
- }
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
+int btrfs_delayed_ref_init(void)
+{
+ btrfs_delayed_ref_head_cachep = kmem_cache_create(
+ "btrfs_delayed_ref_head",
+ sizeof(struct btrfs_delayed_ref_head), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_ref_head_cachep)
+ goto fail;
+
+ btrfs_delayed_tree_ref_cachep = kmem_cache_create(
+ "btrfs_delayed_tree_ref",
+ sizeof(struct btrfs_delayed_tree_ref), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_tree_ref_cachep)
+ goto fail;
+
+ btrfs_delayed_data_ref_cachep = kmem_cache_create(
+ "btrfs_delayed_data_ref",
+ sizeof(struct btrfs_delayed_data_ref), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_data_ref_cachep)
+ goto fail;
+
+ btrfs_delayed_extent_op_cachep = kmem_cache_create(
+ "btrfs_delayed_extent_op",
+ sizeof(struct btrfs_delayed_extent_op), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_extent_op_cachep)
+ goto fail;
- /*
- * insert both the head node and the new ref without dropping
- * the spin lock
- */
- ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
- (u64)-1, 0, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, 0);
- BUG_ON(ret);
-
- ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, ref_generation,
- owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
- BUG_ON(ret);
-
- ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
- orig_parent, orig_ref_root,
- orig_ref_generation, owner_objectid,
- BTRFS_DROP_DELAYED_REF, pin);
- BUG_ON(ret);
- spin_unlock(&delayed_refs->lock);
return 0;
+fail:
+ btrfs_delayed_ref_exit();
+ return -ENOMEM;
}
-#endif
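btrfs_delayed_ref_init() above creates four kmem caches and, on any allocation failure, jumps to a single failure label that calls the exit helper; btrfs_delayed_ref_exit() checks each cache pointer before destroying it, so it is safe on a partially initialized state. The following is a userspace sketch of that cleanup-on-partial-failure idiom, assuming malloc-backed "pools" in place of kmem caches; pool_create()/pool_destroy(), refs_init()/refs_exit() and the object sizes are illustrative, not kernel interfaces.

#include <stdlib.h>

struct pool { size_t obj_size; };

static struct pool *head_pool, *tree_pool, *data_pool, *op_pool;

static struct pool *pool_create(size_t obj_size)
{
        struct pool *p = malloc(sizeof(*p));

        if (p)
                p->obj_size = obj_size;
        return p;
}

/* pool_destroy(NULL) is a no-op, mirroring the NULL checks in the exit path. */
static void pool_destroy(struct pool *p)
{
        free(p);
}

static void refs_exit(void)
{
        pool_destroy(head_pool);
        pool_destroy(tree_pool);
        pool_destroy(data_pool);
        pool_destroy(op_pool);
}

/* Create all pools, unwinding through a single failure label on any error. */
static int refs_init(void)
{
        head_pool = pool_create(128);
        if (!head_pool)
                goto fail;
        tree_pool = pool_create(96);
        if (!tree_pool)
                goto fail;
        data_pool = pool_create(112);
        if (!data_pool)
                goto fail;
        op_pool = pool_create(80);
        if (!op_pool)
                goto fail;
        return 0;
fail:
        refs_exit();
        return -1;
}

int main(void)
{
        if (refs_init())
                return 1;
        refs_exit();
        return 0;
}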
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 50e3cf92fbd..a764e2340d4 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -18,7 +18,7 @@
#ifndef __DELAYED_REF__
#define __DELAYED_REF__
-/* these are the possible values of struct btrfs_delayed_ref->action */
+/* these are the possible values of struct btrfs_delayed_ref_node->action */
#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
/* the size of the extent */
u64 num_bytes;
+ /* seq number to keep track of insertion order */
+ u64 seq;
+
/* ref count on this data structure */
atomic_t refs;
@@ -49,6 +52,7 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
+ unsigned int no_quota:1;
/* is this node still in the rbtree? */
unsigned int is_head:1;
unsigned int in_tree:1;
@@ -57,6 +61,7 @@ struct btrfs_delayed_ref_node {
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
u64 flags_to_set;
+ int level;
unsigned int update_key:1;
unsigned int update_flags:1;
unsigned int is_data:1;
@@ -77,7 +82,10 @@ struct btrfs_delayed_ref_head {
*/
struct mutex mutex;
- struct list_head cluster;
+ spinlock_t lock;
+ struct rb_root ref_root;
+
+ struct rb_node href_node;
struct btrfs_delayed_extent_op *extent_op;
/*
@@ -94,29 +102,27 @@ struct btrfs_delayed_ref_head {
*/
unsigned int must_insert_reserved:1;
unsigned int is_data:1;
+ unsigned int processing:1;
};
struct btrfs_delayed_tree_ref {
struct btrfs_delayed_ref_node node;
- union {
- u64 root;
- u64 parent;
- };
+ u64 root;
+ u64 parent;
int level;
};
struct btrfs_delayed_data_ref {
struct btrfs_delayed_ref_node node;
- union {
- u64 root;
- u64 parent;
- };
+ u64 root;
+ u64 parent;
u64 objectid;
u64 offset;
};
struct btrfs_delayed_ref_root {
- struct rb_root root;
+ /* head ref rbtree */
+ struct rb_root href_root;
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
@@ -124,7 +130,7 @@ struct btrfs_delayed_ref_root {
/* how many delayed ref updates we've queued, used by the
* throttling code
*/
- unsigned long num_entries;
+ atomic_t num_entries;
/* total number of head nodes in tree */
unsigned long num_heads;
@@ -142,40 +148,89 @@ struct btrfs_delayed_ref_root {
u64 run_delayed_start;
};
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int btrfs_delayed_ref_init(void);
+void btrfs_delayed_ref_exit(void);
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+ return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+ if (op)
+ kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
WARN_ON(atomic_read(&ref->refs) == 0);
if (atomic_dec_and_test(&ref->refs)) {
WARN_ON(ref->in_tree);
- kfree(ref);
+ switch (ref->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ case BTRFS_SHARED_DATA_REF_KEY:
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ break;
+ case 0:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+ break;
+ default:
+ BUG();
+ }
}
}
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 orig_parent,
- u64 parent, u64 orig_ref_root, u64 ref_root,
- u64 orig_ref_generation, u64 ref_generation,
- u64 owner_objectid, int pin);
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head);
-int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
- struct list_head *cluster, u64 search_start);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+ mutex_unlock(&head->mutex);
+}
+
+
+struct btrfs_delayed_ref_head *
+btrfs_select_ref_head(struct btrfs_trans_handle *trans);
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq);
+
/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 00000000000..eea26e1b2fd
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,932 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <asm/div64.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "sysfs.h"
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret);
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev);
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+ char *srcdev_name,
+ struct btrfs_device **device);
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
+static int btrfs_dev_replace_kthread(void *data);
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
+
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct extent_buffer *eb;
+ int slot;
+ int ret = 0;
+ struct btrfs_path *path = NULL;
+ int item_size;
+ struct btrfs_dev_replace_item *ptr;
+ u64 src_devid;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+ if (ret) {
+no_valid_dev_replace_entry_found:
+ ret = 0;
+ dev_replace->replace_state =
+ BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
+ dev_replace->cont_reading_from_srcdev_mode =
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
+ dev_replace->replace_state = 0;
+ dev_replace->time_started = 0;
+ dev_replace->time_stopped = 0;
+ atomic64_set(&dev_replace->num_write_errors, 0);
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ dev_replace->is_valid = 0;
+ dev_replace->item_needs_writeback = 0;
+ goto out;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+ if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+ btrfs_warn(fs_info,
+ "dev_replace entry found has unexpected size, ignore entry");
+ goto no_valid_dev_replace_entry_found;
+ }
+
+ src_devid = btrfs_dev_replace_src_devid(eb, ptr);
+ dev_replace->cont_reading_from_srcdev_mode =
+ btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
+ dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
+ dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
+ dev_replace->time_stopped =
+ btrfs_dev_replace_time_stopped(eb, ptr);
+ atomic64_set(&dev_replace->num_write_errors,
+ btrfs_dev_replace_num_write_errors(eb, ptr));
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors,
+ btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
+ dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
+ dev_replace->committed_cursor_left = dev_replace->cursor_left;
+ dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
+ dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
+ dev_replace->is_valid = 1;
+
+ dev_replace->item_needs_writeback = 0;
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
+ NULL, NULL);
+ dev_replace->tgtdev = btrfs_find_device(fs_info,
+ BTRFS_DEV_REPLACE_DEVID,
+ NULL, NULL);
+ /*
+ * allow 'btrfs dev replace_cancel' if src/tgt device is
+ * missing
+ */
+ if (!dev_replace->srcdev &&
+ !btrfs_test_opt(dev_root, DEGRADED)) {
+ ret = -EIO;
+ btrfs_warn(fs_info,
+ "cannot mount because device replace operation is ongoing and");
+ btrfs_warn(fs_info,
+ "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
+ src_devid);
+ }
+ if (!dev_replace->tgtdev &&
+ !btrfs_test_opt(dev_root, DEGRADED)) {
+ ret = -EIO;
+ btrfs_warn(fs_info,
+ "cannot mount because device replace operation is ongoing and");
+ btrfs_warn(fs_info,
+ "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
+ BTRFS_DEV_REPLACE_DEVID);
+ }
+ if (dev_replace->tgtdev) {
+ if (dev_replace->srcdev) {
+ dev_replace->tgtdev->total_bytes =
+ dev_replace->srcdev->total_bytes;
+ dev_replace->tgtdev->disk_total_bytes =
+ dev_replace->srcdev->disk_total_bytes;
+ dev_replace->tgtdev->bytes_used =
+ dev_replace->srcdev->bytes_used;
+ }
+ dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+ btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
+ dev_replace->tgtdev);
+ }
+ break;
+ }
+
+out:
+ if (path)
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes changed device replace state to
+ * disk.
+ */
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_replace_item *ptr;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ if (!dev_replace->is_valid ||
+ !dev_replace->item_needs_writeback) {
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "error %d while searching for dev_replace item!",
+ ret);
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /*
+ * need to delete old one and insert a new one.
+ * Since no attempt is made to recover any old state, if the
+ * dev_replace state is 'running', the data on the target
+ * drive is lost.
+ * It would be possible to recover the state: just make sure
+ * that the beginning of the item is never changed and always
+ * contains all the essential information. Then read this
+ * minimal set of information and use it as a base for the
+ * new state.
+ */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ btrfs_warn(fs_info, "delete too small dev_replace item failed %d!",
+ ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ btrfs_warn(fs_info, "insert dev_replace item failed %d!",
+ ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_dev_replace_item);
+
+ btrfs_dev_replace_lock(dev_replace);
+ if (dev_replace->srcdev)
+ btrfs_set_dev_replace_src_devid(eb, ptr,
+ dev_replace->srcdev->devid);
+ else
+ btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
+ btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
+ dev_replace->cont_reading_from_srcdev_mode);
+ btrfs_set_dev_replace_replace_state(eb, ptr,
+ dev_replace->replace_state);
+ btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
+ btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
+ btrfs_set_dev_replace_num_write_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_write_errors));
+ btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors));
+ dev_replace->cursor_left_last_write_of_item =
+ dev_replace->cursor_left;
+ btrfs_set_dev_replace_cursor_left(eb, ptr,
+ dev_replace->cursor_left_last_write_of_item);
+ btrfs_set_dev_replace_cursor_right(eb, ptr,
+ dev_replace->cursor_right);
+ dev_replace->item_needs_writeback = 0;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
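+/*
+ * called after the transaction commit; the cursor value that
+ * btrfs_run_dev_replace() wrote to disk is now persistent and becomes
+ * the committed resume point
+ */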
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ dev_replace->committed_cursor_left =
+ dev_replace->cursor_left_last_write_of_item;
+}
+
+int btrfs_dev_replace_start(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int ret;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_device *src_device = NULL;
+
+ if (btrfs_fs_incompat(fs_info, RAID56)) {
+ btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
+ return -EOPNOTSUPP;
+ }
+
+ switch (args->start.cont_reading_from_srcdev_mode) {
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+ break;
+ default:
+ return -EINVAL;
+ }
+
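+ /*
+ * the source device may be given either by devid or by path,
+ * the target device must always be given by path
+ */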
+ if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+ args->start.tgtdev_name[0] == '\0')
+ return -EINVAL;
+
+ mutex_lock(&fs_info->volume_mutex);
+ ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ &tgt_device);
+ if (ret) {
+ btrfs_err(fs_info, "target device %s is invalid!",
+ args->start.tgtdev_name);
+ mutex_unlock(&fs_info->volume_mutex);
+ return -EINVAL;
+ }
+
+ ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
+ args->start.srcdev_name,
+ &src_device);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (ret) {
+ ret = -EINVAL;
+ goto leave_no_lock;
+ }
+
+ if (tgt_device->total_bytes < src_device->total_bytes) {
+ btrfs_err(fs_info, "target device is smaller than source device!");
+ ret = -EINVAL;
+ goto leave_no_lock;
+ }
+
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+ goto leave;
+ }
+
+ dev_replace->cont_reading_from_srcdev_mode =
+ args->start.cont_reading_from_srcdev_mode;
+ WARN_ON(!src_device);
+ dev_replace->srcdev = src_device;
+ WARN_ON(!tgt_device);
+ dev_replace->tgtdev = tgt_device;
+
+ printk_in_rcu(KERN_INFO
+ "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+
+ tgt_device->total_bytes = src_device->total_bytes;
+ tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+ tgt_device->bytes_used = src_device->bytes_used;
+
+ /*
+ * from now on, the writes to the srcdev are all duplicated to
+ * go to the tgtdev as well (refer to btrfs_map_block()).
+ */
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ dev_replace->time_started = get_seconds();
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->is_valid = 1;
+ dev_replace->item_needs_writeback = 1;
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_wait_ordered_roots(root->fs_info, -1);
+
+ /* force writing the updated state information to disk */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_dev_replace_lock(dev_replace);
+ goto leave;
+ }
+
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+
+ /* the disk copy procedure reuses the scrub code */
+ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
+ src_device->total_bytes,
+ &dev_replace->scrub_progress, 0, 1);
+
+ ret = btrfs_dev_replace_finishing(root->fs_info, ret);
+ WARN_ON(ret);
+
+ return 0;
+
+leave:
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ btrfs_dev_replace_unlock(dev_replace);
+leave_no_lock:
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ return ret;
+}
+
+/*
+ * block until all in-flight bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
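+ /*
+ * new bios now block in btrfs_bio_counter_inc_blocked(); wait
+ * until the counter of in-flight bios drains to zero
+ */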
+ do {
+ prepare_to_wait(&fs_info->replace_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ writers = percpu_counter_sum(&fs_info->bio_counter);
+ if (writers)
+ schedule();
+ finish_wait(&fs_info->replace_wait, &wait);
+ } while (writers);
+}
+
+/*
+ * we have removed the target device, it is safe to allow new bio requests.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device;
+ struct btrfs_device *src_device;
+ struct btrfs_root *root = fs_info->tree_root;
+ u8 uuid_tmp[BTRFS_UUID_SIZE];
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ /* don't allow cancel or unmount to disturb the finishing procedure */
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+
+ btrfs_dev_replace_lock(dev_replace);
+ /* was the operation canceled, or is it finished? */
+ if (dev_replace->replace_state !=
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return 0;
+ }
+
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ /*
+ * flush all outstanding I/O and inode extent mappings before the
+ * copy operation is declared as being finished
+ */
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
+ if (ret) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return ret;
+ }
+ btrfs_wait_ordered_roots(root->fs_info, -1);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+
+ /* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->chunk_mutex);
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ btrfs_dev_replace_lock(dev_replace);
+ dev_replace->replace_state =
+ scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+ : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ dev_replace->time_stopped = get_seconds();
+ dev_replace->item_needs_writeback = 1;
+
+ /* replace old device with new one in mapping tree */
+ if (!scrub_ret) {
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+ } else {
+ printk_in_rcu(KERN_ERR
+ "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name), scrub_ret);
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return 0;
+ }
+
+ printk_in_rcu(KERN_INFO
+ "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
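+ /*
+ * the target device takes over the devid and uuid of the source
+ * device so that the rest of the filesystem keeps seeing the same
+ * device identity
+ */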
+ tgt_device->is_tgtdev_for_dev_replace = 0;
+ tgt_device->devid = src_device->devid;
+ src_device->devid = BTRFS_DEV_REPLACE_DEVID;
+ tgt_device->bytes_used = src_device->bytes_used;
+ memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
+ memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
+ memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
+ tgt_device->total_bytes = src_device->total_bytes;
+ tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+ tgt_device->bytes_used = src_device->bytes_used;
+ if (fs_info->sb->s_bdev == src_device->bdev)
+ fs_info->sb->s_bdev = tgt_device->bdev;
+ if (fs_info->fs_devices->latest_bdev == src_device->bdev)
+ fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+ list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+
+ /* replace the sysfs entry */
+ btrfs_kobj_rm_device(fs_info, src_device);
+ btrfs_kobj_add_device(fs_info, tgt_device);
+
+ btrfs_rm_dev_replace_blocked(fs_info);
+
+ btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+
+ btrfs_rm_dev_replace_unblocked(fs_info);
+
+ /*
+ * this is again a consistent state where no dev_replace procedure
+ * is running, the target device is part of the filesystem, the
+ * source device is not part of the filesystem anymore and its 1st
+ * superblock is scratched out so that it is no longer marked to
+ * belong to this filesystem.
+ */
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
+
+ /* write back the superblocks */
+ trans = btrfs_start_transaction(root, 0);
+ if (!IS_ERR(trans))
+ btrfs_commit_transaction(trans, root);
+
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return 0;
+}
+
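+/*
+ * walk all chunk mappings and redirect every stripe that points at the
+ * source device to the target device
+ */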
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+ char *srcdev_name,
+ struct btrfs_device **device)
+{
+ int ret;
+
+ if (srcdevid) {
+ ret = 0;
+ *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
+ NULL);
+ if (!*device)
+ ret = -ENOENT;
+ } else {
+ ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
+ device);
+ }
+ return ret;
+}
+
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ /* even if !dev_replace_is_valid, the values are good enough for
+ * the replace_status ioctl */
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ args->status.replace_state = dev_replace->replace_state;
+ args->status.time_started = dev_replace->time_started;
+ args->status.time_stopped = dev_replace->time_stopped;
+ args->status.num_write_errors =
+ atomic64_read(&dev_replace->num_write_errors);
+ args->status.num_uncorrectable_read_errors =
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ args->status.progress_1000 = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ args->status.progress_1000 = 1000;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
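+ /* progress is reported in per-mille of the source device size */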
+ args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
+ div64_u64(dev_replace->srcdev->total_bytes, 1000));
+ break;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+}
+
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ args->result = __btrfs_dev_replace_cancel(fs_info);
+ return 0;
+}
+
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = fs_info->tree_root;
+ u64 result;
+ int ret;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+ btrfs_dev_replace_unlock(dev_replace);
+ goto leave;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ tgt_device = dev_replace->tgtdev;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ break;
+ }
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+ dev_replace->time_stopped = get_seconds();
+ dev_replace->item_needs_writeback = 1;
+ btrfs_dev_replace_unlock(dev_replace);
+ btrfs_scrub_cancel(fs_info);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+
+leave:
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return result;
+}
+
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ dev_replace->time_stopped = get_seconds();
+ dev_replace->item_needs_writeback = 1;
+ btrfs_info(fs_info, "suspending dev_replace for unmount");
+ break;
+ }
+
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+}
+
+/* resume dev_replace procedure that was interrupted by unmount */
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ break;
+ }
+ if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
+ btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
+ btrfs_info(fs_info,
+ "you may cancel the operation after 'mount -o degraded'");
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+
+ WARN_ON(atomic_xchg(
+ &fs_info->mutually_exclusive_operation_running, 1));
+ task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
+ return PTR_ERR_OR_ZERO(task);
+}
+
+static int btrfs_dev_replace_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_ioctl_dev_replace_args *status_args;
+ u64 progress;
+
+ status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
+ if (status_args) {
+ btrfs_dev_replace_status(fs_info, status_args);
+ progress = status_args->status.progress_1000;
+ kfree(status_args);
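+ /* progress_1000 is per-mille; divide by ten to log a percentage */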
+ do_div(progress, 10);
+ printk_in_rcu(KERN_INFO
+ "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+ dev_replace->srcdev->missing ? "<missing disk>" :
+ rcu_str_deref(dev_replace->srcdev->name),
+ dev_replace->srcdev->devid,
+ dev_replace->tgtdev ?
+ rcu_str_deref(dev_replace->tgtdev->name) :
+ "<missing target disk>",
+ (unsigned int)progress);
+ }
+ btrfs_dev_replace_continue_on_mount(fs_info);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+
+ return 0;
+}
+
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int ret;
+
+ ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
+ dev_replace->committed_cursor_left,
+ dev_replace->srcdev->total_bytes,
+ &dev_replace->scrub_progress, 0, 1);
+ ret = btrfs_dev_replace_finishing(fs_info, ret);
+ WARN_ON(ret);
+ return 0;
+}
+
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+{
+ if (!dev_replace->is_valid)
+ return 0;
+
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ /*
+ * return true even if tgtdev is missing: this can happen if the
+ * dev_replace procedure is suspended by an umount, the tgtdev then
+ * goes missing (or "btrfs dev scan" is not called) and the
+ * filesystem is remounted in degraded state. This does not stop
+ * the dev_replace procedure. It needs to be canceled manually if
+ * the cancellation is wanted.
+ */
+ break;
+ }
+ return 1;
+}
+
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+{
+ /* the beginning is just an optimization for the typical case */
+ if (atomic_read(&dev_replace->nesting_level) == 0) {
+acquire_lock:
+ /* this is not a nested case where the same thread
+ * is trying to acquire the same lock twice */
+ mutex_lock(&dev_replace->lock);
+ mutex_lock(&dev_replace->lock_management_lock);
+ dev_replace->lock_owner = current->pid;
+ atomic_inc(&dev_replace->nesting_level);
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
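+ /*
+ * slow path: lock_management_lock protects nesting_level and
+ * lock_owner while we check for a recursive acquisition by the
+ * same task
+ */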
+ mutex_lock(&dev_replace->lock_management_lock);
+ if (atomic_read(&dev_replace->nesting_level) > 0 &&
+ dev_replace->lock_owner == current->pid) {
+ WARN_ON(!mutex_is_locked(&dev_replace->lock));
+ atomic_inc(&dev_replace->nesting_level);
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
+ mutex_unlock(&dev_replace->lock_management_lock);
+ goto acquire_lock;
+}
+
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+{
+ WARN_ON(!mutex_is_locked(&dev_replace->lock));
+ mutex_lock(&dev_replace->lock_management_lock);
+ WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
+ WARN_ON(dev_replace->lock_owner != current->pid);
+ atomic_dec(&dev_replace->nesting_level);
+ if (atomic_read(&dev_replace->nesting_level) == 0) {
+ dev_replace->lock_owner = 0;
+ mutex_unlock(&dev_replace->lock_management_lock);
+ mutex_unlock(&dev_replace->lock);
+ } else {
+ mutex_unlock(&dev_replace->lock_management_lock);
+ }
+}
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_dec(&fs_info->bio_counter);
+
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+ DEFINE_WAIT(wait);
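+ /*
+ * take a reference optimistically; if a dev_replace is in its
+ * critical device-removal phase (BTRFS_FS_STATE_DEV_REPLACING is
+ * set), drop the reference again and wait for the flag to clear
+ * before retrying
+ */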
+again:
+ percpu_counter_inc(&fs_info->bio_counter);
+ if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+ btrfs_bio_counter_dec(fs_info);
+ wait_event(fs_info->replace_wait,
+ !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state));
+ goto again;
+ }
+
+}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 00000000000..20035cbbf02
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__)
+#define __BTRFS_DEV_REPLACE__
+
+struct btrfs_ioctl_dev_replace_args;
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_start(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+
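+/*
+ * small helper so that callers do not need to care that the dev_replace
+ * error counters are atomic64_t values
+ */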
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{
+ atomic64_inc(stat_value);
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f0cad5ae5be..a0691df5dce 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,6 +21,10 @@
#include "hash.h"
#include "transaction.h"
+static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len);
+
/*
* insert a name into a directory, doing overflow properly if there is a hash
* collision. data_size indicates how big the item inserted should be. On
@@ -49,14 +53,12 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
di = btrfs_match_dir_item_name(root, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- ret = btrfs_extend_item(trans, root, path, data_size);
- WARN_ON(ret > 0);
- }
- if (ret < 0)
+ btrfs_extend_item(root, path, data_size);
+ } else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
leaf = path->nodes[0];
- item = btrfs_item_nr(leaf, path->slots[0]);
+ item = btrfs_item_nr(path->slots[0]);
ptr = btrfs_item_ptr(leaf, path->slots[0], char);
BUG_ON(data_size > btrfs_item_size(leaf, item));
ptr += btrfs_item_size(leaf, item) - data_size;
@@ -90,13 +92,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
data_size = sizeof(*dir_item) + name_len + data_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
- /*
- * FIXME: at some point we should handle xattr's that are larger than
- * what we can fit in our leaf. We set location to NULL b/c we arent
- * pointing at anything else, that will change if we store the xattr
- * data in a separate inode.
- */
- BUG_ON(IS_ERR(dir_item));
+ if (IS_ERR(dir_item))
+ return PTR_ERR(dir_item);
memset(&location, 0, sizeof(location));
leaf = path->nodes[0];
@@ -122,10 +119,12 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* 'location' is the key to stuff into the directory item, 'type' is the
* type of the inode we're pointing to, and 'index' is the sequence number
* to use for the second index (if one is created).
+ * Will return 0 or -ENOMEM
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, const char *name, int name_len, u64 dir,
- struct btrfs_key *location, u8 type, u64 index)
+ *root, const char *name, int name_len,
+ struct inode *dir, struct btrfs_key *location,
+ u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
@@ -137,13 +136,17 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_disk_key disk_key;
u32 data_size;
- key.objectid = dir;
+ key.objectid = btrfs_ino(dir);
btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
key.offset = btrfs_name_hash(name, name_len);
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
path->leave_spinning = 1;
+ btrfs_cpu_key_to_disk(&disk_key, location);
+
data_size = sizeof(*dir_item) + name_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
@@ -151,11 +154,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
ret = PTR_ERR(dir_item);
if (ret == -EEXIST)
goto second_insert;
- goto out;
+ goto out_free;
}
leaf = path->nodes[0];
- btrfs_cpu_key_to_disk(&disk_key, location);
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
btrfs_set_dir_type(leaf, dir_item, type);
btrfs_set_dir_data_len(leaf, dir_item, 0);
@@ -170,29 +172,13 @@ second_insert:
/* FIXME, use some real flag for selecting the extra index */
if (root == root->fs_info->tree_root) {
ret = 0;
- goto out;
+ goto out_free;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
- key.offset = index;
- dir_item = insert_with_overflow(trans, root, path, &key, data_size,
- name, name_len);
- if (IS_ERR(dir_item)) {
- ret2 = PTR_ERR(dir_item);
- goto out;
- }
- leaf = path->nodes[0];
- btrfs_cpu_key_to_disk(&disk_key, location);
- btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
- btrfs_set_dir_type(leaf, dir_item, type);
- btrfs_set_dir_data_len(leaf, dir_item, 0);
- btrfs_set_dir_name_len(leaf, dir_item, name_len);
- btrfs_set_dir_transid(leaf, dir_item, trans->transid);
- name_ptr = (unsigned long)(dir_item + 1);
- write_extent_buffer(leaf, name, name_ptr, name_len);
- btrfs_mark_buffer_dirty(leaf);
-out:
+ ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir,
+ &disk_key, type, index);
+out_free:
btrfs_free_path(path);
if (ret)
return ret;
@@ -216,8 +202,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
key.objectid = dir;
btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
@@ -227,21 +211,69 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const char *name, int name_len)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ int data_size;
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_path *path;
+
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.offset = btrfs_name_hash(name, name_len);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+ /* return back any errors */
+ if (ret < 0)
+ goto out;
+
+ /* nothing found, we're safe */
if (ret > 0) {
- if (path->slots[0] == 0)
- return NULL;
- path->slots[0]--;
+ ret = 0;
+ goto out;
}
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- if (found_key.objectid != dir ||
- btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
- found_key.offset != key.offset)
- return NULL;
+ /* we found an item, look for our name in the item */
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di) {
+ /* our exact name was found */
+ ret = -EEXIST;
+ goto out;
+ }
- return btrfs_match_dir_item_name(root, path, name, name_len);
+ /*
+ * see if there is room in the item to insert this
+ * name
+ */
+ data_size = sizeof(*di) + name_len;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (data_size + btrfs_item_size_nr(leaf, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
+ ret = -EOVERFLOW;
+ } else {
+ /* plenty of insertion room */
+ ret = 0;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
}
/*
@@ -333,8 +365,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
key.objectid = dir;
btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
@@ -342,18 +372,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret);
- if (ret > 0) {
- if (path->slots[0] == 0)
- return NULL;
- path->slots[0]--;
- }
-
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- if (found_key.objectid != dir ||
- btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
- found_key.offset != key.offset)
+ if (ret > 0)
return NULL;
return btrfs_match_dir_item_name(root, path, name, name_len);
@@ -364,7 +383,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
struct btrfs_path *path,
const char *name, int name_len)
{
@@ -377,6 +396,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+ if (verify_dir_item(root, leaf, dir_item))
+ return NULL;
+
total_len = btrfs_item_size_nr(leaf, path->slots[0]);
while (cur < total_len) {
this_len = sizeof(*dir_item) +
@@ -424,8 +446,41 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- ret = btrfs_truncate_item(trans, root, path,
- item_len - sub_item_len, 1);
+ btrfs_truncate_item(root, path, item_len - sub_item_len, 1);
}
return ret;
}
+
+int verify_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_dir_item *dir_item)
+{
+ u16 namelen = BTRFS_NAME_LEN;
+ u8 type = btrfs_dir_type(leaf, dir_item);
+
+ if (type >= BTRFS_FT_MAX) {
+ btrfs_crit(root->fs_info, "invalid dir item type: %d",
+ (int)type);
+ return 1;
+ }
+
+ if (type == BTRFS_FT_XATTR)
+ namelen = XATTR_NAME_MAX;
+
+ if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
+ btrfs_crit(root->fs_info, "invalid dir item name len: %u",
+ (unsigned)btrfs_dir_name_len(leaf, dir_item));
+ return 1;
+ }
+
+ /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
+ if ((btrfs_dir_data_len(leaf, dir_item) +
+ btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
+ btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u",
+ (unsigned)btrfs_dir_name_len(leaf, dir_item),
+ (unsigned)btrfs_dir_data_len(leaf, dir_item));
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 100b07f021b..08e65e9cf2a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,12 +26,15 @@
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
-#include <linux/crc32c.h>
#include <linux/slab.h>
#include <linux/migrate.h>
-#include "compat.h"
+#include <linux/ratelimit.h>
+#include <linux/uuid.h>
+#include <linux/semaphore.h>
+#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
+#include "hash.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
@@ -40,24 +43,36 @@
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
+#include "inode-map.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "raid56.h"
+#include "sysfs.h"
+#include "qgroup.h"
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+ struct btrfs_root *root);
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages,
int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_root *root);
+static void btrfs_error_commit_super(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
@@ -95,40 +110,87 @@ struct async_submit_bio {
*/
u64 bio_offset;
struct btrfs_work work;
+ int error;
};
-/* These are used to set the lockdep class on the extent buffer locks.
- * The class is set by the readpage_end_io_hook after the buffer has
- * passed csum validation but before the pages are unlocked.
+/*
+ * Lockdep class keys for extent_buffer->lock's in this root. For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
*
- * The lockdep class is also set by btrfs_init_new_buffer on freshly
- * allocated blocks.
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets. As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->objectid. This ensures that all special purpose roots
+ * have separate keysets.
*
- * The class is based on the level in the tree block, which allows lockdep
- * to know that lower nodes nest inside the locks of higher nodes.
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock. As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
*
- * We also add a check to make sure the highest level of the tree is
- * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
- * code needs update as well.
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked. It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
+ *
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
# error
# endif
-static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
-static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
- /* leaf */
- "btrfs-extent-00",
- "btrfs-extent-01",
- "btrfs-extent-02",
- "btrfs-extent-03",
- "btrfs-extent-04",
- "btrfs-extent-05",
- "btrfs-extent-06",
- "btrfs-extent-07",
- /* highest possible level */
- "btrfs-extent-08",
+
+static struct btrfs_lockdep_keyset {
+ u64 id; /* root objectid */
+ const char *name_stem; /* lock name stem */
+ char names[BTRFS_MAX_LEVEL + 1][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
+} btrfs_lockdep_keysets[] = {
+ { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
+ { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
+ { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
+ { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
+ { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
+ { .id = 0, .name_stem = "tree" },
};
+
+void __init btrfs_init_lockdep(void)
+{
+ int i, j;
+
+ /* initialize lockdep class names */
+ for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
+ struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
+
+ for (j = 0; j < ARRAY_SIZE(ks->names); j++)
+ snprintf(ks->names[j], sizeof(ks->names[j]),
+ "btrfs-%s-%02d", ks->name_stem, j);
+ }
+}
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
+ int level)
+{
+ struct btrfs_lockdep_keyset *ks;
+
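+ /*
+ * ARRAY_SIZE() only looks at the type of ks->keys, so it is safe
+ * to use ks here before it is assigned
+ */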
+ BUG_ON(level >= ARRAY_SIZE(ks->keys));
+
+ /* find the matching keyset, id 0 is the default entry */
+ for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+ if (ks->id == objectid)
+ break;
+
+ lockdep_set_class_and_name(&eb->lock,
+ &ks->keys[level], ks->names[level]);
+}
+
#endif
/*
@@ -136,7 +198,7 @@ static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
* that covers the entire device
*/
static struct extent_map *btree_get_extent(struct inode *inode,
- struct page *page, size_t page_offset, u64 start, u64 len,
+ struct page *page, size_t pg_offset, u64 start, u64 len,
int create)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -153,7 +215,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
}
read_unlock(&em_tree->lock);
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em) {
em = ERR_PTR(-ENOMEM);
goto out;
@@ -165,40 +227,30 @@ static struct extent_map *btree_get_extent(struct inode *inode,
em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 0);
if (ret == -EEXIST) {
- u64 failed_start = em->start;
- u64 failed_len = em->len;
-
free_extent_map(em);
em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- ret = 0;
- } else {
- em = lookup_extent_mapping(em_tree, failed_start,
- failed_len);
- ret = -EIO;
- }
+ if (!em)
+ em = ERR_PTR(-EIO);
} else if (ret) {
free_extent_map(em);
- em = NULL;
+ em = ERR_PTR(ret);
}
write_unlock(&em_tree->lock);
- if (ret)
- em = ERR_PTR(ret);
out:
return em;
}
-u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+u32 btrfs_csum_data(char *data, u32 seed, size_t len)
{
- return crc32c(seed, data, len);
+ return btrfs_crc32c(seed, data, len);
}
void btrfs_csum_final(u32 crc, char *result)
{
- *(__le32 *)result = ~cpu_to_le32(crc);
+ put_unaligned_le32(~crc, result);
}
/*
@@ -208,13 +260,11 @@ void btrfs_csum_final(u32 crc, char *result)
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify)
{
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
char *result = NULL;
unsigned long len;
unsigned long cur_len;
unsigned long offset = BTRFS_CSUM_SIZE;
- char *map_token = NULL;
char *kaddr;
unsigned long map_start;
unsigned long map_len;
@@ -225,16 +275,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
len = buf->len - offset;
while (len > 0) {
err = map_private_extent_buffer(buf, offset, 32,
- &map_token, &kaddr,
- &map_start, &map_len, KM_USER0);
+ &kaddr, &map_start, &map_len);
if (err)
return 1;
cur_len = min(len, map_len - (offset - map_start));
- crc = btrfs_csum_data(root, kaddr + offset - map_start,
+ crc = btrfs_csum_data(kaddr + offset - map_start,
crc, cur_len);
len -= cur_len;
offset += cur_len;
- unmap_extent_buffer(buf, map_token, KM_USER0);
}
if (csum_size > sizeof(inline_result)) {
result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
@@ -253,14 +301,11 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs: %s checksum verify "
- "failed on %llu wanted %X found %X "
- "level %d\n",
- root->fs_info->sb->s_id,
- (unsigned long long)buf->start, val, found,
- btrfs_header_level(buf));
- }
+ printk_ratelimited(KERN_INFO
+ "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
+ "level %d\n",
+ root->fs_info->sb->s_id, buf->start,
+ val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
kfree(result);
return 1;
@@ -280,33 +325,96 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
* in the wrong place.
*/
static int verify_parent_transid(struct extent_io_tree *io_tree,
- struct extent_buffer *eb, u64 parent_transid)
+ struct extent_buffer *eb, u64 parent_transid,
+ int atomic)
{
struct extent_state *cached_state = NULL;
int ret;
+ bool need_lock = (current->journal_info ==
+ (void *)BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
+ if (atomic)
+ return -EAGAIN;
+
+ if (need_lock) {
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ }
+
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- 0, &cached_state, GFP_NOFS);
- if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
+ 0, &cached_state);
+ if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
}
- if (printk_ratelimit()) {
- printk("parent transid verify failed on %llu wanted %llu "
+ printk_ratelimited("parent transid verify failed on %llu wanted %llu "
"found %llu\n",
- (unsigned long long)eb->start,
- (unsigned long long)parent_transid,
- (unsigned long long)btrfs_header_generation(eb));
- }
+ eb->start, parent_transid, btrfs_header_generation(eb));
ret = 1;
- clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
+
+ /*
+ * Things reading via commit roots that don't have normal protection,
+ * like send, can have a really old block in cache that may point at a
+ * block that has been free'd and re-allocated. So don't clear uptodate
+ * if we find an eb that is under IO (dirty/writeback) because we could
+ * end up reading in the stale data and then writing it back out and
+ * making everybody very sad.
+ */
+ if (!extent_buffer_under_io(eb))
+ clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
+ if (need_lock)
+ btrfs_tree_read_unlock_blocking(eb);
+ return ret;
+}
+
+/*
+ * Return 0 if the superblock checksum type matches the checksum value of that
+ * algorithm. Pass the raw disk superblock data.
+ */
+static int btrfs_check_super_csum(char *raw_disk_sb)
+{
+ struct btrfs_super_block *disk_sb =
+ (struct btrfs_super_block *)raw_disk_sb;
+ u16 csum_type = btrfs_super_csum_type(disk_sb);
+ int ret = 0;
+
+ if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
+ u32 crc = ~(u32)0;
+ const int csum_size = sizeof(crc);
+ char result[csum_size];
+
+ /*
+ * The super_block structure does not span the whole
+ * BTRFS_SUPER_INFO_SIZE range; we expect that the unused space
+ * is filled with zeros and is included in the checksum.
+ */
+ crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
+ crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, result);
+
+ if (memcmp(raw_disk_sb, result, csum_size))
+ ret = 1;
+
+ if (ret && btrfs_super_generation(disk_sb) < 10) {
+ printk(KERN_WARNING
+ "BTRFS: super block crcs don't match, older mkfs detected\n");
+ ret = 0;
+ }
+ }
+
+ if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
+ printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n",
+ csum_type);
+ ret = 1;
+ }
+
return ret;
}
@@ -319,28 +427,56 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
u64 start, u64 parent_transid)
{
struct extent_io_tree *io_tree;
+ int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
+ int failed_mirror = 0;
+ clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
- ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+ ret = read_extent_buffer_pages(io_tree, eb, start,
+ WAIT_COMPLETE,
btree_get_extent, mirror_num);
- if (!ret &&
- !verify_parent_transid(io_tree, eb, parent_transid))
- return ret;
+ if (!ret) {
+ if (!verify_parent_transid(io_tree, eb,
+ parent_transid, 0))
+ break;
+ else
+ ret = -EIO;
+ }
+
+ /*
+ * This buffer's crc is fine, but its contents are corrupted, so
+ * there is no reason to read the other copies, they won't be
+ * any less wrong.
+ */
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+ break;
- num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+ num_copies = btrfs_num_copies(root->fs_info,
eb->start, eb->len);
if (num_copies == 1)
- return ret;
+ break;
+
+ if (!failed_mirror) {
+ failed = 1;
+ failed_mirror = eb->read_mirror;
+ }
mirror_num++;
+ if (mirror_num == failed_mirror)
+ mirror_num++;
+
if (mirror_num > num_copies)
- return ret;
+ break;
}
- return -EIO;
+
+ if (failed && !ret && failed_mirror)
+ repair_eb_io_failure(root, eb, failed_mirror);
+
+ return ret;
}
/*
@@ -350,53 +486,17 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
- struct extent_io_tree *tree;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 found_start;
- unsigned long len;
struct extent_buffer *eb;
- int ret;
-
- tree = &BTRFS_I(page->mapping->host)->io_tree;
-
- if (page->private == EXTENT_PAGE_PRIVATE) {
- WARN_ON(1);
- goto out;
- }
- if (!page->private) {
- WARN_ON(1);
- goto out;
- }
- len = page->private >> 2;
- WARN_ON(len == 0);
-
- eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
- if (eb == NULL) {
- WARN_ON(1);
- goto out;
- }
- ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
- btrfs_header_generation(eb));
- BUG_ON(ret);
- WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
+ eb = (struct extent_buffer *)page->private;
+ if (page != eb->pages[0])
+ return 0;
found_start = btrfs_header_bytenr(eb);
- if (found_start != start) {
- WARN_ON(1);
- goto err;
- }
- if (eb->first_page != page) {
- WARN_ON(1);
- goto err;
- }
- if (!PageUptodate(page)) {
- WARN_ON(1);
- goto err;
- }
+ if (WARN_ON(found_start != start || !PageUptodate(page)))
+ return 0;
csum_tree_block(root, eb, 0);
-err:
- free_extent_buffer(eb);
-out:
return 0;
}
@@ -407,8 +507,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
u8 fsid[BTRFS_UUID_SIZE];
int ret = 1;
- read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
- BTRFS_FSID_SIZE);
+ read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
while (fs_devices) {
if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
ret = 0;
@@ -419,83 +518,179 @@ static int check_tree_block_fsid(struct btrfs_root *root,
return ret;
}
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+#define CORRUPT(reason, eb, root, slot) \
+ btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \
+ "root=%llu, slot=%d", reason, \
+ btrfs_header_bytenr(eb), root->objectid, slot)
+
+static noinline int check_leaf(struct btrfs_root *root,
+ struct extent_buffer *leaf)
{
- lockdep_set_class_and_name(&eb->lock,
- &btrfs_eb_class[level],
- btrfs_eb_name[level]);
+ struct btrfs_key key;
+ struct btrfs_key leaf_key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ if (nritems == 0)
+ return 0;
+
+ /* Check the 0 item */
+ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("invalid item offset size pair", leaf, root, 0);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure each item's keys are in the correct order and
+ * their offsets make sense. We only have to loop through nritems-1
+ * because we check the current slot against the next slot, which
+ * verifies that the next slot's offset+size makes sense and that the
+ * current slot's offset is correct.
+ */
+ for (slot = 0; slot < nritems - 1; slot++) {
+ btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+ btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+ /* Make sure the keys are in the right order */
+ if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+ CORRUPT("bad key order", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (btrfs_item_offset_nr(leaf, slot) !=
+ btrfs_item_end_nr(leaf, slot + 1)) {
+ CORRUPT("slot offset bad", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just in case all the items are consistent with each other, but
+ * all point outside of the leaf.
+ */
+ if (btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("slot end outside of leaf", leaf, root, slot);
+ return -EIO;
+ }
+ }
+
+ return 0;
}
-#endif
-static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
- struct extent_state *state)
+static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+ u64 phy_offset, struct page *page,
+ u64 start, u64 end, int mirror)
{
- struct extent_io_tree *tree;
u64 found_start;
int found_level;
- unsigned long len;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
int ret = 0;
+ int reads_done;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
- goto out;
if (!page->private)
goto out;
- len = page->private >> 2;
- WARN_ON(len == 0);
+ eb = (struct extent_buffer *)page->private;
- eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
- if (eb == NULL) {
+ /* the pending IO might have been the only thing that kept this buffer
+ * in memory. Make sure we have a ref for all these other checks
+ */
+ extent_buffer_get(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
ret = -EIO;
- goto out;
+ goto err;
}
found_start = btrfs_header_bytenr(eb);
- if (found_start != start) {
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs bad tree block start "
+ if (found_start != eb->start) {
+ printk_ratelimited(KERN_INFO "BTRFS: bad tree block start "
"%llu %llu\n",
- (unsigned long long)found_start,
- (unsigned long long)eb->start);
- }
+ found_start, eb->start);
ret = -EIO;
goto err;
}
- if (eb->first_page != page) {
- printk(KERN_INFO "btrfs bad first page %lu %lu\n",
- eb->first_page->index, page->index);
- WARN_ON(1);
+ if (check_tree_block_fsid(root, eb)) {
+ printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n",
+ eb->start);
ret = -EIO;
goto err;
}
- if (check_tree_block_fsid(root, eb)) {
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs bad fsid on block %llu\n",
- (unsigned long long)eb->start);
- }
+ found_level = btrfs_header_level(eb);
+ if (found_level >= BTRFS_MAX_LEVEL) {
+ btrfs_info(root->fs_info, "bad tree block level %d",
+ (int)btrfs_header_level(eb));
ret = -EIO;
goto err;
}
- found_level = btrfs_header_level(eb);
- btrfs_set_buffer_lockdep_class(eb, found_level);
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
+ eb, found_level);
ret = csum_tree_block(root, eb, 1);
- if (ret)
+ if (ret) {
+ ret = -EIO;
+ goto err;
+ }
+
+ /*
+ * If this is a leaf block and it is corrupt, set the corrupt bit so
+ * that we don't try and read the other copies of this block, just
+ * return -EIO.
+ */
+ if (found_level == 0 && check_leaf(root, eb)) {
+ set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
+ }
- end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
- end = eb->start + end - 1;
+ if (!ret)
+ set_extent_buffer_uptodate(eb);
err:
+ if (reads_done &&
+ test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+ btree_readahead_hook(root, eb, eb->start, ret);
+
+ if (ret) {
+ /*
+ * our io error hook is going to dec the io pages
+	 * again, so we have to make sure it has something
+ * to decrement
+ */
+ atomic_inc(&eb->io_pages);
+ clear_extent_buffer_uptodate(eb);
+ }
free_extent_buffer(eb);
out:
return ret;
}
+static int btree_io_failed_hook(struct page *page, int failed_mirror)
+{
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+ eb = (struct extent_buffer *)page->private;
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ eb->read_mirror = failed_mirror;
+ atomic_dec(&eb->io_pages);
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+ btree_readahead_hook(root, eb, eb->start, -EIO);
+ return -EIO; /* we fixed nothing */
+}
+
static void end_workqueue_bio(struct bio *bio, int err)
{
struct end_io_wq *end_io_wq = bio->bi_private;
@@ -503,26 +698,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
fs_info = end_io_wq->info;
end_io_wq->error = err;
- end_io_wq->work.func = end_workqueue_fn;
- end_io_wq->work.flags = 0;
+ btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
if (bio->bi_rw & REQ_WRITE) {
- if (end_io_wq->metadata == 1)
- btrfs_queue_worker(&fs_info->endio_meta_write_workers,
- &end_io_wq->work);
- else if (end_io_wq->metadata == 2)
- btrfs_queue_worker(&fs_info->endio_freespace_worker,
- &end_io_wq->work);
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
+ btrfs_queue_work(fs_info->endio_meta_write_workers,
+ &end_io_wq->work);
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
+ btrfs_queue_work(fs_info->endio_freespace_worker,
+ &end_io_wq->work);
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_write_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_write_workers,
+ &end_io_wq->work);
} else {
- if (end_io_wq->metadata)
- btrfs_queue_worker(&fs_info->endio_meta_workers,
- &end_io_wq->work);
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
+ else if (end_io_wq->metadata)
+ btrfs_queue_work(fs_info->endio_meta_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_workers,
+ &end_io_wq->work);
}
}
@@ -532,6 +732,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
* 0 - if data
 * 1 - if normal metadata
* 2 - if writing to the free space cache area
+ * 3 - raid parity work
*/
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata)
@@ -556,25 +757,22 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
unsigned long limit = min_t(unsigned long,
- info->workers.max_workers,
+ info->thread_pool_size,
info->fs_devices->open_devices);
return 256 * limit;
}
-int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
-{
- return atomic_read(&info->nr_async_bios) >
- btrfs_async_submit_limit(info);
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
+ int ret;
async = container_of(work, struct async_submit_bio, work);
- async->submit_bio_start(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags,
- async->bio_offset);
+ ret = async->submit_bio_start(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
+ if (ret)
+ async->error = ret;
}
static void run_one_async_done(struct btrfs_work *work)
@@ -589,12 +787,16 @@ static void run_one_async_done(struct btrfs_work *work)
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
- atomic_dec(&fs_info->nr_async_submits);
-
- if (atomic_read(&fs_info->nr_async_submits) < limit &&
+ if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
+	/* If an error occurred we just want to clean up the bio and move on */
+ if (async->error) {
+ bio_endio(async->bio, async->error);
+ return;
+ }
+
async->submit_bio_done(async->inode, async->rw, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
@@ -628,20 +830,20 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->submit_bio_start = submit_bio_start;
async->submit_bio_done = submit_bio_done;
- async->work.func = run_one_async_start;
- async->work.ordered_func = run_one_async_done;
- async->work.ordered_free = run_one_async_free;
+ btrfs_init_work(&async->work, run_one_async_start,
+ run_one_async_done, run_one_async_free);
- async->work.flags = 0;
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
+ async->error = 0;
+
atomic_inc(&fs_info->nr_async_submits);
if (rw & REQ_SYNC)
- btrfs_set_work_high_prio(&async->work);
+ btrfs_set_work_high_priority(&async->work);
- btrfs_queue_worker(&fs_info->workers, &async->work);
+ btrfs_queue_work(fs_info->workers, &async->work);
while (atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
@@ -654,18 +856,18 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
static int btree_csum_one_bio(struct bio *bio)
{
- struct bio_vec *bvec = bio->bi_io_vec;
- int bio_index = 0;
+ struct bio_vec *bvec;
struct btrfs_root *root;
+ int i, ret = 0;
- WARN_ON(bio->bi_vcnt <= 0);
- while (bio_index < bio->bi_vcnt) {
+ bio_for_each_segment_all(bvec, bio, i) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- csum_dirty_buffer(root, bvec->bv_page);
- bio_index++;
- bvec++;
+ ret = csum_dirty_buffer(root, bvec->bv_page);
+ if (ret)
+ break;
}
- return 0;
+
+ return ret;
}
static int __btree_submit_bio_start(struct inode *inode, int rw,
@@ -677,54 +879,83 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- btree_csum_one_bio(bio);
- return 0;
+ return btree_csum_one_bio(bio);
}
static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ int ret;
+
/*
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ if (ret)
+ bio_endio(bio, ret);
+ return ret;
+}
+
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+ if (bio_flags & EXTENT_BIO_TREE_LOG)
+ return 0;
+#ifdef CONFIG_X86
+ if (cpu_has_xmm4_2)
+ return 0;
+#endif
+ return 1;
}
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ int async = check_async_write(inode, bio_flags);
int ret;
- ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- bio, 1);
- BUG_ON(ret);
-
if (!(rw & REQ_WRITE)) {
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
*/
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
- mirror_num, 0);
+ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+ bio, 1);
+ if (ret)
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ } else if (!async) {
+ ret = btree_csum_one_bio(bio);
+ if (ret)
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ } else {
+ /*
+ * kthread helpers are used to submit writes so that
+ * checksumming can happen in parallel across all CPUs
+ */
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num, 0,
+ bio_offset,
+ __btree_submit_bio_start,
+ __btree_submit_bio_done);
}
- /*
- * kthread helpers are used to submit writes so that checksumming
- * can happen in parallel across all CPUs
- */
- return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, rw, bio, mirror_num, 0,
- bio_offset,
- __btree_submit_bio_start,
- __btree_submit_bio_done);
+ if (ret) {
+out_w_error:
+ bio_endio(bio, ret);
+ }
+ return ret;
}
#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
@@ -739,118 +970,88 @@ static int btree_migratepage(struct address_space *mapping,
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct extent_io_tree *tree;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct extent_buffer *eb;
- int was_dirty;
-
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (!(current->flags & PF_MEMALLOC)) {
- return extent_write_full_page(tree, page,
- btree_get_extent, wbc);
- }
-
- redirty_page_for_writepage(wbc, page);
- eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
- WARN_ON(!eb);
-
- was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- if (!was_dirty) {
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
- spin_unlock(&root->fs_info->delalloc_lock);
- }
- free_extent_buffer(eb);
-
- unlock_page(page);
- return 0;
-}
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_io_tree *tree;
- tree = &BTRFS_I(mapping->host)->io_tree;
+ struct btrfs_fs_info *fs_info;
+ int ret;
+
if (wbc->sync_mode == WB_SYNC_NONE) {
- struct btrfs_root *root = BTRFS_I(mapping->host)->root;
- u64 num_dirty;
- unsigned long thresh = 32 * 1024 * 1024;
if (wbc->for_kupdate)
return 0;
+ fs_info = BTRFS_I(mapping->host)->root->fs_info;
/* this is a bit racy, but that's ok */
- num_dirty = root->fs_info->dirty_metadata_bytes;
- if (num_dirty < thresh)
+ ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH);
+ if (ret < 0)
return 0;
}
- return extent_writepages(tree, mapping, btree_get_extent, wbc);
+ return btree_write_cache_pages(mapping, wbc);
}
static int btree_readpage(struct file *file, struct page *page)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btree_get_extent);
+ return extent_read_full_page(tree, page, btree_get_extent, 0);
}
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
- struct extent_io_tree *tree;
- struct extent_map_tree *map;
- int ret;
-
if (PageWriteback(page) || PageDirty(page))
return 0;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- map = &BTRFS_I(page->mapping->host)->extent_tree;
-
- ret = try_release_extent_state(map, tree, page, gfp_flags);
- if (!ret)
- return 0;
-
- ret = try_release_extent_buffer(tree, page);
- if (ret == 1) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- page_cache_release(page);
- }
-
- return ret;
+ return try_release_extent_buffer(page);
}
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
extent_invalidatepage(tree, page, offset);
btree_releasepage(page, GFP_NOFS);
if (PagePrivate(page)) {
- printk(KERN_WARNING "btrfs warning page private not zero "
- "on page %llu\n", (unsigned long long)page_offset(page));
+ btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
+ "page private not zero on page %llu",
+ (unsigned long long)page_offset(page));
ClearPagePrivate(page);
set_page_private(page, 0);
page_cache_release(page);
}
}
+static int btree_set_page_dirty(struct page *page)
+{
+#ifdef DEBUG
+ struct extent_buffer *eb;
+
+ BUG_ON(!PagePrivate(page));
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+#endif
+ return __set_page_dirty_nobuffers(page);
+}
+
static const struct address_space_operations btree_aops = {
.readpage = btree_readpage,
- .writepage = btree_writepage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
- .sync_page = block_sync_page,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
+ .set_page_dirty = btree_set_page_dirty,
};
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -864,42 +1065,70 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
if (!buf)
return 0;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, 0, 0, btree_get_extent, 0);
+ buf, 0, WAIT_NONE, btree_get_extent, 0);
free_extent_buffer(buf);
return ret;
}
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ int mirror_num, struct extent_buffer **eb)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+ int ret;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return 0;
+
+ set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+ ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+ btree_get_extent, mirror_num);
+ if (ret) {
+ free_extent_buffer(buf);
+ return ret;
+ }
+
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+ free_extent_buffer(buf);
+ return -EIO;
+ } else if (extent_buffer_uptodate(buf)) {
+ *eb = buf;
+ } else {
+ free_extent_buffer(buf);
+ }
+ return 0;
+}
+
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
- struct inode *btree_inode = root->fs_info->btree_inode;
- struct extent_buffer *eb;
- eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize, GFP_NOFS);
- return eb;
+ return find_extent_buffer(root->fs_info, bytenr);
}
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
- struct inode *btree_inode = root->fs_info->btree_inode;
- struct extent_buffer *eb;
-
- eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize, NULL, GFP_NOFS);
- return eb;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return alloc_test_extent_buffer(root->fs_info, bytenr,
+ blocksize);
+#endif
+ return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
}
int btrfs_write_tree_block(struct extent_buffer *buf)
{
- return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+ return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
buf->start + buf->len - 1);
}
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
- return filemap_fdatawait_range(buf->first_page->mapping,
+ return filemap_fdatawait_range(buf->pages[0]->mapping,
buf->start, buf->start + buf->len - 1);
}
@@ -914,42 +1143,64 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
return NULL;
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-
- if (ret == 0)
- set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
+ if (ret) {
+ free_extent_buffer(buf);
+ return NULL;
+ }
return buf;
}
-int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf)
+void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf)
{
- struct inode *btree_inode = root->fs_info->btree_inode;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
if (btrfs_header_generation(buf) ==
- root->fs_info->running_transaction->transid) {
+ fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (root->fs_info->dirty_metadata_bytes >= buf->len)
- root->fs_info->dirty_metadata_bytes -= buf->len;
- else
- WARN_ON(1);
- spin_unlock(&root->fs_info->delalloc_lock);
+ __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+ -buf->len,
+ fs_info->dirty_metadata_batch);
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
+ clear_extent_buffer_dirty(buf);
}
+ }
+}
+
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+ struct btrfs_subvolume_writers *writers;
+ int ret;
- /* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking(buf);
- clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ writers = kmalloc(sizeof(*writers), GFP_NOFS);
+ if (!writers)
+ return ERR_PTR(-ENOMEM);
+
+ ret = percpu_counter_init(&writers->counter, 0);
+ if (ret < 0) {
+ kfree(writers);
+ return ERR_PTR(ret);
}
- return 0;
+
+ init_waitqueue_head(&writers->wait);
+ return writers;
}
-static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- u32 stripesize, struct btrfs_root *root,
- struct btrfs_fs_info *fs_info,
- u64 objectid)
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+ percpu_counter_destroy(&writers->counter);
+ kfree(writers);
+}
+
+static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+ u32 stripesize, struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
{
root->node = NULL;
root->commit_root = NULL;
@@ -957,88 +1208,178 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->nodesize = nodesize;
root->leafsize = leafsize;
root->stripesize = stripesize;
- root->ref_cows = 0;
- root->track_dirty = 0;
- root->in_radix = 0;
- root->orphan_item_inserted = 0;
+ root->state = 0;
root->orphan_cleanup_state = 0;
- root->fs_info = fs_info;
root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
+ root->nr_delalloc_inodes = 0;
+ root->nr_ordered_extents = 0;
root->name = NULL;
- root->in_sysfs = 0;
root->inode_tree = RB_ROOT;
+ INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
root->block_rsv = NULL;
root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
- INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
- spin_lock_init(&root->node_lock);
+ INIT_LIST_HEAD(&root->delalloc_inodes);
+ INIT_LIST_HEAD(&root->delalloc_root);
+ INIT_LIST_HEAD(&root->ordered_extents);
+ INIT_LIST_HEAD(&root->ordered_root);
+ INIT_LIST_HEAD(&root->logged_list[0]);
+ INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
+ spin_lock_init(&root->delalloc_lock);
+ spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
+ spin_lock_init(&root->log_extents_lock[0]);
+ spin_lock_init(&root->log_extents_lock[1]);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
+ mutex_init(&root->ordered_extent_mutex);
+ mutex_init(&root->delalloc_mutex);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
+ INIT_LIST_HEAD(&root->log_ctxs[0]);
+ INIT_LIST_HEAD(&root->log_ctxs[1]);
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
- root->log_batch = 0;
+ atomic_set(&root->log_batch, 0);
+ atomic_set(&root->orphan_inodes, 0);
+ atomic_set(&root->refs, 1);
+ atomic_set(&root->will_be_snapshoted, 0);
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
- extent_io_tree_init(&root->dirty_log_pages,
- fs_info->btree_inode->i_mapping, GFP_NOFS);
+ if (fs_info)
+ extent_io_tree_init(&root->dirty_log_pages,
+ fs_info->btree_inode->i_mapping);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
memset(&root->root_kobj, 0, sizeof(root->root_kobj));
- root->defrag_trans_start = fs_info->generation;
+ if (fs_info)
+ root->defrag_trans_start = fs_info->generation;
+ else
+ root->defrag_trans_start = 0;
init_completion(&root->kobj_unregister);
- root->defrag_running = 0;
root->root_key.objectid = objectid;
- root->anon_super.s_root = NULL;
- root->anon_super.s_dev = 0;
- INIT_LIST_HEAD(&root->anon_super.s_list);
- INIT_LIST_HEAD(&root->anon_super.s_instances);
- init_rwsem(&root->anon_super.s_umount);
+ root->anon_dev = 0;
- return 0;
+ spin_lock_init(&root->root_item_lock);
}
-static int find_and_setup_root(struct btrfs_root *tree_root,
- struct btrfs_fs_info *fs_info,
- u64 objectid,
- struct btrfs_root *root)
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
{
- int ret;
- u32 blocksize;
- u64 generation;
+ struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (root)
+ root->fs_info = fs_info;
+ return root;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/* Should only be used by the testing infrastructure */
+struct btrfs_root *btrfs_alloc_dummy_root(void)
+{
+ struct btrfs_root *root;
+
+ root = btrfs_alloc_root(NULL);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+ __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+ set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
+ root->alloc_bytenr = 0;
+
+ return root;
+}
+#endif
+
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int ret = 0;
+ uuid_le uuid;
+
+ root = btrfs_alloc_root(fs_info);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
__setup_root(tree_root->nodesize, tree_root->leafsize,
tree_root->sectorsize, tree_root->stripesize,
root, fs_info, objectid);
- ret = btrfs_find_last_root(tree_root, objectid,
- &root->root_item, &root->root_key);
- if (ret > 0)
- return -ENOENT;
- BUG_ON(ret);
+ root->root_key.objectid = objectid;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
- generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
- root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
- if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
- free_extent_buffer(root->node);
- return -EIO;
+ leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+ 0, objectid, NULL, 0, 0, 0);
+ if (IS_ERR(leaf)) {
+ ret = PTR_ERR(leaf);
+ leaf = NULL;
+ goto fail;
}
+
+ memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(leaf, leaf->start);
+ btrfs_set_header_generation(leaf, trans->transid);
+ btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(leaf, objectid);
+ root->node = leaf;
+
+ write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
+ BTRFS_FSID_SIZE);
+ write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
+ btrfs_header_chunk_tree_uuid(leaf),
+ BTRFS_UUID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
root->commit_root = btrfs_root_node(root);
- return 0;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+
+ root->root_item.flags = 0;
+ root->root_item.byte_limit = 0;
+ btrfs_set_root_bytenr(&root->root_item, leaf->start);
+ btrfs_set_root_generation(&root->root_item, trans->transid);
+ btrfs_set_root_level(&root->root_item, 0);
+ btrfs_set_root_refs(&root->root_item, 1);
+ btrfs_set_root_used(&root->root_item, leaf->len);
+ btrfs_set_root_last_snapshot(&root->root_item, 0);
+ btrfs_set_root_dirid(&root->root_item, 0);
+ uuid_le_gen(&uuid);
+ memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
+ root->root_item.drop_level = 0;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+ if (ret)
+ goto fail;
+
+ btrfs_tree_unlock(leaf);
+
+ return root;
+
+fail:
+ if (leaf) {
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(root->commit_root);
+ free_extent_buffer(leaf);
+ }
+ kfree(root);
+
+ return ERR_PTR(ret);
}
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
@@ -1048,7 +1389,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- root = kzalloc(sizeof(*root), GFP_NOFS);
+ root = btrfs_alloc_root(fs_info);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1059,16 +1400,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+
/*
+ * DON'T set REF_COWS for log trees
+ *
* log trees do not get reference counted because they go away
* before a real commit is actually done. They do store pointers
* to file data extents, and those reference counts still get
* updated (along with back refs to the log tree).
*/
- root->ref_cows = 0;
leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+ BTRFS_TREE_LOG_OBJECTID, NULL,
+ 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
@@ -1082,8 +1426,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->node = leaf;
write_extent_buffer(root->node, root->fs_info->fsid,
- (unsigned long)btrfs_header_fsid(root->node),
- BTRFS_FSID_SIZE);
+ btrfs_header_fsid(), BTRFS_FSID_SIZE);
btrfs_mark_buffer_dirty(root->node);
btrfs_tree_unlock(root->node);
return root;
@@ -1116,97 +1459,169 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
log_root->root_key.offset = root->root_key.objectid;
inode_item = &log_root->root_item.inode;
- inode_item->generation = cpu_to_le64(1);
- inode_item->size = cpu_to_le64(3);
- inode_item->nlink = cpu_to_le32(1);
- inode_item->nbytes = cpu_to_le64(root->leafsize);
- inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+ btrfs_set_stack_inode_generation(inode_item, 1);
+ btrfs_set_stack_inode_size(inode_item, 3);
+ btrfs_set_stack_inode_nlink(inode_item, 1);
+ btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_node(&log_root->root_item, log_root->node);
WARN_ON(root->log_root);
root->log_root = log_root;
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
return 0;
}
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
- struct btrfs_key *location)
+static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
- struct extent_buffer *l;
u64 generation;
u32 blocksize;
- int ret = 0;
+ int ret;
- root = kzalloc(sizeof(*root), GFP_NOFS);
- if (!root)
+ path = btrfs_alloc_path();
+ if (!path)
return ERR_PTR(-ENOMEM);
- if (location->offset == (u64)-1) {
- ret = find_and_setup_root(tree_root, fs_info,
- location->objectid, root);
- if (ret) {
- kfree(root);
- return ERR_PTR(ret);
- }
- goto out;
+
+ root = btrfs_alloc_root(fs_info);
+ if (!root) {
+ ret = -ENOMEM;
+ goto alloc_fail;
}
__setup_root(tree_root->nodesize, tree_root->leafsize,
tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, location->objectid);
+ root, fs_info, key->objectid);
- path = btrfs_alloc_path();
- BUG_ON(!path);
- ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
- if (ret == 0) {
- l = path->nodes[0];
- read_extent_buffer(l, &root->root_item,
- btrfs_item_ptr_offset(l, path->slots[0]),
- sizeof(root->root_item));
- memcpy(&root->root_key, location, sizeof(*location));
- }
- btrfs_free_path(path);
+ ret = btrfs_find_root(tree_root, key, path,
+ &root->root_item, &root->root_key);
if (ret) {
- kfree(root);
if (ret > 0)
ret = -ENOENT;
- return ERR_PTR(ret);
+ goto find_fail;
}
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
+ if (!root->node) {
+ ret = -ENOMEM;
+ goto find_fail;
+ } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ ret = -EIO;
+ goto read_fail;
+ }
root->commit_root = btrfs_root_node(root);
- BUG_ON(!root->node);
out:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
- root->ref_cows = 1;
-
+ btrfs_free_path(path);
return root;
+
+read_fail:
+ free_extent_buffer(root->node);
+find_fail:
+ kfree(root);
+alloc_fail:
+ root = ERR_PTR(ret);
+ goto out;
}
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_objectid)
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location)
{
struct btrfs_root *root;
- if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
- return fs_info->tree_root;
- if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return fs_info->extent_root;
+ root = btrfs_read_tree_root(tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ set_bit(BTRFS_ROOT_REF_COWS, &root->state);
+ btrfs_check_and_init_root_item(&root->root_item);
+ }
+ return root;
+}
+
+int btrfs_init_fs_root(struct btrfs_root *root)
+{
+ int ret;
+ struct btrfs_subvolume_writers *writers;
+
+ root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+ root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+ GFP_NOFS);
+ if (!root->free_ino_pinned || !root->free_ino_ctl) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ writers = btrfs_alloc_subvolume_writers();
+ if (IS_ERR(writers)) {
+ ret = PTR_ERR(writers);
+ goto fail;
+ }
+ root->subv_writers = writers;
+
+ btrfs_init_free_ino_ctl(root);
+ spin_lock_init(&root->cache_lock);
+ init_waitqueue_head(&root->cache_wait);
+
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto free_writers;
+ return 0;
+
+free_writers:
+ btrfs_free_subvolume_writers(root->subv_writers);
+fail:
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
+ return ret;
+}
+
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
+{
+ struct btrfs_root *root;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)root_objectid);
+ (unsigned long)root_id);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return root;
}
-struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location)
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ return ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
+ if (ret == 0)
+ set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
+
+ return ret;
+}
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location,
+ bool check_ref)
{
struct btrfs_root *root;
int ret;
@@ -1221,44 +1636,41 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->dev_root;
if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
return fs_info->csum_root;
+ if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return fs_info->quota_root ? fs_info->quota_root :
+ ERR_PTR(-ENOENT);
+ if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
+ return fs_info->uuid_root ? fs_info->uuid_root :
+ ERR_PTR(-ENOENT);
again:
- spin_lock(&fs_info->fs_roots_radix_lock);
- root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)location->objectid);
- spin_unlock(&fs_info->fs_roots_radix_lock);
- if (root)
+ root = btrfs_lookup_fs_root(fs_info, location->objectid);
+ if (root) {
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0)
+ return ERR_PTR(-ENOENT);
return root;
+ }
- root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+ root = btrfs_read_fs_root(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
- set_anon_super(&root->anon_super, NULL);
-
- if (btrfs_root_refs(&root->root_item) == 0) {
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
ret = -ENOENT;
goto fail;
}
- ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
- if (ret < 0)
- goto fail;
- if (ret == 0)
- root->orphan_item_inserted = 1;
-
- ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ ret = btrfs_init_fs_root(root);
if (ret)
goto fail;
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- root);
+ ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
+ location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
+ if (ret < 0)
+ goto fail;
if (ret == 0)
- root->in_radix = 1;
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
- spin_unlock(&fs_info->fs_roots_radix_lock);
- radix_tree_preload_end();
+ ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
if (ret == -EEXIST) {
free_fs_root(root);
@@ -1266,51 +1678,12 @@ again:
}
goto fail;
}
-
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root->root_key.objectid);
- WARN_ON(ret);
return root;
fail:
free_fs_root(root);
return ERR_PTR(ret);
}
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location,
- const char *name, int namelen)
-{
- return btrfs_read_fs_root_no_name(fs_info, location);
-#if 0
- struct btrfs_root *root;
- int ret;
-
- root = btrfs_read_fs_root_no_name(fs_info, location);
- if (!root)
- return NULL;
-
- if (root->in_sysfs)
- return root;
-
- ret = btrfs_set_root_name(root, name, namelen);
- if (ret) {
- free_extent_buffer(root->node);
- kfree(root);
- return ERR_PTR(ret);
- }
-
- ret = btrfs_sysfs_add_root(root);
- if (ret) {
- free_extent_buffer(root->node);
- kfree(root->name);
- kfree(root);
- return ERR_PTR(ret);
- }
- root->in_sysfs = 1;
- return root;
-#endif
-}
-
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
{
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
@@ -1318,7 +1691,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
struct btrfs_device *device;
struct backing_dev_info *bdi;
- list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
if (!device->bdev)
continue;
bdi = blk_get_backing_dev_info(device->bdev);
@@ -1327,86 +1701,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
break;
}
}
+ rcu_read_unlock();
return ret;
}
/*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
- struct btrfs_device *device;
- struct btrfs_fs_info *info;
-
- info = (struct btrfs_fs_info *)bdi->unplug_io_data;
- list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
- if (!device->bdev)
- continue;
-
- bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi->unplug_io_fn)
- bdi->unplug_io_fn(bdi, page);
- }
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
- struct inode *inode;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct address_space *mapping;
- u64 offset;
-
- /* the generic O_DIRECT read code does this */
- if (1 || !page) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- /*
- * page->mapping may change at any time. Get a consistent copy
- * and use that for everything below
- */
- smp_mb();
- mapping = page->mapping;
- if (!mapping)
- return;
-
- inode = mapping->host;
-
- /*
- * don't do the expensive searching for a small number of
- * devices
- */
- if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- offset = page_offset(page);
-
- em_tree = &BTRFS_I(inode)->extent_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
- read_unlock(&em_tree->lock);
- if (!em) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- __unplug_io_fn(bdi, page);
- return;
- }
- offset = offset - em->start;
- btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
- em->block_start + offset, page);
- free_extent_map(em);
-}
-
-/*
* If this fails, caller must call bdi_destroy() to get rid of the
* bdi again.
*/
@@ -1420,48 +1719,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
return err;
bdi->ra_pages = default_backing_dev_info.ra_pages;
- bdi->unplug_io_fn = btrfs_unplug_io_fn;
- bdi->unplug_io_data = info;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
return 0;
}
-static int bio_ready_for_csum(struct bio *bio)
-{
- u64 length = 0;
- u64 buf_len = 0;
- u64 start = 0;
- struct page *page;
- struct extent_io_tree *io_tree = NULL;
- struct bio_vec *bvec;
- int i;
- int ret;
-
- bio_for_each_segment(bvec, bio, i) {
- page = bvec->bv_page;
- if (page->private == EXTENT_PAGE_PRIVATE) {
- length += bvec->bv_len;
- continue;
- }
- if (!page->private) {
- length += bvec->bv_len;
- continue;
- }
- length = bvec->bv_len;
- buf_len = page->private >> 2;
- start = page_offset(page) + bvec->bv_offset;
- io_tree = &BTRFS_I(page->mapping->host)->io_tree;
- }
- /* are we fully contained in this bio? */
- if (buf_len <= length)
- return 1;
-
- ret = extent_range_uptodate(io_tree, start + length,
- start + buf_len - 1);
- return ret;
-}
-
/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
@@ -1470,48 +1732,53 @@ static void end_workqueue_fn(struct btrfs_work *work)
{
struct bio *bio;
struct end_io_wq *end_io_wq;
- struct btrfs_fs_info *fs_info;
int error;
end_io_wq = container_of(work, struct end_io_wq, work);
bio = end_io_wq->bio;
- fs_info = end_io_wq->info;
- /* metadata bio reads are special because the whole tree block must
- * be checksummed at once. This makes sure the entire block is in
- * ram and up to date before trying to verify things. For
- * blocksize <= pagesize, it is basically a noop
- */
- if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
- !bio_ready_for_csum(bio)) {
- btrfs_queue_worker(&fs_info->endio_meta_workers,
- &end_io_wq->work);
- return;
- }
error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
kfree(end_io_wq);
- bio_endio(bio, error);
+ bio_endio_nodec(bio, error);
}
static int cleaner_kthread(void *arg)
{
struct btrfs_root *root = arg;
+ int again;
do {
- vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+ again = 0;
+
+ /* Make the cleaner go to sleep early. */
+ if (btrfs_need_cleaner_sleep(root))
+ goto sleep;
+
+ if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+ goto sleep;
- if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
- mutex_trylock(&root->fs_info->cleaner_mutex)) {
- btrfs_run_delayed_iputs(root);
- btrfs_clean_old_snapshots(root);
+ /*
+		 * Avoid the case where the status of the fs changes between
+		 * the above check and the trylock.
+ */
+ if (btrfs_need_cleaner_sleep(root)) {
mutex_unlock(&root->fs_info->cleaner_mutex);
+ goto sleep;
}
- if (freezing(current)) {
- refrigerator();
- } else {
+ btrfs_run_delayed_iputs(root);
+ again = btrfs_clean_one_deleted_snapshot(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ /*
+ * The defragger has dealt with the R/O remount and umount,
+		 * so we needn't do anything special here.
+ */
+ btrfs_run_defrag_inodes(root->fs_info);
+sleep:
+ if (!try_to_freeze() && !again) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
schedule();
@@ -1529,35 +1796,40 @@ static int transaction_kthread(void *arg)
u64 transid;
unsigned long now;
unsigned long delay;
- int ret;
+ bool cannot_commit;
do {
- delay = HZ * 30;
- vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+ cannot_commit = false;
+ delay = HZ * root->fs_info->commit_interval;
mutex_lock(&root->fs_info->transaction_kthread_mutex);
- spin_lock(&root->fs_info->new_trans_lock);
+ spin_lock(&root->fs_info->trans_lock);
cur = root->fs_info->running_transaction;
if (!cur) {
- spin_unlock(&root->fs_info->new_trans_lock);
+ spin_unlock(&root->fs_info->trans_lock);
goto sleep;
}
now = get_seconds();
- if (!cur->blocked &&
- (now < cur->start_time || now - cur->start_time < 30)) {
- spin_unlock(&root->fs_info->new_trans_lock);
+ if (cur->state < TRANS_STATE_BLOCKED &&
+ (now < cur->start_time ||
+ now - cur->start_time < root->fs_info->commit_interval)) {
+ spin_unlock(&root->fs_info->trans_lock);
delay = HZ * 5;
goto sleep;
}
transid = cur->transid;
- spin_unlock(&root->fs_info->new_trans_lock);
+ spin_unlock(&root->fs_info->trans_lock);
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
+ /* If the file system is aborted, this will always fail. */
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ cannot_commit = true;
+ goto sleep;
+ }
if (transid == trans->transid) {
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
+ btrfs_commit_transaction(trans, root);
} else {
btrfs_end_transaction(trans, root);
}
@@ -1565,12 +1837,14 @@ sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
- if (freezing(current)) {
- refrigerator();
- } else {
+ if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
+ &root->fs_info->fs_state)))
+ btrfs_cleanup_transaction(root);
+ if (!try_to_freeze()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
- !btrfs_transaction_blocked(root->fs_info))
+ (!btrfs_transaction_blocked(root->fs_info) ||
+ cannot_commit))
schedule_timeout(delay);
__set_current_state(TASK_RUNNING);
}
@@ -1578,9 +1852,291 @@ sleep:
return 0;
}
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options)
+/*
+ * this will find the highest generation in the array of
+ * root backups. The index of the highest entry is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest root in the array with the generation
+ * in the super block. If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+ u64 cur;
+ int newest_index = -1;
+ struct btrfs_root_backup *root_backup;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ root_backup = info->super_copy->super_roots + i;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = i;
+ }
+
+ /* check to see if we actually wrapped around */
+ if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+ root_backup = info->super_copy->super_roots;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = 0;
+ }
+ return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array. This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+ u64 newest_gen)
+{
+ int newest_index = -1;
+
+ newest_index = find_newest_super_backup(info, newest_gen);
+ /* if there was garbage in there, just move along */
+ if (newest_index == -1) {
+ info->backup_root_index = 0;
+ } else {
+ info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+ }
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+ int next_backup;
+ struct btrfs_root_backup *root_backup;
+ int last_backup;
+
+ next_backup = info->backup_root_index;
+ last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+
+ /*
+	 * just overwrite the last backup if we're at the same generation;
+ * this happens only at umount
+ */
+ root_backup = info->super_for_commit->super_roots + last_backup;
+ if (btrfs_backup_tree_root_gen(root_backup) ==
+ btrfs_header_generation(info->tree_root->node))
+ next_backup = last_backup;
+
+ root_backup = info->super_for_commit->super_roots + next_backup;
+
+ /*
+ * make sure all of our padding and empty slots get zero filled
+ * regardless of which ones we use today
+ */
+ memset(root_backup, 0, sizeof(*root_backup));
+
+ info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+ btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+ btrfs_set_backup_tree_root_gen(root_backup,
+ btrfs_header_generation(info->tree_root->node));
+
+ btrfs_set_backup_tree_root_level(root_backup,
+ btrfs_header_level(info->tree_root->node));
+
+ btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+ btrfs_set_backup_chunk_root_gen(root_backup,
+ btrfs_header_generation(info->chunk_root->node));
+ btrfs_set_backup_chunk_root_level(root_backup,
+ btrfs_header_level(info->chunk_root->node));
+
+ btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(info->extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(info->extent_root->node));
+
+ /*
+ * we might commit during log recovery, which happens before we set
+ * the fs_root. Make sure it is valid before we fill it in.
+ */
+ if (info->fs_root && info->fs_root->node) {
+ btrfs_set_backup_fs_root(root_backup,
+ info->fs_root->node->start);
+ btrfs_set_backup_fs_root_gen(root_backup,
+ btrfs_header_generation(info->fs_root->node));
+ btrfs_set_backup_fs_root_level(root_backup,
+ btrfs_header_level(info->fs_root->node));
+ }
+
+ btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+ btrfs_set_backup_dev_root_gen(root_backup,
+ btrfs_header_generation(info->dev_root->node));
+ btrfs_set_backup_dev_root_level(root_backup,
+ btrfs_header_level(info->dev_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(info->csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(info->csum_root->node));
+
+ btrfs_set_backup_total_bytes(root_backup,
+ btrfs_super_total_bytes(info->super_copy));
+ btrfs_set_backup_bytes_used(root_backup,
+ btrfs_super_bytes_used(info->super_copy));
+ btrfs_set_backup_num_devices(root_backup,
+ btrfs_super_num_devices(info->super_copy));
+
+ /*
+ * if we don't copy this out to the super_copy, it won't get remembered
+ * for the next commit
+ */
+ memcpy(&info->super_copy->super_roots,
+ &info->super_for_commit->super_roots,
+ sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block. It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+ struct btrfs_super_block *super,
+ int *num_backups_tried, int *backup_index)
+{
+ struct btrfs_root_backup *root_backup;
+ int newest = *backup_index;
+
+ if (*num_backups_tried == 0) {
+ u64 gen = btrfs_super_generation(super);
+
+ newest = find_newest_super_backup(info, gen);
+ if (newest == -1)
+ return -1;
+
+ *backup_index = newest;
+ *num_backups_tried = 1;
+ } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+ /* we've tried all the backups, all done */
+ return -1;
+ } else {
+ /* jump to the next oldest backup */
+ newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+ *backup_index = newest;
+ *num_backups_tried += 1;
+ }
+ root_backup = super->super_roots + newest;
+
+ btrfs_set_super_generation(super,
+ btrfs_backup_tree_root_gen(root_backup));
+ btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+ btrfs_set_super_root_level(super,
+ btrfs_backup_tree_root_level(root_backup));
+ btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+ /*
+	 * fixme: the total bytes and num_devices need to match, or else we
+	 * need an fsck
+ */
+ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+ btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+ return 0;
+}
+
+/* helper to cleanup workers */
+static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
+{
+ btrfs_destroy_workqueue(fs_info->fixup_workers);
+ btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->workers);
+ btrfs_destroy_workqueue(fs_info->endio_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+ btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->rmw_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+ btrfs_destroy_workqueue(fs_info->submit_workers);
+ btrfs_destroy_workqueue(fs_info->delayed_workers);
+ btrfs_destroy_workqueue(fs_info->caching_workers);
+ btrfs_destroy_workqueue(fs_info->readahead_workers);
+ btrfs_destroy_workqueue(fs_info->flush_workers);
+ btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
+ btrfs_destroy_workqueue(fs_info->extent_workers);
+}
+
+static void free_root_extent_buffers(struct btrfs_root *root)
+{
+ if (root) {
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ root->node = NULL;
+ root->commit_root = NULL;
+ }
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+ free_root_extent_buffers(info->tree_root);
+
+ free_root_extent_buffers(info->dev_root);
+ free_root_extent_buffers(info->extent_root);
+ free_root_extent_buffers(info->csum_root);
+ free_root_extent_buffers(info->quota_root);
+ free_root_extent_buffers(info->uuid_root);
+ if (chunk_root)
+ free_root_extent_buffers(info->chunk_root);
+}
+
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *gang[8];
+ int i;
+
+ while (!list_empty(&fs_info->dead_roots)) {
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
+
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
+ btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+ } else {
+ free_extent_buffer(gang[0]->node);
+ free_extent_buffer(gang[0]->commit_root);
+ btrfs_put_fs_root(gang[0]);
+ }
+ }
+
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++)
+ btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+ }
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ btrfs_free_log_root_tree(NULL, fs_info);
+ btrfs_destroy_pinned_extent(fs_info->tree_root,
+ fs_info->pinned_extents);
+ }
+}
+
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options)
{
u32 sectorsize;
u32 nodesize;
@@ -1591,25 +2147,28 @@ struct btrfs_root *open_ctree(struct super_block *sb,
u64 features;
struct btrfs_key location;
struct buffer_head *bh;
- struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = tree_root->fs_info;
- struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
- struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
+ struct btrfs_super_block *disk_super;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
+ struct btrfs_root *extent_root;
+ struct btrfs_root *csum_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
+ struct btrfs_root *quota_root;
+ struct btrfs_root *uuid_root;
struct btrfs_root *log_tree_root;
-
int ret;
int err = -EINVAL;
-
- struct btrfs_super_block *disk_super;
-
- if (!extent_root || !tree_root || !fs_info ||
- !chunk_root || !dev_root || !csum_root) {
+ int num_backups_tried = 0;
+ int backup_index = 0;
+ int max_active;
+ int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+ bool create_uuid_tree;
+ bool check_uuid_tree;
+
+ tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+ chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+ if (!tree_root || !chunk_root) {
err = -ENOMEM;
goto fail;
}
@@ -1626,63 +2185,129 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_srcu;
}
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+ if (ret) {
+ err = ret;
+ goto fail_bdi;
+ }
+ fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+ (1 + ilog2(nr_cpu_ids));
+
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+ if (ret) {
+ err = ret;
+ goto fail_dirty_metadata_bytes;
+ }
+
+ ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ if (ret) {
+ err = ret;
+ goto fail_delalloc_bytes;
+ }
+
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
- goto fail_bdi;
+ goto fail_bio_counter;
}
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
- INIT_LIST_HEAD(&fs_info->hashers);
- INIT_LIST_HEAD(&fs_info->delalloc_inodes);
- INIT_LIST_HEAD(&fs_info->ordered_operations);
+ INIT_LIST_HEAD(&fs_info->delalloc_roots);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
- spin_lock_init(&fs_info->delalloc_lock);
- spin_lock_init(&fs_info->new_trans_lock);
- spin_lock_init(&fs_info->ref_cache_lock);
+ spin_lock_init(&fs_info->delalloc_root_lock);
+ spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
+ spin_lock_init(&fs_info->defrag_inodes_lock);
+ spin_lock_init(&fs_info->free_chunk_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
+ spin_lock_init(&fs_info->buffer_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
+ mutex_init(&fs_info->reloc_mutex);
+ mutex_init(&fs_info->delalloc_root_mutex);
+ seqlock_init(&fs_info->profiles_lock);
init_completion(&fs_info->kobj_unregister);
- fs_info->tree_root = tree_root;
- fs_info->extent_root = extent_root;
- fs_info->csum_root = csum_root;
- fs_info->chunk_root = chunk_root;
- fs_info->dev_root = dev_root;
- fs_info->fs_devices = fs_devices;
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
btrfs_mapping_init(&fs_info->mapping_tree);
- btrfs_init_block_rsv(&fs_info->global_block_rsv);
- btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
- btrfs_init_block_rsv(&fs_info->trans_block_rsv);
- btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
- btrfs_init_block_rsv(&fs_info->empty_block_rsv);
- INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
- mutex_init(&fs_info->durable_block_rsv_mutex);
+ btrfs_init_block_rsv(&fs_info->global_block_rsv,
+ BTRFS_BLOCK_RSV_GLOBAL);
+ btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+ BTRFS_BLOCK_RSV_DELALLOC);
+ btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+ btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+ BTRFS_BLOCK_RSV_DELOPS);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
+ atomic_set(&fs_info->defrag_running, 0);
+ atomic_set(&fs_info->qgroup_op_seq, 0);
+ atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
+ fs_info->defrag_inodes = RB_ROOT;
+ fs_info->free_chunk_space = 0;
+ fs_info->tree_mod_log = RB_ROOT;
+ fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
+ /* readahead state */
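+ /* __GFP_WAIT is masked off so radix tree node allocations here never sleep */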
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
- INIT_LIST_HEAD(&fs_info->ordered_extents);
- spin_lock_init(&fs_info->ordered_extent_lock);
+ INIT_LIST_HEAD(&fs_info->ordered_roots);
+ spin_lock_init(&fs_info->ordered_root_lock);
+ fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+ GFP_NOFS);
+ if (!fs_info->delayed_root) {
+ err = -ENOMEM;
+ goto fail_iput;
+ }
+ btrfs_init_delayed_root(fs_info->delayed_root);
+
+ mutex_init(&fs_info->scrub_lock);
+ atomic_set(&fs_info->scrubs_running, 0);
+ atomic_set(&fs_info->scrub_pause_req, 0);
+ atomic_set(&fs_info->scrubs_paused, 0);
+ atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->replace_wait);
+ init_waitqueue_head(&fs_info->scrub_pause_wait);
+ fs_info->scrub_workers_refcnt = 0;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ fs_info->check_integrity_print_mask = 0;
+#endif
+
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_running, 0);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
+ btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi;
fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
- fs_info->btree_inode->i_nlink = 1;
+ set_nlink(fs_info->btree_inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
@@ -1694,40 +2319,58 @@ struct btrfs_root *open_ctree(struct super_block *sb,
RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
- fs_info->btree_inode->i_mapping,
- GFP_NOFS);
- extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
- GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
+ BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
+ extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
BTRFS_I(fs_info->btree_inode)->root = tree_root;
memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
sizeof(struct btrfs_key));
- BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
- insert_inode_hash(fs_info->btree_inode);
+ set_bit(BTRFS_INODE_DUMMY,
+ &BTRFS_I(fs_info->btree_inode)->runtime_flags);
+ btrfs_insert_inode_hash(fs_info->btree_inode);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
+ fs_info->first_logical_byte = (u64)-1;
extent_io_tree_init(&fs_info->freed_extents[0],
- fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
extent_io_tree_init(&fs_info->freed_extents[1],
- fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
fs_info->pinned_extents = &fs_info->freed_extents[0];
fs_info->do_barriers = 1;
- mutex_init(&fs_info->trans_mutex);
mutex_init(&fs_info->ordered_operations_mutex);
+ mutex_init(&fs_info->ordered_extent_flush_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
- init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
+ sema_init(&fs_info->uuid_tree_rescan_sem, 1);
+ fs_info->dev_replace.lock_owner = 0;
+ atomic_set(&fs_info->dev_replace.nesting_level, 0);
+ mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+ mutex_init(&fs_info->dev_replace.lock_management_lock);
+ mutex_init(&fs_info->dev_replace.lock);
+
+ spin_lock_init(&fs_info->qgroup_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_op_tree = RB_ROOT;
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ fs_info->qgroup_seq = 1;
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ fs_info->qgroup_ulist = NULL;
+ mutex_init(&fs_info->qgroup_rescan_lock);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1737,35 +2380,80 @@ struct btrfs_root *open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ ret = btrfs_alloc_stripe_hash_table(fs_info);
+ if (ret) {
+ err = ret;
+ goto fail_alloc;
+ }
+
__setup_root(4096, 4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
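+ /* drop any stale page cache for the device before reading the super block */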
+ invalidate_bdev(fs_devices->latest_bdev);
+
+ /*
+ * Read super block and check the signature bytes only
+ */
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
if (!bh) {
err = -EINVAL;
- goto fail_iput;
+ goto fail_alloc;
}
- memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
- memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
- sizeof(fs_info->super_for_commit));
+ /*
+ * We want to check the superblock checksum; the checksum type is stored inside.
+ * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
+ */
+ if (btrfs_check_super_csum(bh->b_data)) {
+ printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
+ /*
+ * super_copy is zeroed at allocation time and we never touch the
+ * following bytes up to INFO_SIZE; the checksum is calculated from
+ * the whole block of INFO_SIZE.
+ */
+ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_for_commit));
brelse(bh);
- memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+
+ ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
+ err = -EINVAL;
+ goto fail_alloc;
+ }
- disk_super = &fs_info->super_copy;
+ disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
- goto fail_iput;
+ goto fail_alloc;
/* check FS state, whether FS is broken. */
- fs_info->fs_state |= btrfs_super_flags(disk_super);
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
- btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ /*
+ * run through our array of backup supers and set up
+ * our ring pointer to the oldest one
+ */
+ generation = btrfs_super_generation(disk_super);
+ find_oldest_super_backup(fs_info, generation);
+
+ /*
+ * In the long term, we'll store the compression type in the super
+ * block, and it'll be used for per file compression control.
+ */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
ret = btrfs_parse_options(tree_root, options);
if (ret) {
err = ret;
- goto fail_iput;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super) &
@@ -1773,15 +2461,69 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (features) {
printk(KERN_ERR "BTRFS: couldn't mount because of "
"unsupported optional features (%Lx).\n",
- (unsigned long long)features);
+ features);
err = -EINVAL;
- goto fail_iput;
+ goto fail_alloc;
+ }
+
+ if (btrfs_super_leafsize(disk_super) !=
+ btrfs_super_nodesize(disk_super)) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksizes don't match. node %d leaf %d\n",
+ btrfs_super_nodesize(disk_super),
+ btrfs_super_leafsize(disk_super));
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+ if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksize (%d) was too large\n",
+ btrfs_super_leafsize(disk_super));
+ err = -EINVAL;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+ if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+
+ if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
+ printk(KERN_ERR "BTRFS: has skinny extents\n");
+
+ /*
+ * flag our filesystem as having big metadata blocks if
+ * they are bigger than the page size
+ */
+ if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
+ nodesize = btrfs_super_nodesize(disk_super);
+ leafsize = btrfs_super_leafsize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = btrfs_super_stripesize(disk_super);
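+ /* batch sizes for the dirty_metadata/delalloc percpu counters scale with the CPU count */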
+ fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+ /*
+ * mixed block groups end up with duplicate but slightly offset
+ * extent buffers for the same range, which leads to corruption.
+ */
+ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (sectorsize != leafsize)) {
+ printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
+ "are not allowed for mixed block groups on %s\n",
+ sb->s_id);
+ goto fail_alloc;
+ }
+
+ /*
+ * No need to take the lock here because no other task can be
+ * updating the flag at this point.
+ */
btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
@@ -1789,85 +2531,86 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!(sb->s_flags & MS_RDONLY) && features) {
printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
"unsupported option features (%Lx).\n",
- (unsigned long long)features);
+ features);
err = -EINVAL;
- goto fail_iput;
+ goto fail_alloc;
}
- btrfs_init_workers(&fs_info->generic_worker,
- "genwork", 1, NULL);
+ max_active = fs_info->thread_pool_size;
- btrfs_init_workers(&fs_info->workers, "worker",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
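+ /* all workqueues below are capped at max_active and are checked as a group once allocated */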
+ fs_info->workers =
+ btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+ max_active, 16);
- btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
+ fs_info->delalloc_workers =
+ btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
- btrfs_init_workers(&fs_info->submit_workers, "submit",
- min_t(u64, fs_devices->num_devices,
- fs_info->thread_pool_size),
- &fs_info->generic_worker);
+ fs_info->flush_workers =
+ btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
- /* a higher idle thresh on the submit workers makes it much more
+ fs_info->caching_workers =
+ btrfs_alloc_workqueue("cache", flags, max_active, 0);
+
+ /*
+ * a higher idle thresh on the submit workers makes it much more
 * likely that bios will be sent down in a sane order to the
* devices
*/
- fs_info->submit_workers.idle_thresh = 64;
-
- fs_info->workers.idle_thresh = 16;
- fs_info->workers.ordered = 1;
-
- fs_info->delalloc_workers.idle_thresh = 2;
- fs_info->delalloc_workers.ordered = 1;
-
- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_workers, "endio",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_write_workers,
- "endio-meta-write", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
- 1, &fs_info->generic_worker);
+ fs_info->submit_workers =
+ btrfs_alloc_workqueue("submit", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 64);
+
+ fs_info->fixup_workers =
+ btrfs_alloc_workqueue("fixup", flags, 1, 0);
/*
* endios are largely parallel and should have a very
* low idle thresh
*/
- fs_info->endio_workers.idle_thresh = 4;
- fs_info->endio_meta_workers.idle_thresh = 4;
-
- fs_info->endio_write_workers.idle_thresh = 2;
- fs_info->endio_meta_write_workers.idle_thresh = 2;
-
- btrfs_start_workers(&fs_info->workers, 1);
- btrfs_start_workers(&fs_info->generic_worker, 1);
- btrfs_start_workers(&fs_info->submit_workers, 1);
- btrfs_start_workers(&fs_info->delalloc_workers, 1);
- btrfs_start_workers(&fs_info->fixup_workers, 1);
- btrfs_start_workers(&fs_info->endio_workers, 1);
- btrfs_start_workers(&fs_info->endio_meta_workers, 1);
- btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
- btrfs_start_workers(&fs_info->endio_write_workers, 1);
- btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
+ fs_info->endio_workers =
+ btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ fs_info->endio_meta_workers =
+ btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ fs_info->endio_meta_write_workers =
+ btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ fs_info->endio_raid56_workers =
+ btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->rmw_workers =
+ btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ fs_info->endio_write_workers =
+ btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ fs_info->endio_freespace_worker =
+ btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ fs_info->delayed_workers =
+ btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ fs_info->readahead_workers =
+ btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ fs_info->qgroup_rescan_workers =
+ btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+ fs_info->extent_workers =
+ btrfs_alloc_workqueue("extent-refs", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 8);
+
+ if (!(fs_info->workers && fs_info->delalloc_workers &&
+ fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->endio_workers && fs_info->endio_meta_workers &&
+ fs_info->endio_meta_write_workers &&
+ fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+ fs_info->caching_workers && fs_info->readahead_workers &&
+ fs_info->fixup_workers && fs_info->delayed_workers &&
+ fs_info->extent_workers &&
+ fs_info->qgroup_rescan_workers)) {
+ err = -ENOMEM;
+ goto fail_sb_buffer;
+ }
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
4 * 1024 * 1024 / PAGE_CACHE_SIZE);
- nodesize = btrfs_super_nodesize(disk_super);
- leafsize = btrfs_super_leafsize(disk_super);
- sectorsize = btrfs_super_sectorsize(disk_super);
- stripesize = btrfs_super_stripesize(disk_super);
tree_root->nodesize = nodesize;
tree_root->leafsize = leafsize;
tree_root->sectorsize = sectorsize;
@@ -1876,9 +2619,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
- if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
- sizeof(disk_super->magic))) {
- printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
+ if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id);
+ goto fail_sb_buffer;
+ }
+
+ if (sectorsize != PAGE_SIZE) {
+ printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) "
+ "found on %s\n", (unsigned long)sectorsize, sb->s_id);
goto fail_sb_buffer;
}
@@ -1886,7 +2634,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
- printk(KERN_WARNING "btrfs: failed to read the system "
+ printk(KERN_WARNING "BTRFS: failed to read the system "
"array on %s\n", sb->s_id);
goto fail_sb_buffer;
}
@@ -1901,30 +2649,38 @@ struct btrfs_root *open_ctree(struct super_block *sb,
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
blocksize, generation);
- BUG_ON(!chunk_root->node);
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
- printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+ if (!chunk_root->node ||
+ !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
chunk_root->commit_root = btrfs_root_node(chunk_root);
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
- (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
- BTRFS_UUID_SIZE);
+ btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
- mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_chunk_tree(chunk_root);
- mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
- printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+ printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
- btrfs_close_extra_devices(fs_devices);
+ /*
+ * keep the device that is marked as the target device for the
+ * dev_replace procedure
+ */
+ btrfs_close_extra_devices(fs_info, fs_devices, 0);
+
+ if (!fs_devices->latest_bdev) {
+ printk(KERN_CRIT "BTRFS: failed to read devices on %s\n",
+ sb->s_id);
+ goto fail_tree_roots;
+ }
+retry_root_backup:
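+ /* jumped back to from recovery_tree_root below after switching to a backup root */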
blocksize = btrfs_level_size(tree_root,
btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
@@ -1932,51 +2688,129 @@ struct btrfs_root *open_ctree(struct super_block *sb,
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
blocksize, generation);
- if (!tree_root->node)
- goto fail_chunk_root;
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
- printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+ if (!tree_root->node ||
+ !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
sb->s_id);
- goto fail_tree_root;
+
+ goto recovery_tree_root;
}
+
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
+ btrfs_set_root_refs(&tree_root->root_item, 1);
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_EXTENT_TREE_OBJECTID, extent_root);
- if (ret)
- goto fail_tree_root;
- extent_root->track_dirty = 1;
+ location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = 0;
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_DEV_TREE_OBJECTID, dev_root);
- if (ret)
- goto fail_extent_root;
- dev_root->track_dirty = 1;
+ extent_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(extent_root)) {
+ ret = PTR_ERR(extent_root);
+ goto recovery_tree_root;
+ }
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
+ fs_info->extent_root = extent_root;
- ret = find_and_setup_root(tree_root, fs_info,
- BTRFS_CSUM_TREE_OBJECTID, csum_root);
- if (ret)
- goto fail_dev_root;
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
+ dev_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(dev_root)) {
+ ret = PTR_ERR(dev_root);
+ goto recovery_tree_root;
+ }
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
+ fs_info->dev_root = dev_root;
+ btrfs_init_devices_late(fs_info);
- csum_root->track_dirty = 1;
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ csum_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(csum_root)) {
+ ret = PTR_ERR(csum_root);
+ goto recovery_tree_root;
+ }
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
+ fs_info->csum_root = csum_root;
+
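+ /* the quota tree is optional: if it cannot be read, quotas simply remain disabled */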
+ location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+ quota_root = btrfs_read_tree_root(tree_root, &location);
+ if (!IS_ERR(quota_root)) {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
+ fs_info->quota_enabled = 1;
+ fs_info->pending_quota_state = 1;
+ fs_info->quota_root = quota_root;
+ }
+
+ location.objectid = BTRFS_UUID_TREE_OBJECTID;
+ uuid_root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(uuid_root)) {
+ ret = PTR_ERR(uuid_root);
+ if (ret != -ENOENT)
+ goto recovery_tree_root;
+ create_uuid_tree = true;
+ check_uuid_tree = false;
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
+ fs_info->uuid_root = uuid_root;
+ create_uuid_tree = false;
+ check_uuid_tree =
+ generation != btrfs_super_uuid_tree_generation(disk_super);
+ }
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
- fs_info->data_alloc_profile = (u64)-1;
- fs_info->metadata_alloc_profile = (u64)-1;
- fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
- ret = btrfs_read_block_groups(extent_root);
+ ret = btrfs_recover_balance(fs_info);
+ if (ret) {
+ printk(KERN_WARNING "BTRFS: failed to recover balance\n");
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_dev_stats(fs_info);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
+ ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_dev_replace(fs_info);
if (ret) {
- printk(KERN_ERR "Failed to read block groups: %d\n", ret);
+ pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
goto fail_block_groups;
}
+ btrfs_close_extra_devices(fs_info, fs_devices, 1);
+
+ ret = btrfs_sysfs_add_one(fs_info);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_space_info(fs_info);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
+ goto fail_sysfs;
+ }
+
+ ret = btrfs_read_block_groups(extent_root);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
+ goto fail_sysfs;
+ }
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ if (fs_info->fs_devices->missing_devices >
+ fs_info->num_tolerated_disk_barrier_failures &&
+ !(sb->s_flags & MS_RDONLY)) {
+ printk(KERN_WARNING "BTRFS: "
+ "too many missing devices, writeable mount is not allowed\n");
+ goto fail_sysfs;
+ }
+
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
- goto fail_block_groups;
+ goto fail_sysfs;
fs_info->transaction_kthread = kthread_run(transaction_kthread,
tree_root,
@@ -1987,30 +2821,49 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!btrfs_test_opt(tree_root, SSD) &&
!btrfs_test_opt(tree_root, NOSSD) &&
!fs_info->fs_devices->rotating) {
- printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+ printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
"mode\n");
btrfs_set_opt(fs_info->mount_opt, SSD);
}
+ /* Set the real inode map cache flag */
+ if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
+ btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+ ret = btrfsic_mount(tree_root, fs_devices,
+ btrfs_test_opt(tree_root,
+ CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+ 1 : 0,
+ fs_info->check_integrity_print_mask);
+ if (ret)
+ printk(KERN_WARNING "BTRFS: failed to initialize"
+ " integrity check module %s\n", sb->s_id);
+ }
+#endif
+ ret = btrfs_read_qgroup_config(fs_info);
+ if (ret)
+ goto fail_trans_kthread;
+
/* do not make disk changes in broken FS */
- if (btrfs_super_log_root(disk_super) != 0 &&
- !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
+ if (btrfs_super_log_root(disk_super) != 0) {
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
- printk(KERN_WARNING "Btrfs log replay required "
+ printk(KERN_WARNING "BTRFS: log replay required "
"on RO media\n");
err = -EIO;
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
blocksize =
btrfs_level_size(tree_root,
btrfs_super_log_root_level(disk_super));
- log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
err = -ENOMEM;
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
__setup_root(nodesize, leafsize, sectorsize, stripesize,
@@ -2019,54 +2872,119 @@ struct btrfs_root *open_ctree(struct super_block *sb,
log_tree_root->node = read_tree_block(tree_root, bytenr,
blocksize,
generation + 1);
+ if (!log_tree_root->node ||
+ !extent_buffer_uptodate(log_tree_root->node)) {
+ printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ goto fail_qgroup;
+ }
+ /* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_error(tree_root->fs_info, ret,
+ "Failed to recover log tree");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ goto fail_qgroup;
+ }
if (sb->s_flags & MS_RDONLY) {
- ret = btrfs_commit_super(tree_root);
- BUG_ON(ret);
+ ret = btrfs_commit_super(tree_root);
+ if (ret)
+ goto fail_qgroup;
}
}
ret = btrfs_find_orphan_roots(tree_root);
- BUG_ON(ret);
+ if (ret)
+ goto fail_qgroup;
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_cleanup_fs_roots(fs_info);
- BUG_ON(ret);
+ if (ret)
+ goto fail_qgroup;
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(tree_root);
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
printk(KERN_WARNING
- "btrfs: failed to recover relocation\n");
+ "BTRFS: failed to recover relocation\n");
err = -EINVAL;
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
}
location.objectid = BTRFS_FS_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
+ location.offset = 0;
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
- if (!fs_info->fs_root)
- goto fail_trans_kthread;
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
- if (!(sb->s_flags & MS_RDONLY)) {
- down_read(&fs_info->cleanup_work_sem);
- btrfs_orphan_cleanup(fs_info->fs_root);
- btrfs_orphan_cleanup(fs_info->tree_root);
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
+
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
up_read(&fs_info->cleanup_work_sem);
+ close_ctree(tree_root);
+ return ret;
+ }
+ up_read(&fs_info->cleanup_work_sem);
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret) {
+ printk(KERN_WARNING "BTRFS: failed to resume balance\n");
+ close_ctree(tree_root);
+ return ret;
}
- return tree_root;
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to resume dev_replace\n");
+ close_ctree(tree_root);
+ return ret;
+ }
+
+ btrfs_qgroup_rescan_resume(fs_info);
+
+ if (create_uuid_tree) {
+ pr_info("BTRFS: creating UUID tree\n");
+ ret = btrfs_create_uuid_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to create the UUID tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ } else if (check_uuid_tree ||
+ btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
+ pr_info("BTRFS: checking UUID tree\n");
+ ret = btrfs_check_uuid_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to check the UUID tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ } else {
+ fs_info->update_uuid_tree_gen = 1;
+ }
+ return 0;
+
+fail_qgroup:
+ btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
+ btrfs_cleanup_transaction(fs_info->tree_root);
+ btrfs_free_fs_roots(fs_info);
fail_cleaner:
kthread_stop(fs_info->cleaner_kthread);
@@ -2075,71 +2993,75 @@ fail_cleaner:
* kthreads
*/
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
- invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+
+fail_sysfs:
+ btrfs_sysfs_remove_one(fs_info);
fail_block_groups:
+ btrfs_put_block_group_cache(fs_info);
btrfs_free_block_groups(fs_info);
- free_extent_buffer(csum_root->node);
- free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
- free_extent_buffer(dev_root->node);
- free_extent_buffer(dev_root->commit_root);
-fail_extent_root:
- free_extent_buffer(extent_root->node);
- free_extent_buffer(extent_root->commit_root);
-fail_tree_root:
- free_extent_buffer(tree_root->node);
- free_extent_buffer(tree_root->commit_root);
-fail_chunk_root:
- free_extent_buffer(chunk_root->node);
- free_extent_buffer(chunk_root->commit_root);
-fail_sb_buffer:
- btrfs_stop_workers(&fs_info->generic_worker);
- btrfs_stop_workers(&fs_info->fixup_workers);
- btrfs_stop_workers(&fs_info->delalloc_workers);
- btrfs_stop_workers(&fs_info->workers);
- btrfs_stop_workers(&fs_info->endio_workers);
- btrfs_stop_workers(&fs_info->endio_meta_workers);
- btrfs_stop_workers(&fs_info->endio_meta_write_workers);
- btrfs_stop_workers(&fs_info->endio_write_workers);
- btrfs_stop_workers(&fs_info->endio_freespace_worker);
- btrfs_stop_workers(&fs_info->submit_workers);
-fail_iput:
+
+fail_tree_roots:
+ free_root_pointers(fs_info, 1);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
- iput(fs_info->btree_inode);
- btrfs_close_devices(fs_info->fs_devices);
+fail_sb_buffer:
+ btrfs_stop_all_workers(fs_info);
+fail_alloc:
+fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+ iput(fs_info->btree_inode);
+fail_bio_counter:
+ percpu_counter_destroy(&fs_info->bio_counter);
+fail_delalloc_bytes:
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+fail_dirty_metadata_bytes:
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
fail_bdi:
bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
- kfree(extent_root);
- kfree(tree_root);
- kfree(fs_info);
- kfree(chunk_root);
- kfree(dev_root);
- kfree(csum_root);
- return ERR_PTR(err);
+ btrfs_free_stripe_hash_table(fs_info);
+ btrfs_close_devices(fs_info->fs_devices);
+ return err;
+
+recovery_tree_root:
+ if (!btrfs_test_opt(tree_root, RECOVERY))
+ goto fail_tree_roots;
+
+ free_root_pointers(fs_info, 0);
+
+ /* don't use the log in recovery mode, it won't be valid */
+ btrfs_set_super_log_root(disk_super, 0);
+
+ /* we can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ ret = next_root_backup(fs_info, fs_info->super_copy,
+ &num_backups_tried, &backup_index);
+ if (ret == -1)
+ goto fail_block_groups;
+ goto retry_root_backup;
}
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
-
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (printk_ratelimit()) {
- printk(KERN_WARNING "lost page write due to "
- "I/O error on %s\n",
- bdevname(bh->b_bdev, b));
- }
+ struct btrfs_device *device = (struct btrfs_device *)
+ bh->b_private;
+
+ printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
+ "I/O error on %s\n",
+ rcu_str_deref(device->name));
 /* note, we don't set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
*/
clear_buffer_uptodate(bh);
+ btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
}
unlock_buffer(bh);
put_bh(bh);
@@ -2161,16 +3083,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
*/
for (i = 0; i < 1; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ i_size_read(bdev->bd_inode))
break;
- bh = __bread(bdev, bytenr / 4096, 4096);
+ bh = __bread(bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
if (!bh)
continue;
super = (struct btrfs_super_block *)bh->b_data;
if (btrfs_super_bytenr(super) != bytenr ||
- strncmp((char *)(&super->magic), BTRFS_MAGIC,
- sizeof(super->magic))) {
+ btrfs_super_magic(super) != BTRFS_MAGIC) {
brelse(bh);
continue;
}
@@ -2207,22 +3130,10 @@ static int write_dev_supers(struct btrfs_device *device,
int errors = 0;
u32 crc;
u64 bytenr;
- int last_barrier = 0;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
- /* make sure only the last submit_bh does a barrier */
- if (do_barriers) {
- for (i = 0; i < max_mirrors; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- device->total_bytes)
- break;
- last_barrier = i;
- }
- }
-
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2231,7 +3142,10 @@ static int write_dev_supers(struct btrfs_device *device,
if (wait) {
bh = __find_get_block(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
- BUG_ON(!bh);
+ if (!bh) {
+ errors++;
+ continue;
+ }
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
errors++;
@@ -2246,7 +3160,7 @@ static int write_dev_supers(struct btrfs_device *device,
btrfs_set_super_bytenr(sb, bytenr);
crc = ~(u32)0;
- crc = btrfs_csum_data(NULL, (char *)sb +
+ crc = btrfs_csum_data((char *)sb +
BTRFS_CSUM_SIZE, crc,
BTRFS_SUPER_INFO_SIZE -
BTRFS_CSUM_SIZE);
@@ -2258,6 +3172,13 @@ static int write_dev_supers(struct btrfs_device *device,
*/
bh = __getblk(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
+ if (!bh) {
+ printk(KERN_ERR "BTRFS: couldn't get super "
+ "buffer head for bytenr %Lu\n", bytenr);
+ errors++;
+ continue;
+ }
+
memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
/* one reference for submit_bh */
@@ -2266,20 +3187,228 @@ static int write_dev_supers(struct btrfs_device *device,
set_buffer_uptodate(bh);
lock_buffer(bh);
bh->b_end_io = btrfs_end_buffer_write_sync;
+ bh->b_private = device;
}
- if (i == last_barrier && do_barriers)
- ret = submit_bh(WRITE_FLUSH_FUA, bh);
+ /*
+ * we FUA the first super block. The others are allowed
+ * to go down lazily.
+ */
+ if (i == 0)
+ ret = btrfsic_submit_bh(WRITE_FUA, bh);
else
- ret = submit_bh(WRITE_SYNC, bh);
-
+ ret = btrfsic_submit_bh(WRITE_SYNC, bh);
if (ret)
errors++;
}
return errors < i ? 0 : -1;
}
-int write_all_supers(struct btrfs_root *root, int max_mirrors)
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ }
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * trigger flushes for one of the devices. If you pass wait == 0, the flushes are
+ * sent down. With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp is flagged as not
+ * barrier-capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+ struct bio *bio;
+ int ret = 0;
+
+ if (device->nobarriers)
+ return 0;
+
+ if (wait) {
+ bio = device->flush_bio;
+ if (!bio)
+ return 0;
+
+ wait_for_completion(&device->flush_wait);
+
+ if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+ printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
+ rcu_str_deref(device->name));
+ device->nobarriers = 1;
+ } else if (!bio_flagged(bio, BIO_UPTODATE)) {
+ ret = -EIO;
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
+ }
+
+ /* drop the reference from the wait == 0 run */
+ bio_put(bio);
+ device->flush_bio = NULL;
+
+ return ret;
+ }
+
+ /*
+ * one reference for us, and we leave it for the
+ * caller
+ */
+ device->flush_bio = NULL;
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_end_io = btrfs_end_empty_barrier;
+ bio->bi_bdev = device->bdev;
+ init_completion(&device->flush_wait);
+ bio->bi_private = &device->flush_wait;
+ device->flush_bio = bio;
+
+ bio_get(bio);
+ btrfsic_submit_bio(WRITE_FLUSH, bio);
+
+ return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ int errors_send = 0;
+ int errors_wait = 0;
+ int ret;
+
+ /* send down all the barriers */
+ head = &info->fs_devices->devices;
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
+ if (!dev->bdev) {
+ errors_send++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 0);
+ if (ret)
+ errors_send++;
+ }
+
+ /* wait for all the barriers */
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
+ if (!dev->bdev) {
+ errors_wait++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 1);
+ if (ret)
+ errors_wait++;
+ }
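+ /* fail only if more devices missed the flush than the fs can tolerate */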
+ if (errors_send > info->num_tolerated_disk_barrier_failures ||
+ errors_wait > info->num_tolerated_disk_barrier_failures)
+ return -EIO;
+ return 0;
+}
+
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ioctl_space_info space;
+ struct btrfs_space_info *sinfo;
+ u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+ int num_types = 4;
+ int i;
+ int c;
+ int num_tolerated_disk_barrier_failures =
+ (int)fs_info->fs_devices->num_devices;
+
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ sinfo = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+ if (tmp->flags == types[i]) {
+ sinfo = tmp;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!sinfo)
+ continue;
+
+ down_read(&sinfo->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&sinfo->block_groups[c])) {
+ u64 flags;
+
+ btrfs_get_block_group_info(
+ &sinfo->block_groups[c], &space);
+ if (space.total_bytes == 0 ||
+ space.used_bytes == 0)
+ continue;
+ flags = space.flags;
+ /*
+ * return
+ * 0: if dup, single or RAID0 is configured for
+ * any of metadata, system or data, else
+ * 1: if RAID5 is configured, or if RAID1 or
+ * RAID10 is configured and only two mirrors
+ * are used, else
+ * 2: if RAID6 is configured, else
+ * num_mirrors - 1: if RAID1 or RAID10 is
+ * configured and more than
+ * 2 mirrors are used.
+ */
+ if (num_tolerated_disk_barrier_failures > 0 &&
+ ((flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID0)) ||
+ ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+ == 0)))
+ num_tolerated_disk_barrier_failures = 0;
+ else if (num_tolerated_disk_barrier_failures > 1) {
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ num_tolerated_disk_barrier_failures = 1;
+ } else if (flags &
+ BTRFS_BLOCK_GROUP_RAID6) {
+ num_tolerated_disk_barrier_failures = 2;
+ }
+ }
+ }
+ }
+ up_read(&sinfo->groups_sem);
+ }
+
+ return num_tolerated_disk_barrier_failures;
+}
+
+static int write_all_supers(struct btrfs_root *root, int max_mirrors)
{
struct list_head *head;
struct btrfs_device *dev;
@@ -2291,15 +3420,28 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
int total_errors = 0;
u64 flags;
- max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
do_barriers = !btrfs_test_opt(root, NOBARRIER);
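+ /* stash the current tree roots into the super block backup slots before the supers are written */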
+ backup_super_roots(root->fs_info);
- sb = &root->fs_info->super_for_commit;
+ sb = root->fs_info->super_for_commit;
dev_item = &sb->dev_item;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
- list_for_each_entry(dev, head, dev_list) {
+ max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+
+ if (do_barriers) {
+ ret = barrier_all_devices(root->fs_info);
+ if (ret) {
+ mutex_unlock(
+ &root->fs_info->fs_devices->device_list_mutex);
+ btrfs_error(root->fs_info, ret,
+ "errors while submitting device barriers.");
+ return ret;
+ }
+ }
+
+ list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
total_errors++;
continue;
@@ -2326,13 +3468,18 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
total_errors++;
}
if (total_errors > max_errors) {
- printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+ btrfs_err(root->fs_info, "%d errors while writing supers",
total_errors);
- BUG();
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ /* FUA is masked off if unsupported and can't be the reason */
+ btrfs_error(root->fs_info, -EIO,
+ "%d errors while writing supers", total_errors);
+ return -EIO;
}
total_errors = 0;
- list_for_each_entry(dev, head, dev_list) {
+ list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev)
continue;
if (!dev->in_fs_metadata || !dev->writeable)
@@ -2344,9 +3491,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- printk(KERN_ERR "btrfs: %d errors while writing supers\n",
- total_errors);
- BUG();
+ btrfs_error(root->fs_info, -EIO,
+ "%d errors while writing supers", total_errors);
+ return -EIO;
}
return 0;
}
@@ -2354,13 +3501,12 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors)
{
- int ret;
-
- ret = write_all_supers(root, max_mirrors);
- return ret;
+ return write_all_supers(root, max_mirrors);
}
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
{
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
@@ -2370,108 +3516,107 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
if (btrfs_root_refs(&root->root_item) == 0)
synchronize_srcu(&fs_info->subvol_srcu);
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ btrfs_free_log(NULL, root);
+
+ if (root->free_ino_pinned)
+ __btrfs_remove_free_space_cache(root->free_ino_pinned);
+ if (root->free_ino_ctl)
+ __btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
- return 0;
}
static void free_fs_root(struct btrfs_root *root)
{
+ iput(root->cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
- if (root->anon_super.s_dev) {
- down_write(&root->anon_super.s_umount);
- kill_anon_super(&root->anon_super);
- }
+ btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ root->orphan_block_rsv = NULL;
+ if (root->anon_dev)
+ free_anon_bdev(root->anon_dev);
+ if (root->subv_writers)
+ btrfs_free_subvolume_writers(root->subv_writers);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
kfree(root->name);
- kfree(root);
+ btrfs_put_fs_root(root);
}
-static int del_fs_roots(struct btrfs_fs_info *fs_info)
+void btrfs_free_fs_root(struct btrfs_root *root)
{
- int ret;
- struct btrfs_root *gang[8];
- int i;
-
- while (!list_empty(&fs_info->dead_roots)) {
- gang[0] = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
- list_del(&gang[0]->root_list);
-
- if (gang[0]->in_radix) {
- btrfs_free_fs_root(fs_info, gang[0]);
- } else {
- free_extent_buffer(gang[0]->node);
- free_extent_buffer(gang[0]->commit_root);
- kfree(gang[0]);
- }
- }
-
- while (1) {
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, 0,
- ARRAY_SIZE(gang));
- if (!ret)
- break;
- for (i = 0; i < ret; i++)
- btrfs_free_fs_root(fs_info, gang[i]);
- }
- return 0;
+ free_fs_root(root);
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
u64 root_objectid = 0;
struct btrfs_root *gang[8];
- int i;
- int ret;
+ int i = 0;
+ int err = 0;
+ unsigned int ret = 0;
+ int index;
while (1) {
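+ /* the srcu read lock keeps looked-up roots from being freed before we grab a reference */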
+ index = srcu_read_lock(&fs_info->subvol_srcu);
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
- if (!ret)
+ if (!ret) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
break;
-
+ }
root_objectid = gang[ret - 1]->root_key.objectid + 1;
+
for (i = 0; i < ret; i++) {
+ /* Avoid grabbing roots that are in dead_roots */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* grab all the search results for later use */
+ gang[i] = btrfs_grab_fs_root(gang[i]);
+ }
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
root_objectid = gang[i]->root_key.objectid;
- btrfs_orphan_cleanup(gang[i]);
+ err = btrfs_orphan_cleanup(gang[i]);
+ if (err)
+ break;
+ btrfs_put_fs_root(gang[i]);
}
root_objectid++;
}
- return 0;
+
+ /* release the uncleaned roots due to error */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_fs_root(gang[i]);
+ }
+ return err;
}
int btrfs_commit_super(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
- int ret;
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(root);
- btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
+ wake_up_process(root->fs_info->cleaner_kthread);
/* wait until ongoing cleanup work done */
down_write(&root->fs_info->cleanup_work_sem);
up_write(&root->fs_info->cleanup_work_sem);
- trans = btrfs_join_transaction(root, 1);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
- /* run commit again to drop the original snapshot */
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_commit_transaction(trans, root);
- ret = btrfs_write_and_wait_transaction(NULL, root);
- BUG_ON(ret);
-
- ret = write_ctree_super(NULL, root, 0);
- return ret;
+ return btrfs_commit_transaction(trans, root);
}
int close_ctree(struct btrfs_root *root)
@@ -2482,221 +3627,197 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
- btrfs_put_block_group_cache(fs_info);
+ /* wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* avoid complaints from lockdep et al.; set sem back to initial state */
+ up(&fs_info->uuid_tree_rescan_sem);
+
+ /* pause restriper - we want to resume on mount */
+ btrfs_pause_balance(fs_info);
+
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+
+ btrfs_scrub_cancel(fs_info);
+
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+
+ /* clear out the rbtree of defraggable inodes */
+ btrfs_cleanup_defrag_inodes(fs_info);
+
+ cancel_work_sync(&fs_info->async_reclaim_work);
- /*
- * Here come 2 situations when btrfs is broken to flip readonly:
- *
- * 1. when btrfs flips readonly somewhere else before
- * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
- * and btrfs will skip to write sb directly to keep
- * ERROR state on disk.
- *
- * 2. when btrfs flips readonly just in btrfs_commit_super,
- * and in such case, btrfs cannot write sb via btrfs_commit_super,
- * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
- * btrfs will cleanup all FS resources first and write sb then.
- */
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
- printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+ btrfs_err(root->fs_info, "commit super ret %d", ret);
}
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
- ret = btrfs_error_commit_super(root);
- if (ret)
- printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
- }
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ btrfs_error_commit_super(root);
- kthread_stop(root->fs_info->transaction_kthread);
- kthread_stop(root->fs_info->cleaner_kthread);
+ kthread_stop(fs_info->transaction_kthread);
+ kthread_stop(fs_info->cleaner_kthread);
fs_info->closing = 2;
smp_mb();
- if (fs_info->delalloc_bytes) {
- printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
- (unsigned long long)fs_info->delalloc_bytes);
- }
- if (fs_info->total_ref_cache_size) {
- printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
- (unsigned long long)fs_info->total_ref_cache_size);
+ btrfs_free_qgroup_config(root->fs_info);
+
+ if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+ btrfs_info(root->fs_info, "at unmount delalloc count %lld",
+ percpu_counter_sum(&fs_info->delalloc_bytes));
}
- free_extent_buffer(fs_info->extent_root->node);
- free_extent_buffer(fs_info->extent_root->commit_root);
- free_extent_buffer(fs_info->tree_root->node);
- free_extent_buffer(fs_info->tree_root->commit_root);
- free_extent_buffer(root->fs_info->chunk_root->node);
- free_extent_buffer(root->fs_info->chunk_root->commit_root);
- free_extent_buffer(root->fs_info->dev_root->node);
- free_extent_buffer(root->fs_info->dev_root->commit_root);
- free_extent_buffer(root->fs_info->csum_root->node);
- free_extent_buffer(root->fs_info->csum_root->commit_root);
+ btrfs_sysfs_remove_one(fs_info);
+
+ btrfs_free_fs_roots(fs_info);
+
+ btrfs_put_block_group_cache(fs_info);
- btrfs_free_block_groups(root->fs_info);
+ btrfs_free_block_groups(fs_info);
- del_fs_roots(fs_info);
+ /*
+ * we must make sure there are no read requests left to
+ * submit after we stop all the workers.
+ */
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+ btrfs_stop_all_workers(fs_info);
+
+ free_root_pointers(fs_info, 1);
iput(fs_info->btree_inode);
- btrfs_stop_workers(&fs_info->generic_worker);
- btrfs_stop_workers(&fs_info->fixup_workers);
- btrfs_stop_workers(&fs_info->delalloc_workers);
- btrfs_stop_workers(&fs_info->workers);
- btrfs_stop_workers(&fs_info->endio_workers);
- btrfs_stop_workers(&fs_info->endio_meta_workers);
- btrfs_stop_workers(&fs_info->endio_meta_write_workers);
- btrfs_stop_workers(&fs_info->endio_write_workers);
- btrfs_stop_workers(&fs_info->endio_freespace_worker);
- btrfs_stop_workers(&fs_info->submit_workers);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ btrfsic_unmount(root, fs_info->fs_devices);
+#endif
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->bio_counter);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
- kfree(fs_info->extent_root);
- kfree(fs_info->tree_root);
- kfree(fs_info->chunk_root);
- kfree(fs_info->dev_root);
- kfree(fs_info->csum_root);
- kfree(fs_info);
+ btrfs_free_stripe_hash_table(fs_info);
+
+ btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ root->orphan_block_rsv = NULL;
return 0;
}
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+ int atomic)
{
int ret;
- struct inode *btree_inode = buf->first_page->mapping->host;
+ struct inode *btree_inode = buf->pages[0]->mapping->host;
- ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
- NULL);
+ ret = extent_buffer_uptodate(buf);
if (!ret)
return ret;
ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
- parent_transid);
+ parent_transid, atomic);
+ if (ret == -EAGAIN)
+ return ret;
return !ret;
}
int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
{
- struct inode *btree_inode = buf->first_page->mapping->host;
- return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
- buf);
+ return set_extent_buffer_uptodate(buf);
}
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+ struct btrfs_root *root;
u64 transid = btrfs_header_generation(buf);
- struct inode *btree_inode = root->fs_info->btree_inode;
int was_dirty;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ /*
+ * This is a fast path so only do this check if we have sanity tests
+ * enabled. Normal people shouldn't be marking dummy buffers as dirty
+ * outside of the sanity tests.
+ */
+ if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
+ return;
+#endif
+ root = BTRFS_I(buf->pages[0]->mapping->host)->root;
btrfs_assert_tree_locked(buf);
- if (transid != root->fs_info->generation) {
- printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+ if (transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
"found %llu running %llu\n",
- (unsigned long long)buf->start,
- (unsigned long long)transid,
- (unsigned long long)root->fs_info->generation);
- WARN_ON(1);
- }
- was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
- buf);
- if (!was_dirty) {
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->dirty_metadata_bytes += buf->len;
- spin_unlock(&root->fs_info->delalloc_lock);
+ buf->start, transid, root->fs_info->generation);
+ was_dirty = set_extent_buffer_dirty(buf);
+ if (!was_dirty)
+ __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+ buf->len,
+ root->fs_info->dirty_metadata_batch);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ btrfs_print_leaf(root, buf);
+ ASSERT(0);
}
+#endif
}
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+ int flush_delayed)
{
/*
* looks as though older kernels can get into trouble with
 * this code; they end up stuck in balance_dirty_pages forever
*/
- u64 num_dirty;
- unsigned long thresh = 32 * 1024 * 1024;
+ int ret;
if (current->flags & PF_MEMALLOC)
return;
- num_dirty = root->fs_info->dirty_metadata_bytes;
+ if (flush_delayed)
+ btrfs_balance_delayed_items(root);
- if (num_dirty > thresh) {
- balance_dirty_pages_ratelimited_nr(
- root->fs_info->btree_inode->i_mapping, 1);
+ ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH);
+ if (ret > 0) {
+ balance_dirty_pages_ratelimited(
+ root->fs_info->btree_inode->i_mapping);
}
return;
}
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- int ret;
- ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
- if (ret == 0)
- set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
- return ret;
+ __btrfs_btree_balance_dirty(root, 1);
}
-int btree_lock_page_hook(struct page *page)
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_buffer *eb;
- unsigned long len;
- u64 bytenr = page_offset(page);
-
- if (page->private == EXTENT_PAGE_PRIVATE)
- goto out;
-
- len = page->private >> 2;
- eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
- if (!eb)
- goto out;
-
- btrfs_tree_lock(eb);
- btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (root->fs_info->dirty_metadata_bytes >= eb->len)
- root->fs_info->dirty_metadata_bytes -= eb->len;
- else
- WARN_ON(1);
- spin_unlock(&root->fs_info->delalloc_lock);
- }
+ __btrfs_btree_balance_dirty(root, 0);
+}
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
-out:
- lock_page(page);
- return 0;
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+{
+ struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+ return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
}
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
- if (read_only)
- return;
-
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
- printk(KERN_WARNING "warning: mount fs with errors, "
- "running btrfsck is recommended\n");
+ /*
+ * Placeholder for checks
+ */
+ return 0;
}
-int btrfs_error_commit_super(struct btrfs_root *root)
+static void btrfs_error_commit_super(struct btrfs_root *root)
{
- int ret;
-
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -2706,13 +3827,10 @@ int btrfs_error_commit_super(struct btrfs_root *root)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(root);
-
- ret = write_ctree_super(NULL, root, 0);
-
- return ret;
}
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+ struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -2720,58 +3838,62 @@ static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
INIT_LIST_HEAD(&splice);
mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
- list_splice_init(&root->fs_info->ordered_operations, &splice);
+ list_splice_init(&t->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
list_del_init(&btrfs_inode->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_root_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
+
+ spin_lock(&root->fs_info->ordered_root_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
+}
- return 0;
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&root->ordered_extent_lock);
+ /*
+ * This will just short-circuit the ordered completion path, which will
+ * make sure the ordered extent gets properly cleaned up.
+ */
+ list_for_each_entry(ordered, &root->ordered_extents,
+ root_extent_list)
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+ spin_unlock(&root->ordered_extent_lock);
}
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *root;
struct list_head splice;
- struct btrfs_ordered_extent *ordered;
- struct inode *inode;
INIT_LIST_HEAD(&splice);
- spin_lock(&root->fs_info->ordered_extent_lock);
-
- list_splice_init(&root->fs_info->ordered_extents, &splice);
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
while (!list_empty(&splice)) {
- ordered = list_entry(splice.next, struct btrfs_ordered_extent,
- root_extent_list);
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ list_move_tail(&root->ordered_root,
+ &fs_info->ordered_roots);
- list_del_init(&ordered->root_extent_list);
- atomic_inc(&ordered->refs);
-
- /* the inode may be getting freed (in sys_unlink path). */
- inode = igrab(ordered->inode);
-
- spin_unlock(&root->fs_info->ordered_extent_lock);
- if (inode)
- iput(inode);
-
- atomic_set(&ordered->refs, 1);
- btrfs_put_ordered_extent(ordered);
+ spin_unlock(&fs_info->ordered_root_lock);
+ btrfs_destroy_ordered_extents(root);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ cond_resched();
+ spin_lock(&fs_info->ordered_root_lock);
}
-
- spin_unlock(&root->fs_info->ordered_extent_lock);
-
- return 0;
+ spin_unlock(&fs_info->ordered_root_lock);
}
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -2785,37 +3907,54 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
delayed_refs = &trans->delayed_refs;
spin_lock(&delayed_refs->lock);
- if (delayed_refs->num_entries == 0) {
- printk(KERN_INFO "delayed_refs has NO entry\n");
+ if (atomic_read(&delayed_refs->num_entries) == 0) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_info(root->fs_info, "delayed_refs has NO entry");
return ret;
}
- node = rb_first(&delayed_refs->root);
- while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
+ while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+ struct btrfs_delayed_ref_head *head;
+ bool pin_bytes = false;
- ref->in_tree = 0;
- rb_erase(&ref->rb_node, &delayed_refs->root);
- delayed_refs->num_entries--;
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
- atomic_set(&ref->refs, 1);
- if (btrfs_delayed_ref_is_head(ref)) {
- struct btrfs_delayed_ref_head *head;
-
- head = btrfs_delayed_node_to_head(ref);
mutex_lock(&head->mutex);
- kfree(head->extent_op);
- delayed_refs->num_heads--;
- if (list_empty(&head->cluster))
- delayed_refs->num_heads_ready--;
- list_del_init(&head->cluster);
mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ spin_lock(&delayed_refs->lock);
+ continue;
}
-
+ spin_lock(&head->lock);
+ while ((node = rb_first(&head->ref_root)) != NULL) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &head->ref_root);
+ atomic_dec(&delayed_refs->num_entries);
+ btrfs_put_delayed_ref(ref);
+ }
+ if (head->must_insert_reserved)
+ pin_bytes = true;
+ btrfs_free_delayed_extent_op(head->extent_op);
+ delayed_refs->num_heads--;
+ if (head->processing == 0)
+ delayed_refs->num_heads_ready--;
+ atomic_dec(&delayed_refs->num_entries);
+ head->node.in_tree = 0;
+ rb_erase(&head->href_node, &delayed_refs->href_root);
+ spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
- btrfs_put_delayed_ref(ref);
+ mutex_unlock(&head->mutex);
+ if (pin_bytes)
+ btrfs_pin_extent(root, head->node.bytenr,
+ head->node.num_bytes, 1);
+ btrfs_put_delayed_ref(&head->node);
cond_resched();
spin_lock(&delayed_refs->lock);
}
@@ -2825,51 +3964,56 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
- struct btrfs_pending_snapshot *snapshot;
+ struct btrfs_inode *btrfs_inode;
struct list_head splice;
INIT_LIST_HEAD(&splice);
- list_splice_init(&t->pending_snapshots, &splice);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
- snapshot = list_entry(splice.next,
- struct btrfs_pending_snapshot,
- list);
+ btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+ delalloc_inodes);
- list_del_init(&snapshot->list);
+ list_del_init(&btrfs_inode->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &btrfs_inode->runtime_flags);
+ spin_unlock(&root->delalloc_lock);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
- kfree(snapshot);
+ spin_lock(&root->delalloc_lock);
}
- return 0;
+ spin_unlock(&root->delalloc_lock);
}
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
{
- struct btrfs_inode *btrfs_inode;
+ struct btrfs_root *root;
struct list_head splice;
INIT_LIST_HEAD(&splice);
- list_splice_init(&root->fs_info->delalloc_inodes, &splice);
-
- spin_lock(&root->fs_info->delalloc_lock);
-
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
while (!list_empty(&splice)) {
- btrfs_inode = list_entry(splice.next, struct btrfs_inode,
- delalloc_inodes);
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ list_del_init(&root->delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ spin_unlock(&fs_info->delalloc_root_lock);
- list_del_init(&btrfs_inode->delalloc_inodes);
+ btrfs_destroy_delalloc_inodes(root);
+ btrfs_put_fs_root(root);
- btrfs_invalidate_inodes(btrfs_inode->root);
+ spin_lock(&fs_info->delalloc_root_lock);
}
-
- spin_unlock(&root->fs_info->delalloc_lock);
-
- return 0;
+ spin_unlock(&fs_info->delalloc_root_lock);
}
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -2877,54 +4021,29 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
int mark)
{
int ret;
- struct page *page;
- struct inode *btree_inode = root->fs_info->btree_inode;
struct extent_buffer *eb;
u64 start = 0;
u64 end;
- u64 offset;
- unsigned long index;
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
+ mark, NULL);
if (ret)
break;
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
- index = start >> PAGE_CACHE_SHIFT;
- start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
- page = find_get_page(btree_inode->i_mapping, index);
- if (!page)
+ eb = btrfs_find_tree_block(root, start,
+ root->leafsize);
+ start += root->leafsize;
+ if (!eb)
continue;
- offset = page_offset(page);
-
- spin_lock(&dirty_pages->buffer_lock);
- eb = radix_tree_lookup(
- &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
- offset >> PAGE_CACHE_SHIFT);
- spin_unlock(&dirty_pages->buffer_lock);
- if (eb) {
- ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
- &eb->bflags);
- atomic_set(&eb->refs, 1);
- }
- if (PageWriteback(page))
- end_page_writeback(page);
-
- lock_page(page);
- if (PageDirty(page)) {
- clear_page_dirty_for_io(page);
- spin_lock_irq(&page->mapping->tree_lock);
- radix_tree_tag_clear(&page->mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&page->mapping->tree_lock);
- }
+ wait_on_extent_buffer_writeback(eb);
- page->mapping->a_ops->invalidatepage(page, 0);
- unlock_page(page);
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+ &eb->bflags))
+ clear_extent_buffer_dirty(eb);
+ free_extent_buffer_stale(eb);
}
}
@@ -2938,99 +4057,125 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
u64 start;
u64 end;
int ret;
+ bool loop = true;
unpin = pinned_extents;
+again:
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret)
break;
/* opt_discard */
- ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_error_discard_extent(root, start,
+ end + 1 - start,
+ NULL);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
btrfs_error_unpin_extent_range(root, start, end);
cond_resched();
}
+ if (loop) {
+ if (unpin == &root->fs_info->freed_extents[0])
+ unpin = &root->fs_info->freed_extents[1];
+ else
+ unpin = &root->fs_info->freed_extents[0];
+ loop = false;
+ goto again;
+ }
+
return 0;
}
-static int btrfs_cleanup_transaction(struct btrfs_root *root)
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
+ struct btrfs_root *root)
{
- struct btrfs_transaction *t;
- LIST_HEAD(list);
-
- WARN_ON(1);
-
- mutex_lock(&root->fs_info->trans_mutex);
- mutex_lock(&root->fs_info->transaction_kthread_mutex);
+ btrfs_destroy_ordered_operations(cur_trans, root);
- list_splice_init(&root->fs_info->trans_list, &list);
- while (!list_empty(&list)) {
- t = list_entry(list.next, struct btrfs_transaction, list);
- if (!t)
- break;
-
- btrfs_destroy_ordered_operations(root);
-
- btrfs_destroy_ordered_extents(root);
+ btrfs_destroy_delayed_refs(cur_trans, root);
- btrfs_destroy_delayed_refs(t, root);
+ cur_trans->state = TRANS_STATE_COMMIT_START;
+ wake_up(&root->fs_info->transaction_blocked_wait);
- btrfs_block_rsv_release(root,
- &root->fs_info->trans_block_rsv,
- t->dirty_pages.dirty_bytes);
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
+ wake_up(&root->fs_info->transaction_wait);
- /* FIXME: cleanup wait for commit */
- t->in_commit = 1;
- t->blocked = 1;
- if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
- wake_up(&root->fs_info->transaction_blocked_wait);
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
- t->blocked = 0;
- if (waitqueue_active(&root->fs_info->transaction_wait))
- wake_up(&root->fs_info->transaction_wait);
- mutex_unlock(&root->fs_info->trans_mutex);
+ btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
+ EXTENT_DIRTY);
+ btrfs_destroy_pinned_extent(root,
+ root->fs_info->pinned_extents);
- mutex_lock(&root->fs_info->trans_mutex);
- t->commit_done = 1;
- if (waitqueue_active(&t->commit_wait))
- wake_up(&t->commit_wait);
- mutex_unlock(&root->fs_info->trans_mutex);
+ cur_trans->state = TRANS_STATE_COMPLETED;
+ wake_up(&cur_trans->commit_wait);
- mutex_lock(&root->fs_info->trans_mutex);
-
- btrfs_destroy_pending_snapshots(t);
-
- btrfs_destroy_delalloc_inodes(root);
+ /*
+ memset(cur_trans, 0, sizeof(*cur_trans));
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ */
+}
- spin_lock(&root->fs_info->new_trans_lock);
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->new_trans_lock);
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+ struct btrfs_transaction *t;
- btrfs_destroy_marked_extents(root, &t->dirty_pages,
- EXTENT_DIRTY);
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
- btrfs_destroy_pinned_extent(root,
- root->fs_info->pinned_extents);
+ spin_lock(&root->fs_info->trans_lock);
+ while (!list_empty(&root->fs_info->trans_list)) {
+ t = list_first_entry(&root->fs_info->trans_list,
+ struct btrfs_transaction, list);
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ atomic_inc(&t->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+ btrfs_wait_for_commit(root, t->transid);
+ btrfs_put_transaction(t);
+ spin_lock(&root->fs_info->trans_lock);
+ continue;
+ }
+ if (t == root->fs_info->running_transaction) {
+ t->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&root->fs_info->trans_lock);
+ /*
+ * We wait for 0 num_writers since we don't hold a trans
+ * handle open currently for this transaction.
+ */
+ wait_event(t->writer_wait,
+ atomic_read(&t->num_writers) == 0);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
+ }
+ btrfs_cleanup_one_transaction(t, root);
- t->use_count = 0;
+ spin_lock(&root->fs_info->trans_lock);
+ if (t == root->fs_info->running_transaction)
+ root->fs_info->running_transaction = NULL;
list_del_init(&t->list);
- memset(t, 0, sizeof(*t));
- kmem_cache_free(btrfs_transaction_cachep, t);
- }
+ spin_unlock(&root->fs_info->trans_lock);
+ btrfs_put_transaction(t);
+ trace_btrfs_transaction_commit(root);
+ spin_lock(&root->fs_info->trans_lock);
+ }
+ spin_unlock(&root->fs_info->trans_lock);
+ btrfs_destroy_all_ordered_extents(root->fs_info);
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
+ btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
+ btrfs_destroy_all_delalloc_inodes(root->fs_info);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
- mutex_unlock(&root->fs_info->trans_mutex);
return 0;
}
static struct extent_io_ops btree_extent_io_ops = {
- .write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
+ .readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 07b20dc2fd9..23ce3ceba0a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
+enum {
+ BTRFS_WQ_ENDIO_DATA = 0,
+ BTRFS_WQ_ENDIO_METADATA = 1,
+ BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+ BTRFS_WQ_ENDIO_RAID56 = 3,
+};
+
static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = 16 * 1024;
@@ -40,50 +47,77 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u32 blocksize, u64 parent_transid);
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
u64 parent_transid);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-int clean_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options);
+void clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf);
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options);
int close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
-int btrfs_error_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_objectid);
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location,
- const char *name, int namelen);
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
- struct btrfs_key *location);
-struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location);
+int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key,
+ bool check_ref);
+static inline struct btrfs_root *
+btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location)
+{
+ return btrfs_get_fs_root(fs_info, location, true);
+}
+
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-int btrfs_insert_dev_radix(struct btrfs_root *root,
- struct block_device *bdev,
- u64 device_id,
- u64 block_start,
- u64 num_blocks);
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_btree_balance_dirty(struct btrfs_root *root);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_root *root);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_root *btrfs_alloc_dummy_root(void);
+#endif
+
+/*
+ * This function is used to grab the root and prevent it from being freed
+ * while we access it.  It does not, however, ensure that the tree itself
+ * is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * fs_info->subvol_srcu.
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+ if (atomic_inc_not_zero(&root->refs))
+ return root;
+ return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+ if (atomic_dec_and_test(&root->refs))
+ kfree(root);
+}
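/*
 * Illustrative sketch, not part of this patch: typical use of the grab/put
 * pair defined above, mirroring the delalloc/ordered-root teardown hunks
 * earlier in this patch.  The caller name is hypothetical.
 */
static void use_root_sketch(struct btrfs_root *root)
{
	root = btrfs_grab_fs_root(root);
	if (!root)		/* the root was already on its way to being freed */
		return;
	/* ... safe to dereference the root here; it cannot be freed ... */
	btrfs_put_fs_root(root);	/* the last put frees the root */
}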
+
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
-void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+ int atomic);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
-int wait_on_tree_block_writeback(struct btrfs_root *root,
- struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
-u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
+u32 btrfs_csum_data(char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
-int btrfs_open_device(struct btrfs_device *dev);
-int btrfs_verify_block_csum(struct btrfs_root *root,
- struct extent_buffer *buf);
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
@@ -91,8 +125,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
unsigned long bio_flags, u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done);
-
-int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
int btrfs_write_tree_block(struct extent_buffer *buf);
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
@@ -100,14 +132,25 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btree_lock_page_hook(struct page *page);
-
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid);
+int btree_lock_page_hook(struct page *page, void *data,
+ void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+void btrfs_init_lockdep(void);
+void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level);
#else
-static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
- int level)
+static inline void btrfs_init_lockdep(void)
+{ }
+static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level)
{
}
#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index b4ffad859ad..41422a3de8e 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -5,7 +5,6 @@
#include "btrfs_inode.h"
#include "print-tree.h"
#include "export.h"
-#include "compat.h"
#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
parent_objectid) / 4)
@@ -13,42 +12,35 @@
parent_root_objectid) / 4)
#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
-static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
- int connectable)
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
struct btrfs_fid *fid = (struct btrfs_fid *)fh;
- struct inode *inode = dentry->d_inode;
int len = *max_len;
int type;
- if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+ if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
*max_len = BTRFS_FID_SIZE_CONNECTABLE;
- return 255;
+ return FILEID_INVALID;
} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
- return 255;
+ return FILEID_INVALID;
}
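	/*
	 * Annotation, not part of this patch: the BTRFS_FID_SIZE_* values and
	 * *max_len are counted in 32-bit words (hence the "/ 4" in the macros
	 * above).  Returning FILEID_INVALID (255) tells exportfs the caller's
	 * buffer was too small; *max_len then holds the size that would have
	 * been needed.
	 */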
len = BTRFS_FID_SIZE_NON_CONNECTABLE;
type = FILEID_BTRFS_WITHOUT_PARENT;
- fid->objectid = inode->i_ino;
+ fid->objectid = btrfs_ino(inode);
fid->root_objectid = BTRFS_I(inode)->root->objectid;
fid->gen = inode->i_generation;
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
+ if (parent) {
u64 parent_root_id;
- spin_lock(&dentry->d_lock);
-
- parent = dentry->d_parent->d_inode;
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
parent_root_id = BTRFS_I(parent)->root->objectid;
- spin_unlock(&dentry->d_lock);
-
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
@@ -67,7 +59,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u32 generation,
int check_generation)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
struct inode *inode;
struct btrfs_key key;
@@ -89,11 +81,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
goto fail;
}
- if (btrfs_root_refs(&root->root_item) == 0) {
- err = -ENOENT;
- goto fail;
- }
-
key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
@@ -178,13 +165,13 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
if (!path)
return ERR_PTR(-ENOMEM);
- if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = root->fs_info->tree_root;
} else {
- key.objectid = dir->i_ino;
+ key.objectid = btrfs_ino(dir);
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
}
@@ -193,7 +180,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
if (ret < 0)
goto fail;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Key with offset of -1 found */
if (path->slots[0] == 0) {
ret = -ENOENT;
goto fail;
@@ -244,6 +231,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
struct btrfs_key key;
int name_len;
int ret;
+ u64 ino;
if (!dir || !inode)
return -EINVAL;
@@ -251,19 +239,21 @@ static int btrfs_get_name(struct dentry *parent, char *name,
if (!S_ISDIR(dir->i_mode))
return -EINVAL;
+ ino = btrfs_ino(inode);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->leave_spinning = 1;
- if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = root->fs_info->tree_root;
} else {
- key.objectid = inode->i_ino;
- key.offset = dir->i_ino;
+ key.objectid = ino;
+ key.offset = btrfs_ino(dir);
key.type = BTRFS_INODE_REF_KEY;
}
@@ -272,7 +262,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
btrfs_free_path(path);
return ret;
} else if (ret > 0) {
- if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
path->slots[0]--;
} else {
btrfs_free_path(path);
@@ -281,11 +271,11 @@ static int btrfs_get_name(struct dentry *parent, char *name,
}
leaf = path->nodes[0];
- if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
- rref = btrfs_item_ptr(leaf, path->slots[0],
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ rref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
- name_ptr = (unsigned long)(rref + 1);
- name_len = btrfs_root_ref_name_len(leaf, rref);
+ name_ptr = (unsigned long)(rref + 1);
+ name_len = btrfs_root_ref_name_len(leaf, rref);
} else {
iref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_ref);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7b3089b5c2d..813537f362f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,27 +23,66 @@
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
-#include "compat.h"
+#include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
#include "hash.h"
-#include "ctree.h"
+#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
-#include "transaction.h"
#include "volumes.h"
+#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "math.h"
+#include "sysfs.h"
+#include "qgroup.h"
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+#undef SCRAMBLE_DELAYED_REFS
+
+/*
+ * control flags for do_chunk_alloc's force field
+ * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
+ * if we really need one.
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one
+ * if we have very few chunks already allocated. This is
+ * used as part of the clustering code to help make sure
+ * we have a good pool of storage to cluster in, without
+ * filling the FS with empty chunks
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ */
+enum {
+ CHUNK_ALLOC_NO_FORCE = 0,
+ CHUNK_ALLOC_LIMITED = 1,
+ CHUNK_ALLOC_FORCE = 2,
+};
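/*
 * Illustrative only, not part of this patch: do_chunk_alloc() (declared
 * further down in this hunk with the new "u64 flags, int force" arguments)
 * is what consumes these values.  A caller that merely wants a chunk when
 * space is getting tight would pass CHUNK_ALLOC_NO_FORCE or
 * CHUNK_ALLOC_LIMITED, while a flush path that must have one passes
 * CHUNK_ALLOC_FORCE, e.g. (call shape assumed, not taken from this hunk):
 *
 *	ret = do_chunk_alloc(trans, extent_root,
 *			     btrfs_get_alloc_profile(root, 0),
 *			     CHUNK_ALLOC_FORCE);
 */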
+
+/*
+ * Control how reservations are dealt with.
+ *
+ * RESERVE_FREE - freeing a reservation.
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
+ * ENOSPC accounting
+ * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
+ * bytes_may_use as the ENOSPC accounting is done elsewhere
+ */
+enum {
+ RESERVE_FREE = 0,
+ RESERVE_ALLOC = 1,
+ RESERVE_ALLOC_NO_ACCOUNT = 2,
+};
+
+static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc);
-static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int sinfo);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extra_op);
+ struct btrfs_delayed_extent_op *extra_op,
+ int no_quota);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
@@ -56,20 +95,29 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins);
+ int level, struct btrfs_key *ins,
+ int no_quota);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 alloc_bytes,
- u64 flags, int force);
+ struct btrfs_root *extent_root, u64 flags,
+ int force);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve,
+ int delalloc);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
smp_mb();
- return cache->cached == BTRFS_CACHE_FINISHED;
+ return cache->cached == BTRFS_CACHE_FINISHED ||
+ cache->cached == BTRFS_CACHE_ERROR;
}
static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
@@ -77,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
return (cache->flags & bits) == bits;
}
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
atomic_inc(&cache->count);
}
@@ -87,7 +135,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
if (atomic_dec_and_test(&cache->count)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
- WARN_ON(cache->reserved_pinned > 0);
+ kfree(cache->free_space_ctl);
kfree(cache);
}
}
@@ -123,6 +171,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
rb_link_node(&block_group->cache_node, parent, p);
rb_insert_color(&block_group->cache_node,
&info->block_group_cache_tree);
+
+ if (info->first_logical_byte > block_group->key.objectid)
+ info->first_logical_byte = block_group->key.objectid;
+
spin_unlock(&info->block_group_cache_lock);
return 0;
@@ -164,8 +216,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
break;
}
}
- if (ret)
+ if (ret) {
btrfs_get_block_group(ret);
+ if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+ info->first_logical_byte = ret->key.objectid;
+ }
spin_unlock(&info->block_group_cache_lock);
return ret;
@@ -209,7 +264,8 @@ static int exclude_super_stripes(struct btrfs_root *root,
cache->bytes_super += stripe_len;
ret = add_excluded_extent(root, cache->key.objectid,
stripe_len);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -217,13 +273,35 @@ static int exclude_super_stripes(struct btrfs_root *root,
ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
cache->key.objectid, bytenr,
0, &logical, &nr, &stripe_len);
- BUG_ON(ret);
+ if (ret)
+ return ret;
while (nr--) {
- cache->bytes_super += stripe_len;
- ret = add_excluded_extent(root, logical[nr],
- stripe_len);
- BUG_ON(ret);
+ u64 start, len;
+
+ if (logical[nr] > cache->key.objectid +
+ cache->key.offset)
+ continue;
+
+ if (logical[nr] + stripe_len <= cache->key.objectid)
+ continue;
+
+ start = logical[nr];
+ if (start < cache->key.objectid) {
+ start = cache->key.objectid;
+ len = (logical[nr] + stripe_len) - start;
+ } else {
+ len = min_t(u64, stripe_len,
+ cache->key.objectid +
+ cache->key.offset - start);
+ }
+
+ cache->bytes_super += len;
+ ret = add_excluded_extent(root, start, len);
+ if (ret) {
+ kfree(logical);
+ return ret;
+ }
}
kfree(logical);
@@ -274,7 +352,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
while (start < end) {
ret = find_first_extent_bit(info->pinned_extents, start,
&extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE);
+ EXTENT_DIRTY | EXTENT_UPTODATE,
+ NULL);
if (ret)
break;
@@ -285,7 +364,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
total_added += size;
ret = btrfs_add_free_space(block_group, start,
size);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic error */
start = extent_end + 1;
} else {
break;
@@ -296,29 +375,34 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
size = end - start;
total_added += size;
ret = btrfs_add_free_space(block_group, start, size);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM or logic error */
}
return total_added;
}
-static int caching_kthread(void *data)
+static noinline void caching_thread(struct btrfs_work *work)
{
- struct btrfs_block_group_cache *block_group = data;
- struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
u64 total_found = 0;
u64 last = 0;
u32 nritems;
- int ret = 0;
+ int ret = -ENOMEM;
+
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ extent_root = fs_info->extent_root;
path = btrfs_alloc_path();
if (!path)
- return -ENOMEM;
+ goto out;
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -330,7 +414,7 @@ static int caching_kthread(void *data)
*/
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 2;
+ path->reada = 1;
key.objectid = last;
key.offset = 0;
@@ -338,8 +422,9 @@ static int caching_kthread(void *data)
again:
mutex_lock(&caching_ctl->mutex);
/* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->extent_commit_sem);
+ down_read(&fs_info->commit_root_sem);
+next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto err;
@@ -348,8 +433,7 @@ again:
nritems = btrfs_header_nritems(leaf);
while (1) {
- smp_mb();
- if (fs_info->closing > 1) {
+ if (btrfs_fs_closing(fs_info) > 1) {
last = (u64)-1;
break;
}
@@ -361,15 +445,34 @@ again:
if (ret)
break;
- caching_ctl->progress = last;
- btrfs_release_path(extent_root, path);
- up_read(&fs_info->extent_commit_sem);
- mutex_unlock(&caching_ctl->mutex);
- if (btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
- else
+ if (need_resched() ||
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
+ caching_ctl->progress = last;
+ btrfs_release_path(path);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&caching_ctl->mutex);
cond_resched();
- goto again;
+ goto again;
+ }
+
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
+ }
+
+ if (key.objectid < last) {
+ key.objectid = last;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ caching_ctl->progress = last;
+ btrfs_release_path(path);
+ goto next;
}
if (key.objectid < block_group->key.objectid) {
@@ -381,11 +484,16 @@ again:
block_group->key.offset)
break;
- if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
total_found += add_new_free_space(block_group,
fs_info, last,
key.objectid);
- last = key.objectid + key.offset;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ last = key.objectid +
+ fs_info->tree_root->leafsize;
+ else
+ last = key.objectid + key.offset;
if (total_found > (1024 * 1024 * 2)) {
total_found = 0;
@@ -408,104 +516,134 @@ again:
err:
btrfs_free_path(path);
- up_read(&fs_info->extent_commit_sem);
+ up_read(&fs_info->commit_root_sem);
free_excluded_extents(extent_root, block_group);
mutex_unlock(&caching_ctl->mutex);
+out:
+ if (ret) {
+ spin_lock(&block_group->lock);
+ block_group->caching_ctl = NULL;
+ block_group->cached = BTRFS_CACHE_ERROR;
+ spin_unlock(&block_group->lock);
+ }
wake_up(&caching_ctl->wait);
put_caching_control(caching_ctl);
- atomic_dec(&block_group->space_info->caching_threads);
btrfs_put_block_group(block_group);
-
- return 0;
}
static int cache_block_group(struct btrfs_block_group_cache *cache,
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
int load_cache_only)
{
+ DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl;
- struct task_struct *tsk;
int ret = 0;
- smp_mb();
- if (cache->cached != BTRFS_CACHE_NO)
- return 0;
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+ if (!caching_ctl)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ atomic_set(&caching_ctl->count, 1);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+
+ spin_lock(&cache->lock);
/*
- * We can't do the read from on-disk cache during a commit since we need
- * to have the normal tree locking. Also if we are currently trying to
- * allocate blocks for the tree root we can't do the fast caching since
- * we likely hold important locks.
+ * This should be a rare occasion, but it can happen when one thread
+ * starts to load the space cache info and another thread then starts a
+ * transaction commit that tries to do an allocation while the first
+ * thread is still loading the space cache info.  The previous loop
+ * should have kept us from choosing this block group, but if we have
+ * moved to the state where we will wait on caching block groups we
+ * need to first check whether we are doing a fast load here, so we can
+ * wait for it to finish; otherwise we could end up allocating from a
+ * block group whose cache gets evicted for one reason or another.
*/
- if (!trans->transaction->in_commit &&
- (root && root != root->fs_info->tree_root)) {
+ while (cache->cached == BTRFS_CACHE_FAST) {
+ struct btrfs_caching_control *ctl;
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&cache->lock);
+
+ schedule();
+
+ finish_wait(&ctl->wait, &wait);
+ put_caching_control(ctl);
spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- return 0;
- }
- cache->cached = BTRFS_CACHE_STARTED;
+ }
+
+ if (cache->cached != BTRFS_CACHE_NO) {
spin_unlock(&cache->lock);
+ kfree(caching_ctl);
+ return 0;
+ }
+ WARN_ON(cache->caching_ctl);
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_FAST;
+ spin_unlock(&cache->lock);
+ if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
ret = load_free_space_cache(fs_info, cache);
spin_lock(&cache->lock);
if (ret == 1) {
+ cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_FINISHED;
cache->last_byte_to_unpin = (u64)-1;
} else {
- cache->cached = BTRFS_CACHE_NO;
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
}
spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
if (ret == 1) {
+ put_caching_control(caching_ctl);
free_excluded_extents(fs_info->extent_root, cache);
return 0;
}
+ } else {
+ /*
+ * We are not going to do the fast caching, set cached to the
+ * appropriate value and wakeup any waiters.
+ */
+ spin_lock(&cache->lock);
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
+ spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
}
- if (load_cache_only)
- return 0;
-
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
- BUG_ON(!caching_ctl);
-
- INIT_LIST_HEAD(&caching_ctl->list);
- mutex_init(&caching_ctl->mutex);
- init_waitqueue_head(&caching_ctl->wait);
- caching_ctl->block_group = cache;
- caching_ctl->progress = cache->key.objectid;
- /* one for caching kthread, one for caching block group list */
- atomic_set(&caching_ctl->count, 2);
-
- spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- kfree(caching_ctl);
+ if (load_cache_only) {
+ put_caching_control(caching_ctl);
return 0;
}
- cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&cache->lock);
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
+ atomic_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->extent_commit_sem);
+ up_write(&fs_info->commit_root_sem);
- atomic_inc(&cache->space_info->caching_threads);
btrfs_get_block_group(cache);
- tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
- cache->key.objectid);
- if (IS_ERR(tsk)) {
- ret = PTR_ERR(tsk);
- printk(KERN_ERR "error running thread %d\n", ret);
- BUG();
- }
+ btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
return ret;
}
@@ -543,8 +681,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA;
+ flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -572,73 +709,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-static u64 div_factor(u64 num, int factor)
-{
- if (factor == 10)
- return num;
- num *= factor;
- do_div(num, 10);
- return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
- if (factor == 100)
- return num;
- num *= factor;
- do_div(num, 100);
- return num;
-}
-
-u64 btrfs_find_block_group(struct btrfs_root *root,
- u64 search_start, u64 search_hint, int owner)
-{
- struct btrfs_block_group_cache *cache;
- u64 used;
- u64 last = max(search_hint, search_start);
- u64 group_start = 0;
- int full_search = 0;
- int factor = 9;
- int wrapped = 0;
-again:
- while (1) {
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- if (!cache)
- break;
-
- spin_lock(&cache->lock);
- last = cache->key.objectid + cache->key.offset;
- used = btrfs_block_group_used(&cache->item);
-
- if ((full_search || !cache->ro) &&
- block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
- if (used + cache->pinned + cache->reserved <
- div_factor(cache->key.offset, factor)) {
- group_start = cache->key.objectid;
- spin_unlock(&cache->lock);
- btrfs_put_block_group(cache);
- goto found;
- }
- }
- spin_unlock(&cache->lock);
- btrfs_put_block_group(cache);
- cond_resched();
- }
- if (!wrapped) {
- last = search_start;
- wrapped = 1;
- goto again;
- }
- if (!full_search && factor < 10) {
- last = search_start;
- full_search = 1;
- factor = 10;
- goto again;
- }
-found:
- return group_start;
-}
-
/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
@@ -647,18 +717,26 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
struct btrfs_path *path;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
key.objectid = start;
key.offset = len;
- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+ key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
0, 0);
+ if (ret > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == start &&
+ key.type == BTRFS_METADATA_ITEM_KEY)
+ ret = 0;
+ }
btrfs_free_path(path);
return ret;
}
/*
- * helper function to lookup reference count and flags of extent.
+ * helper function to lookup reference count and flags of a tree block.
*
* the head node for delayed ref is used to store the sum of all the
* reference count modifications queued up in the rbtree. the head
@@ -668,7 +746,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags)
+ u64 offset, int metadata, u64 *refs, u64 *flags)
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -681,23 +759,57 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u64 extent_flags;
int ret;
+ /*
+ * If we don't have skinny metadata, don't bother doing anything
+ * different
+ */
+ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
+ offset = root->leafsize;
+ metadata = 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = num_bytes;
if (!trans) {
path->skip_locking = 1;
path->search_commit_root = 1;
}
+
+search_again:
+ key.objectid = bytenr;
+ key.offset = offset;
+ if (metadata)
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
again:
ret = btrfs_search_slot(trans, root->fs_info->extent_root,
&key, path, 0, 0);
if (ret < 0)
goto out_free;
+ if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == root->leafsize)
+ ret = 0;
+ }
+ if (ret) {
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = root->leafsize;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+
if (ret == 0) {
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -737,19 +849,25 @@ again:
atomic_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
- btrfs_release_path(root->fs_info->extent_root, path);
+ btrfs_release_path(path);
+ /*
+ * Mutex was contended, block until it's released and try
+ * again
+ */
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(&head->node);
- goto again;
+ goto search_again;
}
+ spin_lock(&head->lock);
if (head->extent_op && head->extent_op->update_flags)
extent_flags |= head->extent_op->flags_to_set;
else
BUG_ON(num_refs == 0);
num_refs += head->node.ref_mod;
+ spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
}
spin_unlock(&delayed_refs->lock);
@@ -901,7 +1019,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
- BUG_ON(ret > 0);
+ BUG_ON(ret > 0); /* Corruption */
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key,
@@ -917,7 +1035,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
break;
}
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (owner < BTRFS_FIRST_FREE_OBJECTID)
new_size += sizeof(*bi);
@@ -927,10 +1045,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
new_size + extra_size, 1);
if (ret < 0)
return ret;
- BUG_ON(ret);
+ BUG_ON(ret); /* Corruption */
- ret = btrfs_extend_item(trans, root, path, new_size);
- BUG_ON(ret);
+ btrfs_extend_item(root, path, new_size);
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -960,11 +1077,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -1025,7 +1142,7 @@ again:
return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
key.type = BTRFS_EXTENT_REF_V0_KEY;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0) {
err = ret;
@@ -1063,7 +1180,7 @@ again:
if (match_extent_data_ref(leaf, ref, root_objectid,
owner, offset)) {
if (recow) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
err = 0;
@@ -1124,7 +1241,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
if (match_extent_data_ref(leaf, ref, root_objectid,
owner, offset))
break;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.offset++;
ret = btrfs_insert_empty_item(trans, root, path, &key,
size);
@@ -1150,14 +1267,14 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
ret = 0;
fail:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- int refs_to_drop)
+ int refs_to_drop, int *last_ref)
{
struct btrfs_key key;
struct btrfs_extent_data_ref *ref1 = NULL;
@@ -1193,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
if (num_refs == 0) {
ret = btrfs_del_item(trans, root, path);
+ *last_ref = 1;
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -1276,7 +1394,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
if (ret == -ENOENT && parent) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.type = BTRFS_EXTENT_REF_V0_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
@@ -1305,7 +1423,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
}
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
@@ -1382,6 +1500,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
int want;
int ret;
int err = 0;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1393,12 +1513,54 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
path->keep_locks = 1;
} else
extra_size = -1;
+
+ /*
+ * Owner is our parent level, so we can just add one to get the level
+ * for the block we are interested in.
+ */
+ if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = owner;
+ }
+
+again:
ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
if (ret < 0) {
err = ret;
goto out;
}
- BUG_ON(ret);
+
+ /*
+ * We may be a newly converted file system which still has the old fat
+ * extent entries for metadata, so try and see if we have one of those.
+ */
+ if (ret > 0 && skinny_metadata) {
+ skinny_metadata = false;
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes)
+ ret = 0;
+ }
+ if (ret) {
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+
+ if (ret && !insert) {
+ err = -ENOENT;
+ goto out;
+ } else if (WARN_ON(ret)) {
+ err = -EIO;
+ goto out;
+ }
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -1426,11 +1588,9 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ei + 1);
end = (unsigned long)ei + item_size;
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
ptr += sizeof(struct btrfs_tree_block_info);
BUG_ON(ptr > end);
- } else {
- BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
}
err = -ENOENT;
@@ -1512,13 +1672,12 @@ out:
* helper to add new inline back ref
*/
static noinline_for_stack
-int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref,
- u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add,
- struct btrfs_delayed_extent_op *extent_op)
+void setup_inline_extent_backref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1528,7 +1687,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
u64 refs;
int size;
int type;
- int ret;
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1537,8 +1695,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- ret = btrfs_extend_item(trans, root, path, size);
- BUG_ON(ret);
+ btrfs_extend_item(root, path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1573,7 +1730,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
btrfs_mark_buffer_dirty(leaf);
- return 0;
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1591,7 +1747,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
if (ret != -ENOENT)
return ret;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
*ref_ret = NULL;
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
@@ -1608,12 +1764,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
* helper to update/remove inline back ref
*/
static noinline_for_stack
-int update_inline_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref,
- int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op)
+void update_inline_extent_backref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_mod,
+ struct btrfs_delayed_extent_op *extent_op,
+ int *last_ref)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1624,7 +1780,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
u32 item_size;
int size;
int type;
- int ret;
u64 refs;
leaf = path->nodes[0];
@@ -1658,6 +1813,7 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
else
btrfs_set_shared_data_ref_count(leaf, sref, refs);
} else {
+ *last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = (unsigned long)iref;
@@ -1666,11 +1822,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- ret = btrfs_truncate_item(trans, root, path, item_size, 1);
- BUG_ON(ret);
+ btrfs_truncate_item(root, path, item_size, 1);
}
btrfs_mark_buffer_dirty(leaf);
- return 0;
}
static noinline_for_stack
@@ -1690,13 +1844,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
root_objectid, owner, offset, 1);
if (ret == 0) {
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
- ret = update_inline_extent_backref(trans, root, path, iref,
- refs_to_add, extent_op);
+ update_inline_extent_backref(root, path, iref,
+ refs_to_add, extent_op, NULL);
} else if (ret == -ENOENT) {
- ret = setup_inline_extent_backref(trans, root, path, iref,
- parent, root_objectid,
- owner, offset, refs_to_add,
- extent_op);
+ setup_inline_extent_backref(root, path, iref, parent,
+ root_objectid, owner, offset,
+ refs_to_add, extent_op);
+ ret = 0;
}
return ret;
}
@@ -1724,76 +1878,101 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
- int refs_to_drop, int is_data)
+ int refs_to_drop, int is_data, int *last_ref)
{
- int ret;
+ int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
if (iref) {
- ret = update_inline_extent_backref(trans, root, path, iref,
- -refs_to_drop, NULL);
+ update_inline_extent_backref(root, path, iref,
+ -refs_to_drop, NULL, last_ref);
} else if (is_data) {
- ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
+ last_ref);
} else {
+ *last_ref = 1;
ret = btrfs_del_item(trans, root, path);
}
return ret;
}
-static void btrfs_issue_discard(struct block_device *bdev,
+static int btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
- blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
+ return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
}
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes)
+ u64 num_bytes, u64 *actual_bytes)
{
int ret;
- u64 map_length = num_bytes;
- struct btrfs_multi_bio *multi = NULL;
+ u64 discarded_bytes = 0;
+ struct btrfs_bio *bbio = NULL;
- if (!btrfs_test_opt(root, DISCARD))
- return 0;
/* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
- bytenr, &map_length, &multi, 0);
+ ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
+ bytenr, &num_bytes, &bbio, 0);
+ /* Error condition is -ENOMEM */
if (!ret) {
- struct btrfs_bio_stripe *stripe = multi->stripes;
+ struct btrfs_bio_stripe *stripe = bbio->stripes;
int i;
- if (map_length > num_bytes)
- map_length = num_bytes;
- for (i = 0; i < multi->num_stripes; i++, stripe++) {
- btrfs_issue_discard(stripe->dev->bdev,
- stripe->physical,
- map_length);
+ for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ if (!stripe->dev->can_discard)
+ continue;
+
+ ret = btrfs_issue_discard(stripe->dev->bdev,
+ stripe->physical,
+ stripe->length);
+ if (!ret)
+ discarded_bytes += stripe->length;
+ else if (ret != -EOPNOTSUPP)
+ break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
+
+ /*
+ * Just in case we get back EOPNOTSUPP for some reason,
+ * just ignore the return value so we don't screw up
+ * people calling discard_extent.
+ */
+ ret = 0;
}
- kfree(multi);
+ kfree(bbio);
}
+ if (actual_bytes)
+ *actual_bytes = discarded_bytes;
+
+
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
return ret;
}
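
Illustrative aside (not part of this patch): the per-stripe accounting above can be pictured with a small userspace sketch. The stripe layout and error values below are made up; only the skip/accumulate/EOPNOTSUPP-tolerant flow mirrors btrfs_discard_extent().

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stripe descriptor; real stripes come from btrfs_map_block(). */
struct stripe { int can_discard; int err; uint64_t length; };

static int discard_stripes(struct stripe *s, int n, uint64_t *actual_bytes)
{
	uint64_t discarded = 0;
	int ret = 0, i;

	for (i = 0; i < n; i++) {
		if (!s[i].can_discard)
			continue;		/* device can't discard: skip it */
		ret = s[i].err;			/* stand-in for btrfs_issue_discard() */
		if (!ret)
			discarded += s[i].length;
		else if (ret != -EOPNOTSUPP)
			break;			/* real error: stop and report it */
		ret = 0;			/* EOPNOTSUPP is ignored */
	}
	if (actual_bytes)
		*actual_bytes = discarded;	/* caller learns how much was trimmed */
	return ret;
}

int main(void)
{
	struct stripe s[] = {
		{ 1, 0, 1 << 20 },		/* discards 1 MiB */
		{ 0, 0, 1 << 20 },		/* can't discard: skipped */
		{ 1, -EOPNOTSUPP, 1 << 20 },	/* unsupported: ignored */
	};
	uint64_t actual = 0;
	int ret = discard_stripes(s, 3, &actual);

	printf("ret=%d discarded=%llu\n", ret, (unsigned long long)actual);
	return 0;
}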
+/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset)
+ u64 root_objectid, u64 owner, u64 offset,
+ int no_quota)
{
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
root_objectid == BTRFS_TREE_LOG_OBJECTID);
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL);
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
} else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL);
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
}
return ret;
}
@@ -1803,55 +1982,89 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
+ int no_quota,
struct btrfs_delayed_extent_op *extent_op)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
+ struct btrfs_key key;
u64 refs;
int ret;
- int err = 0;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
+ no_quota = 1;
+
path->reada = 1;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
- ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
- path, bytenr, num_bytes, parent,
+ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
+ bytenr, num_bytes, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
- if (ret == 0)
+ if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
goto out;
-
- if (ret != -EAGAIN) {
- err = ret;
+ /*
+ * Ok we were able to insert an inline extent and it appears to be a new
+ * reference, deal with the qgroup accounting.
+ */
+ if (!ret && !no_quota) {
+ ASSERT(root->fs_info->quota_enabled);
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
+ btrfs_release_path(path);
+
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
goto out;
}
+ /*
+ * Ok we had -EAGAIN which means we didn't have space to insert an
+ * inline extent ref, so just update the reference count and add a
+ * normal backref.
+ */
leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, item);
+ if (refs)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root->fs_info->extent_root, path);
+ btrfs_release_path(path);
+
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
+ if (ret)
+ goto out;
+ }
path->reada = 1;
path->leave_spinning = 1;
-
/* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent, root_objectid,
owner, offset, refs_to_add);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
@@ -1872,16 +2085,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ref = btrfs_delayed_node_to_data_ref(node);
+ trace_run_delayed_data_ref(node, ref, node->action);
+
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
- else
- ref_root = ref->root;
+ ref_root = ref->root;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
- if (extent_op) {
- BUG_ON(extent_op->update_key);
+ if (extent_op)
flags |= extent_op->flags_to_set;
- }
ret = alloc_reserved_file_extent(trans, root,
parent, ref_root, flags,
ref->objectid, ref->offset,
@@ -1891,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
node->num_bytes, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op);
+ node->no_quota, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
ret = __btrfs_free_extent(trans, root, node->bytenr,
node->num_bytes, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op);
+ extent_op, node->no_quota);
} else {
BUG();
}
@@ -1934,15 +2146,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
u32 item_size;
int ret;
int err = 0;
+ int metadata = !extent_op->is_data;
+
+ if (trans->aborted)
+ return 0;
+
+ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ metadata = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = node->bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = node->num_bytes;
+ if (metadata) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = extent_op->level;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = node->num_bytes;
+ }
+
+again:
path->reada = 1;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
@@ -1952,8 +2178,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
goto out;
}
if (ret > 0) {
- err = -EIO;
- goto out;
+ if (metadata) {
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == node->bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == node->num_bytes)
+ ret = 0;
+ }
+ if (ret > 0) {
+ btrfs_release_path(path);
+ metadata = 0;
+
+ key.objectid = node->bytenr;
+ key.offset = node->num_bytes;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ goto again;
+ }
+ } else {
+ err = -EIO;
+ goto out;
+ }
}
leaf = path->nodes[0];
@@ -1991,34 +2238,44 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
u64 parent = 0;
u64 ref_root = 0;
-
- ins.objectid = node->bytenr;
- ins.offset = node->num_bytes;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
ref = btrfs_delayed_node_to_tree_ref(node);
+ trace_run_delayed_tree_ref(node, ref, node->action);
+
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
parent = ref->parent;
- else
- ref_root = ref->root;
+ ref_root = ref->root;
+
+ ins.objectid = node->bytenr;
+ if (skinny_metadata) {
+ ins.offset = ref->level;
+ ins.type = BTRFS_METADATA_ITEM_KEY;
+ } else {
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ }
BUG_ON(node->ref_mod != 1);
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
- BUG_ON(!extent_op || !extent_op->update_flags ||
- !extent_op->update_key);
+ BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, root,
parent, ref_root,
extent_op->flags_to_set,
&extent_op->key,
- ref->level, &ins);
+ ref->level, &ins,
+ node->no_quota);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, 1, node->no_quota,
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
ret = __btrfs_free_extent(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, 1, extent_op,
+ node->no_quota);
} else {
BUG();
}
@@ -2032,7 +2289,15 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
{
- int ret;
+ int ret = 0;
+
+ if (trans->aborted) {
+ if (insert_reserved)
+ btrfs_pin_extent(root, node->bytenr,
+ node->num_bytes, 1);
+ return 0;
+ }
+
if (btrfs_delayed_ref_is_head(node)) {
struct btrfs_delayed_ref_head *head;
/*
@@ -2043,6 +2308,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
*/
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
+ trace_run_delayed_ref_head(node, head, node->action);
+
if (insert_reserved) {
btrfs_pin_extent(root, node->bytenr,
node->num_bytes, 1);
@@ -2050,11 +2317,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
- BUG_ON(ret);
}
}
- mutex_unlock(&head->mutex);
- return 0;
+ return ret;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -2074,59 +2339,62 @@ static noinline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct rb_node *node;
- struct btrfs_delayed_ref_node *ref;
- int action = BTRFS_ADD_DELAYED_REF;
-again:
+ struct btrfs_delayed_ref_node *ref, *last = NULL;
+
/*
* select delayed ref of type BTRFS_ADD_DELAYED_REF first.
* this prevents ref count from going down to zero when
* there still are pending delayed ref.
*/
- node = rb_prev(&head->node.rb_node);
- while (1) {
- if (!node)
- break;
+ node = rb_first(&head->ref_root);
+ while (node) {
ref = rb_entry(node, struct btrfs_delayed_ref_node,
rb_node);
- if (ref->bytenr != head->node.bytenr)
- break;
- if (ref->action == action)
+ if (ref->action == BTRFS_ADD_DELAYED_REF)
return ref;
- node = rb_prev(node);
+ else if (last == NULL)
+ last = ref;
+ node = rb_next(node);
}
- if (action == BTRFS_ADD_DELAYED_REF) {
- action = BTRFS_DROP_DELAYED_REF;
- goto again;
- }
- return NULL;
+ return last;
}
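
The selection policy above (take the first ADD if any, otherwise fall back to the first ref) can be pictured with a trivial, self-contained sketch; the array of actions is purely hypothetical.

#include <stdio.h>

enum { DROP = 0, ADD = 1 };

/* Pick the first ADD if one exists, otherwise fall back to the first entry. */
static int select_ref(const int *actions, int n)
{
	int last = -1, i;

	for (i = 0; i < n; i++) {
		if (actions[i] == ADD)
			return i;
		if (last < 0)
			last = i;
	}
	return last;
}

int main(void)
{
	int actions[] = { DROP, DROP, ADD, DROP };

	/* Index 2 is returned: an ADD is preferred so the ref count never
	 * transiently hits zero while drops are still pending. */
	printf("selected index %d\n", select_ref(actions, 4));
	return 0;
}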
-static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct list_head *cluster)
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ unsigned long nr)
{
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ ktime_t start = ktime_get();
int ret;
- int count = 0;
+ unsigned long count = 0;
+ unsigned long actual_count = 0;
int must_insert_reserved = 0;
delayed_refs = &trans->transaction->delayed_refs;
while (1) {
if (!locked_ref) {
- /* pick a new head ref from the cluster list */
- if (list_empty(cluster))
+ if (count >= nr)
break;
- locked_ref = list_entry(cluster->next,
- struct btrfs_delayed_ref_head, cluster);
+ spin_lock(&delayed_refs->lock);
+ locked_ref = btrfs_select_ref_head(trans);
+ if (!locked_ref) {
+ spin_unlock(&delayed_refs->lock);
+ break;
+ }
/* grab the lock that says we are going to process
* all the refs for this head */
ret = btrfs_delayed_ref_lock(trans, locked_ref);
-
+ spin_unlock(&delayed_refs->lock);
/*
* we may have dropped the spin lock to get the head
* mutex lock, and that might have given someone else
@@ -2141,6 +2409,37 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
}
/*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ */
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+ locked_ref);
+
+ /*
+ * locked_ref is the head node, so we have to go one
+ * node back for any delayed ref updates
+ */
+ ref = select_delayed_ref(locked_ref);
+
+ if (ref && ref->seq &&
+ btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
+ spin_unlock(&locked_ref->lock);
+ btrfs_delayed_ref_unlock(locked_ref);
+ spin_lock(&delayed_refs->lock);
+ locked_ref->processing = 0;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ locked_ref = NULL;
+ cond_resched();
+ count++;
+ continue;
+ }
+
+ /*
* record the must insert reserved flag before we
* drop the spin lock.
*/
@@ -2150,12 +2449,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
extent_op = locked_ref->extent_op;
locked_ref->extent_op = NULL;
- /*
- * locked_ref is the head node, so we have to go one
- * node back for any delayed ref updates
- */
- ref = select_delayed_ref(locked_ref);
if (!ref) {
+
+
/* All delayed refs have been processed, Go ahead
* and send the head node to run_one_delayed_ref,
* so that any accounting fixes can happen
@@ -2163,45 +2459,308 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ref = &locked_ref->node;
if (extent_op && must_insert_reserved) {
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
extent_op = NULL;
}
if (extent_op) {
- spin_unlock(&delayed_refs->lock);
-
+ spin_unlock(&locked_ref->lock);
ret = run_delayed_extent_op(trans, root,
ref, extent_op);
- BUG_ON(ret);
- kfree(extent_op);
-
- cond_resched();
- spin_lock(&delayed_refs->lock);
+ btrfs_free_delayed_extent_op(extent_op);
+
+ if (ret) {
+ /*
+ * Need to reset must_insert_reserved if
+ * there was an error so the abort stuff
+ * can cleanup the reserved space
+ * properly.
+ */
+ if (must_insert_reserved)
+ locked_ref->must_insert_reserved = 1;
+ locked_ref->processing = 0;
+ btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
+ btrfs_delayed_ref_unlock(locked_ref);
+ return ret;
+ }
continue;
}
- list_del_init(&locked_ref->cluster);
- locked_ref = NULL;
+ /*
+ * Need to drop our head ref lock and re-acquire the
+ * delayed ref lock and then re-check to make sure
+ * nobody got added.
+ */
+ spin_unlock(&locked_ref->lock);
+ spin_lock(&delayed_refs->lock);
+ spin_lock(&locked_ref->lock);
+ if (rb_first(&locked_ref->ref_root) ||
+ locked_ref->extent_op) {
+ spin_unlock(&locked_ref->lock);
+ spin_unlock(&delayed_refs->lock);
+ continue;
+ }
+ ref->in_tree = 0;
+ delayed_refs->num_heads--;
+ rb_erase(&locked_ref->href_node,
+ &delayed_refs->href_root);
+ spin_unlock(&delayed_refs->lock);
+ } else {
+ actual_count++;
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &locked_ref->ref_root);
}
+ atomic_dec(&delayed_refs->num_entries);
- ref->in_tree = 0;
- rb_erase(&ref->rb_node, &delayed_refs->root);
- delayed_refs->num_entries--;
-
- spin_unlock(&delayed_refs->lock);
+ if (!btrfs_delayed_ref_is_head(ref)) {
+ /*
+ * when we play the delayed ref, also correct the
+ * ref_mod on head
+ */
+ switch (ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ case BTRFS_ADD_DELAYED_EXTENT:
+ locked_ref->node.ref_mod -= ref->ref_mod;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ locked_ref->node.ref_mod += ref->ref_mod;
+ break;
+ default:
+ WARN_ON(1);
+ }
+ }
+ spin_unlock(&locked_ref->lock);
ret = run_one_delayed_ref(trans, root, ref, extent_op,
must_insert_reserved);
- BUG_ON(ret);
+ btrfs_free_delayed_extent_op(extent_op);
+ if (ret) {
+ locked_ref->processing = 0;
+ btrfs_delayed_ref_unlock(locked_ref);
+ btrfs_put_delayed_ref(ref);
+ btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
+ return ret;
+ }
+
+ /*
+ * If this node is a head, that means all the refs in this head
+ * have been dealt with, and we will pick the next head to deal
+ * with, so we must unlock the head and drop it from the cluster
+ * list before we release it.
+ */
+ if (btrfs_delayed_ref_is_head(ref)) {
+ btrfs_delayed_ref_unlock(locked_ref);
+ locked_ref = NULL;
+ }
btrfs_put_delayed_ref(ref);
- kfree(extent_op);
count++;
-
cond_resched();
+ }
+
+ /*
+ * We don't want to include ref heads since we can have empty ref heads
+ * and those will drastically skew our runtime down since we just do
+ * accounting, no actual extent tree updates.
+ */
+ if (actual_count > 0) {
+ u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+ u64 avg;
+
+ /*
+ * We weigh the current average higher than our current runtime
+ * to avoid large swings in the average.
+ */
spin_lock(&delayed_refs->lock);
+ avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+ avg = div64_u64(avg, 4);
+ fs_info->avg_delayed_ref_runtime = avg;
+ spin_unlock(&delayed_refs->lock);
+ }
+ return 0;
+}
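
A quick aside on the averaging above: avg_delayed_ref_runtime is kept as a 3/4-weighted moving average, so a single slow batch only nudges it. A minimal userspace sketch of the same update (variable names here are hypothetical, not btrfs symbols):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for fs_info->avg_delayed_ref_runtime, in nanoseconds. */
static uint64_t avg_runtime = 1000000;		/* assume we start at 1 ms */

/* Same shape as above: new = (3 * old + sample) / 4. */
static void update_avg(uint64_t runtime_ns)
{
	avg_runtime = (avg_runtime * 3 + runtime_ns) / 4;
}

int main(void)
{
	update_avg(10000000);	/* one 10 ms batch */
	printf("avg after one slow batch: %llu ns\n",
	       (unsigned long long)avg_runtime);	/* 3.25 ms, not 10 ms */
	return 0;
}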
+
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_delayed_ref_node *entry;
+ int alt = 1;
+ u64 middle;
+ u64 first = 0, last = 0;
+
+ n = rb_first(root);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ first = entry->bytenr;
+ }
+ n = rb_last(root);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ last = entry->bytenr;
+ }
+ n = root->rb_node;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ WARN_ON(!entry->in_tree);
+
+ middle = entry->bytenr;
+
+ if (alt)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+
+ alt = 1 - alt;
+ }
+ return middle;
+}
+#endif
+
+static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+{
+ u64 num_bytes;
+
+ num_bytes = heads * (sizeof(struct btrfs_extent_item) +
+ sizeof(struct btrfs_extent_inline_ref));
+ if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ num_bytes += heads * sizeof(struct btrfs_tree_block_info);
+
+ /*
+ * We don't ever fill up leaves all the way so multiply by 2 just to be
+ * closer to what we're really going to want to use.
+ */
+ return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
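
For a feel of the numbers, a rough userspace version of the same estimate follows; the structure sizes below are assumptions for illustration only, the real ones come from ctree.h.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed on-disk sizes (illustrative only). */
	const uint64_t extent_item = 24;	/* refs + generation + flags */
	const uint64_t inline_ref = 9;		/* type byte + 64-bit offset */
	const uint64_t leaf_data = 16 * 1024;	/* 16K nodes, header ignored */
	uint64_t heads = 1000;

	/* Same estimate as heads_to_leaves() with SKINNY_METADATA set. */
	uint64_t num_bytes = heads * (extent_item + inline_ref);
	printf("%llu heads -> roughly %llu leaves\n",
	       (unsigned long long)heads,
	       (unsigned long long)(num_bytes / leaf_data));
	return 0;
}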
+
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *global_rsv;
+ u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+ u64 num_bytes;
+ int ret = 0;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ num_heads = heads_to_leaves(root, num_heads);
+ if (num_heads > 1)
+ num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes <<= 1;
+ global_rsv = &root->fs_info->global_block_rsv;
+
+ /*
+ * If we can't allocate any more chunks, let's make sure we have _lots_ of
+ * wiggle room since running delayed refs can create more delayed refs.
+ */
+ if (global_rsv->space_info->full)
+ num_bytes <<= 1;
+
+ spin_lock(&global_rsv->lock);
+ if (global_rsv->reserved <= num_bytes)
+ ret = 1;
+ spin_unlock(&global_rsv->lock);
+ return ret;
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 num_entries =
+ atomic_read(&trans->transaction->delayed_refs.num_entries);
+ u64 avg_runtime;
+ u64 val;
+
+ smp_mb();
+ avg_runtime = fs_info->avg_delayed_ref_runtime;
+ val = num_entries * avg_runtime;
+ if (num_entries * avg_runtime >= NSEC_PER_SEC)
+ return 1;
+ if (val >= NSEC_PER_SEC / 2)
+ return 2;
+
+ return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
+struct async_delayed_refs {
+ struct btrfs_root *root;
+ int count;
+ int error;
+ int sync;
+ struct completion wait;
+ struct btrfs_work work;
+};
+
+static void delayed_ref_async_start(struct btrfs_work *work)
+{
+ struct async_delayed_refs *async;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ async = container_of(work, struct async_delayed_refs, work);
+
+ trans = btrfs_join_transaction(async->root);
+ if (IS_ERR(trans)) {
+ async->error = PTR_ERR(trans);
+ goto done;
+ }
+
+ /*
+ * trans->sync means that when we call end_transaction, we won't
+ * wait on delayed refs
+ */
+ trans->sync = true;
+ ret = btrfs_run_delayed_refs(trans, async->root, async->count);
+ if (ret)
+ async->error = ret;
+
+ ret = btrfs_end_transaction(trans, async->root);
+ if (ret && !async->error)
+ async->error = ret;
+done:
+ if (async->sync)
+ complete(&async->wait);
+ else
+ kfree(async);
+}
+
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait)
+{
+ struct async_delayed_refs *async;
+ int ret;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return -ENOMEM;
+
+ async->root = root->fs_info->tree_root;
+ async->count = count;
+ async->error = 0;
+ if (wait)
+ async->sync = 1;
+ else
+ async->sync = 0;
+ init_completion(&async->wait);
+
+ btrfs_init_work(&async->work, delayed_ref_async_start,
+ NULL, NULL);
+
+ btrfs_queue_work(root->fs_info->extent_workers, &async->work);
+
+ if (wait) {
+ wait_for_completion(&async->wait);
+ ret = async->error;
+ kfree(async);
+ return ret;
}
- return count;
+ return 0;
}
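
The sync/async split above follows a familiar pattern: the submitter either waits on a completion or hands ownership of the request to the worker, which then frees it. A standalone pthread sketch of the same idea (no btrfs symbols, purely illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct async_req {
	int sync;		/* does the submitter wait? */
	int error;
	int finished;
	pthread_mutex_t lock;
	pthread_cond_t done;
};

static void *worker(void *arg)
{
	struct async_req *req = arg;

	req->error = 0;		/* pretend the delayed refs ran cleanly */
	if (req->sync) {
		pthread_mutex_lock(&req->lock);
		req->finished = 1;
		pthread_cond_signal(&req->done);
		pthread_mutex_unlock(&req->lock);
	} else {
		free(req);	/* fire-and-forget: the worker owns the request */
	}
	return NULL;
}

int main(void)
{
	struct async_req *req = calloc(1, sizeof(*req));
	pthread_t t;

	req->sync = 1;
	pthread_mutex_init(&req->lock, NULL);
	pthread_cond_init(&req->done, NULL);

	pthread_create(&t, NULL, worker, req);
	pthread_mutex_lock(&req->lock);
	while (!req->finished)
		pthread_cond_wait(&req->done, &req->lock);
	pthread_mutex_unlock(&req->lock);
	pthread_join(t, NULL);

	printf("sync run finished, error=%d\n", req->error);
	free(req);
	return 0;
}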
/*
@@ -2210,97 +2769,101 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
* 0, which means to process everything in the tree at the start
* of the run (but not newly added entries), or it can be some target
* number you'd like to process.
+ *
+ * Returns 0 on success or if called with an aborted transaction
+ * Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_node *ref;
- struct list_head cluster;
+ struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
int run_most = 0;
+ /* We'll clean this up in btrfs_cleanup_transaction */
+ if (trans->aborted)
+ return 0;
+
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
delayed_refs = &trans->transaction->delayed_refs;
- INIT_LIST_HEAD(&cluster);
-again:
- spin_lock(&delayed_refs->lock);
if (count == 0) {
- count = delayed_refs->num_entries * 2;
+ count = atomic_read(&delayed_refs->num_entries) * 2;
run_most = 1;
}
- while (1) {
- if (!(run_all || run_most) &&
- delayed_refs->num_heads_ready < 64)
- break;
- /*
- * go find something we can process in the rbtree. We start at
- * the beginning of the tree, and then build a cluster
- * of refs to process starting at the first one we are able to
- * lock
- */
- ret = btrfs_find_ref_cluster(trans, &cluster,
- delayed_refs->run_delayed_start);
- if (ret)
- break;
-
- ret = run_clustered_refs(trans, root, &cluster);
- BUG_ON(ret < 0);
-
- count -= min_t(unsigned long, ret, count);
-
- if (count == 0)
- break;
+again:
+#ifdef SCRAMBLE_DELAYED_REFS
+ delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+ ret = __btrfs_run_delayed_refs(trans, root, count);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
}
if (run_all) {
- node = rb_first(&delayed_refs->root);
- if (!node)
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
+ spin_lock(&delayed_refs->lock);
+ node = rb_first(&delayed_refs->href_root);
+ if (!node) {
+ spin_unlock(&delayed_refs->lock);
goto out;
+ }
count = (unsigned long)-1;
while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
- if (btrfs_delayed_ref_is_head(ref)) {
- struct btrfs_delayed_ref_head *head;
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ if (btrfs_delayed_ref_is_head(&head->node)) {
+ struct btrfs_delayed_ref_node *ref;
- head = btrfs_delayed_node_to_head(ref);
+ ref = &head->node;
atomic_inc(&ref->refs);
spin_unlock(&delayed_refs->lock);
+ /*
+ * Mutex was contended, block until it's
+ * released and try again
+ */
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(ref);
cond_resched();
goto again;
+ } else {
+ WARN_ON(1);
}
node = rb_next(node);
}
spin_unlock(&delayed_refs->lock);
- schedule_timeout(1);
+ cond_resched();
goto again;
}
out:
- spin_unlock(&delayed_refs->lock);
+ ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
+ if (ret)
+ return ret;
+ assert_qgroups_uptodate(trans);
return 0;
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
- int is_data)
+ int level, int is_data)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ extent_op = btrfs_alloc_delayed_extent_op();
if (!extent_op)
return -ENOMEM;
@@ -2308,10 +2871,12 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->update_flags = 1;
extent_op->update_key = 0;
extent_op->is_data = is_data ? 1 : 0;
+ extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+ ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+ num_bytes, extent_op);
if (ret)
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
return ret;
}
@@ -2327,56 +2892,58 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
struct rb_node *node;
int ret = 0;
- ret = -ENOENT;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(trans, bytenr);
- if (!head)
- goto out;
+ if (!head) {
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+ }
if (!mutex_trylock(&head->mutex)) {
atomic_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
- btrfs_release_path(root->fs_info->extent_root, path);
+ btrfs_release_path(path);
+ /*
+ * Mutex was contended, block until it's released and let
+ * caller try again
+ */
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(&head->node);
return -EAGAIN;
}
+ spin_unlock(&delayed_refs->lock);
- node = rb_prev(&head->node.rb_node);
- if (!node)
- goto out_unlock;
-
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-
- if (ref->bytenr != bytenr)
- goto out_unlock;
+ spin_lock(&head->lock);
+ node = rb_first(&head->ref_root);
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
- ret = 1;
- if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
- goto out_unlock;
+ /* If it's a shared ref we know a cross reference exists */
+ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
+ ret = 1;
+ break;
+ }
- data_ref = btrfs_delayed_node_to_data_ref(ref);
+ data_ref = btrfs_delayed_node_to_data_ref(ref);
- node = rb_prev(node);
- if (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- if (ref->bytenr == bytenr)
- goto out_unlock;
+ /*
+ * If our ref doesn't match the one we're currently looking at
+ * then we have a cross reference.
+ */
+ if (data_ref->root != root->root_key.objectid ||
+ data_ref->objectid != objectid ||
+ data_ref->offset != offset) {
+ ret = 1;
+ break;
+ }
}
-
- if (data_ref->root != root->root_key.objectid ||
- data_ref->objectid != objectid || data_ref->offset != offset)
- goto out_unlock;
-
- ret = 0;
-out_unlock:
+ spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
-out:
- spin_unlock(&delayed_refs->lock);
return ret;
}
@@ -2401,7 +2968,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = -ENOENT;
if (path->slots[0] == 0)
@@ -2487,130 +3054,10 @@ out:
return ret;
}
-#if 0
-int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, u32 nr_extents)
-{
- struct btrfs_key key;
- struct btrfs_file_extent_item *fi;
- u64 root_gen;
- u32 nritems;
- int i;
- int level;
- int ret = 0;
- int shared = 0;
-
- if (!root->ref_cows)
- return 0;
-
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
- shared = 0;
- root_gen = root->root_key.offset;
- } else {
- shared = 1;
- root_gen = trans->transid - 1;
- }
-
- level = btrfs_header_level(buf);
- nritems = btrfs_header_nritems(buf);
-
- if (level == 0) {
- struct btrfs_leaf_ref *ref;
- struct btrfs_extent_info *info;
-
- ref = btrfs_alloc_leaf_ref(root, nr_extents);
- if (!ref) {
- ret = -ENOMEM;
- goto out;
- }
-
- ref->root_gen = root_gen;
- ref->bytenr = buf->start;
- ref->owner = btrfs_header_owner(buf);
- ref->generation = btrfs_header_generation(buf);
- ref->nritems = nr_extents;
- info = ref->extents;
-
- for (i = 0; nr_extents > 0 && i < nritems; i++) {
- u64 disk_bytenr;
- btrfs_item_key_to_cpu(buf, &key, i);
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(buf, i,
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(buf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
- disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
- if (disk_bytenr == 0)
- continue;
-
- info->bytenr = disk_bytenr;
- info->num_bytes =
- btrfs_file_extent_disk_num_bytes(buf, fi);
- info->objectid = key.objectid;
- info->offset = key.offset;
- info++;
- }
-
- ret = btrfs_add_leaf_ref(root, ref, shared);
- if (ret == -EEXIST && shared) {
- struct btrfs_leaf_ref *old;
- old = btrfs_lookup_leaf_ref(root, ref->bytenr);
- BUG_ON(!old);
- btrfs_remove_leaf_ref(root, old);
- btrfs_free_leaf_ref(root, old);
- ret = btrfs_add_leaf_ref(root, ref, shared);
- }
- WARN_ON(ret);
- btrfs_free_leaf_ref(root, ref);
- }
-out:
- return ret;
-}
-
-/* when a block goes through cow, we update the reference counts of
- * everything that block points to. The internal pointers of the block
- * can be in just about any order, and it is likely to have clusters of
- * things that are close together and clusters of things that are not.
- *
- * To help reduce the seeks that come with updating all of these reference
- * counts, sort them by byte number before actual updates are done.
- *
- * struct refsort is used to match byte number to slot in the btree block.
- * we sort based on the byte number and then use the slot to actually
- * find the item.
- *
- * struct refsort is smaller than strcut btrfs_item and smaller than
- * struct btrfs_key_ptr. Since we're currently limited to the page size
- * for a btree block, there's no way for a kmalloc of refsorts for a
- * single node to be bigger than a page.
- */
-struct refsort {
- u64 bytenr;
- u32 slot;
-};
-
-/*
- * for passing into sort()
- */
-static int refsort_cmp(const void *a_void, const void *b_void)
-{
- const struct refsort *a = a_void;
- const struct refsort *b = b_void;
-
- if (a->bytenr < b->bytenr)
- return -1;
- if (a->bytenr > b->bytenr)
- return 1;
- return 0;
-}
-#endif
-
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc)
+ int full_backref, int inc, int no_quota)
{
u64 bytenr;
u64 num_bytes;
@@ -2623,13 +3070,17 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64);
+ u64, u64, u64, u64, u64, u64, int);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
- if (!root->ref_cows && level == 0)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
return 0;
if (inc)
@@ -2660,34 +3111,34 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset);
+ key.offset, no_quota);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = btrfs_level_size(root, level - 1);
ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, level - 1, 0);
+ parent, ref_root, level - 1, 0,
+ no_quota);
if (ret)
goto fail;
}
}
return 0;
fail:
- BUG();
return ret;
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, int full_backref, int no_quota)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, int full_backref, int no_quota)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2703,16 +3154,18 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
if (ret < 0)
goto fail;
- BUG_ON(ret);
+ BUG_ON(ret); /* Corruption */
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(extent_root, path);
+ btrfs_release_path(path);
fail:
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
return ret;
+ }
return 0;
}
@@ -2762,7 +3215,7 @@ again:
inode = lookup_free_space_inode(root, block_group, path);
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
ret = PTR_ERR(inode);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto out;
}
@@ -2779,6 +3232,13 @@ again:
goto again;
}
+ /* We've already setup this transaction, go ahead and exit */
+ if (block_group->cache_generation == trans->transid &&
+ i_size_read(inode)) {
+ dcs = BTRFS_DC_SETUP;
+ goto out_put;
+ }
+
/*
* We want to set the generation to 0, that way if anything goes wrong
* from here on out we know not to trust this cache when we load up next
@@ -2789,32 +3249,41 @@ again:
WARN_ON(ret);
if (i_size_read(inode) > 0) {
- ret = btrfs_truncate_free_space_cache(root, trans, path,
- inode);
+ ret = btrfs_check_trunc_cache_free_space(root,
+ &root->fs_info->global_block_rsv);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_truncate_free_space_cache(root, trans, inode);
if (ret)
goto out_put;
}
spin_lock(&block_group->lock);
- if (block_group->cached != BTRFS_CACHE_FINISHED) {
- /* We're not cached, don't bother trying to write stuff out */
+ if (block_group->cached != BTRFS_CACHE_FINISHED ||
+ !btrfs_test_opt(root, SPACE_CACHE) ||
+ block_group->delalloc_bytes) {
+ /*
+ * don't bother trying to write stuff out _if_
+ * a) we're not cached,
+ * b) we're with nospace_cache mount option,
+ * c) the block group still has pending delalloc (delalloc_bytes set).
+ */
dcs = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
goto out_put;
}
spin_unlock(&block_group->lock);
- num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+ /*
+ * Try to preallocate enough space based on how big the block group is.
+ * Keep in mind this has to include any pinned space which could end up
+ * taking up quite a bit since it's not folded into the other space
+ * cache.
+ */
+ num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
if (!num_pages)
num_pages = 1;
- /*
- * Just to make absolutely sure we have enough space, we're going to
- * preallocate 12 pages worth of space for each block group. In
- * practice we ought to use at most 8, but we need extra space so we can
- * add our header and have a terminator between the extents and the
- * bitmaps.
- */
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
@@ -2828,12 +3297,15 @@ again:
if (!ret)
dcs = BTRFS_DC_SETUP;
btrfs_free_reserved_data_space(inode, num_pages);
+
out_put:
iput(inode);
out_free:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
out:
spin_lock(&block_group->lock);
+ if (!ret && dcs == BTRFS_DC_SETUP)
+ block_group->cache_generation = trans->transid;
block_group->disk_cache_state = dcs;
spin_unlock(&block_group->lock);
@@ -2875,7 +3347,8 @@ again:
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2902,8 +3375,9 @@ again:
last = cache->key.objectid + cache->key.offset;
err = write_one_cache_group(trans, root, path, cache);
- BUG_ON(err);
btrfs_put_block_group(cache);
+ if (err) /* File system offline */
+ goto out;
}
while (1) {
@@ -2915,7 +3389,8 @@ again:
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
- BUG_ON(err);
+ if (err) /* File system offline */
+ goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2940,20 +3415,21 @@ again:
continue;
}
- btrfs_write_out_cache(root, trans, cache, path);
+ err = btrfs_write_out_cache(root, trans, cache, path);
/*
* If we didn't have an error then the cache state is still
* NEED_WRITE, so we can set it to WRITTEN.
*/
- if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+ if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
cache->disk_cache_state = BTRFS_DC_WRITTEN;
last = cache->key.objectid + cache->key.offset;
btrfs_put_block_group(cache);
}
+out:
btrfs_free_path(path);
- return 0;
+ return err;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2969,6 +3445,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
return readonly;
}
+static const char *alloc_name(u64 flags)
+{
+ switch (flags) {
+ case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
+ return "mixed";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "metadata";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "data";
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "system";
+ default:
+ WARN_ON(1);
+ return "invalid-combination";
+ };
+}
+
static int update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
struct btrfs_space_info **space_info)
@@ -2976,6 +3469,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
struct btrfs_space_info *found;
int i;
int factor;
+ int ret;
if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -2999,13 +3493,17 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+ if (ret) {
+ kfree(found);
+ return ret;
+ }
+
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
spin_lock_init(&found->lock);
- found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
- BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA);
+ found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
found->total_bytes = total_bytes;
found->disk_total = total_bytes * factor;
found->bytes_used = bytes_used;
@@ -3015,30 +3513,78 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_readonly = 0;
found->bytes_may_use = 0;
found->full = 0;
- found->force_alloc = 0;
+ found->force_alloc = CHUNK_ALLOC_NO_FORCE;
+ found->chunk_alloc = 0;
+ found->flush = 0;
+ init_waitqueue_head(&found->wait);
+
+ ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
+ info->space_info_kobj, "%s",
+ alloc_name(found->flags));
+ if (ret) {
+ kfree(found);
+ return ret;
+ }
+
*space_info = found;
list_add_rcu(&found->list, &info->space_info);
- atomic_set(&found->caching_threads, 0);
- return 0;
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ info->data_sinfo = found;
+
+ return ret;
}
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP);
- if (extra_flags) {
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- fs_info->avail_data_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
- fs_info->avail_metadata_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- fs_info->avail_system_alloc_bits |= extra_flags;
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ write_seqlock(&fs_info->profiles_lock);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
+}
+
+/*
+ * returns target flags in extended format or 0 if restripe for this
+ * chunk_type is not in progress
+ *
+ * should be called with either volume_mutex or balance_lock held
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ u64 target = 0;
+
+ if (!bctl)
+ return 0;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
}
+
+ return target;
}
-u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format. If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
+static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
/*
* we add in the count of missing devices because we want
@@ -3047,48 +3593,76 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
*/
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ u64 target;
+ u64 tmp;
+ /*
+ * see if restripe for this chunk_type is in progress, if so
+ * try to reduce to the target profile
+ */
+ spin_lock(&root->fs_info->balance_lock);
+ target = get_restripe_target(root->fs_info, flags);
+ if (target) {
+ /* pick target profile only if it's already available */
+ if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+ spin_unlock(&root->fs_info->balance_lock);
+ return extended_to_chunk(target);
+ }
+ }
+ spin_unlock(&root->fs_info->balance_lock);
+
+ /* First, mask out the RAID levels which aren't possible */
if (num_devices == 1)
- flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+ flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5);
+ if (num_devices < 3)
+ flags &= ~BTRFS_BLOCK_GROUP_RAID6;
if (num_devices < 4)
flags &= ~BTRFS_BLOCK_GROUP_RAID10;
- if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
- (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))) {
- flags &= ~BTRFS_BLOCK_GROUP_DUP;
- }
+ tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+ flags &= ~tmp;
- if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
- (flags & BTRFS_BLOCK_GROUP_RAID10)) {
- flags &= ~BTRFS_BLOCK_GROUP_RAID1;
- }
+ if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+ tmp = BTRFS_BLOCK_GROUP_RAID6;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+ tmp = BTRFS_BLOCK_GROUP_RAID5;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+ tmp = BTRFS_BLOCK_GROUP_RAID10;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+ tmp = BTRFS_BLOCK_GROUP_RAID1;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+ tmp = BTRFS_BLOCK_GROUP_RAID0;
- if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
- ((flags & BTRFS_BLOCK_GROUP_RAID1) |
- (flags & BTRFS_BLOCK_GROUP_RAID10) |
- (flags & BTRFS_BLOCK_GROUP_DUP)))
- flags &= ~BTRFS_BLOCK_GROUP_RAID0;
- return flags;
+ return extended_to_chunk(flags | tmp);
}
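
Worked example of the reduction above: if both the RAID1 and RAID10 bits are available, only the "strongest" one survives. A userspace sketch with made-up bit values (the real definitions live in ctree.h):

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit values only, not the real BTRFS_BLOCK_GROUP_* flags. */
#define BG_RAID0  (1ULL << 3)
#define BG_RAID1  (1ULL << 4)
#define BG_DUP    (1ULL << 5)
#define BG_RAID10 (1ULL << 6)
#define BG_RAID5  (1ULL << 7)
#define BG_RAID6  (1ULL << 8)

/* Same "keep only the strongest available profile" idea as above. */
static uint64_t reduce(uint64_t flags)
{
	uint64_t tmp = flags & (BG_DUP | BG_RAID0 | BG_RAID1 |
				BG_RAID5 | BG_RAID6 | BG_RAID10);
	flags &= ~tmp;

	if (tmp & BG_RAID6)
		tmp = BG_RAID6;
	else if (tmp & BG_RAID5)
		tmp = BG_RAID5;
	else if (tmp & BG_RAID10)
		tmp = BG_RAID10;
	else if (tmp & BG_RAID1)
		tmp = BG_RAID1;
	else if (tmp & BG_RAID0)
		tmp = BG_RAID0;

	return flags | tmp;
}

int main(void)
{
	uint64_t in = BG_RAID1 | BG_RAID10;

	printf("reduced 0x%llx -> 0x%llx (RAID10 wins)\n",
	       (unsigned long long)in, (unsigned long long)reduce(in));
	return 0;
}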
-static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
{
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits &
- root->fs_info->data_alloc_profile;
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits &
- root->fs_info->system_alloc_profile;
- else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits &
- root->fs_info->metadata_alloc_profile;
+ unsigned seq;
+ u64 flags;
+
+ do {
+ flags = orig_flags;
+ seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= root->fs_info->avail_data_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= root->fs_info->avail_system_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&root->fs_info->profiles_lock, seq));
+
return btrfs_reduce_alloc_profile(root, flags);
}
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
u64 flags;
+ u64 ret;
if (data)
flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3097,13 +3671,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
else
flags = BTRFS_BLOCK_GROUP_METADATA;
- return get_alloc_profile(root, flags);
-}
-
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
- BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
- BTRFS_BLOCK_GROUP_DATA);
+ ret = get_alloc_profile(root, flags);
+ return ret;
}
/*
@@ -3114,18 +3683,19 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 used;
int ret = 0, committed = 0, alloc_chunk = 1;
/* make sure bytes are sectorsize aligned */
- bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ bytes = ALIGN(bytes, root->sectorsize);
- if (root == root->fs_info->tree_root) {
- alloc_chunk = 0;
+ if (btrfs_is_free_space_inode(inode)) {
committed = 1;
+ ASSERT(current->journal_info);
}
- data_sinfo = BTRFS_I(inode)->space_info;
+ data_sinfo = fs_info->data_sinfo;
if (!data_sinfo)
goto alloc;
@@ -3146,17 +3716,27 @@ again:
if (!data_sinfo->full && alloc_chunk) {
u64 alloc_target;
- data_sinfo->force_alloc = 1;
+ data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
spin_unlock(&data_sinfo->lock);
alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
- trans = btrfs_join_transaction(root, 1);
+ /*
+ * It is ugly that we don't call nolock join
+ * transaction for the free space inode case here.
+ * But it is safe because we only do the data space
+ * reservation for the free space cache in the
+ * transaction context; the common join transaction
+ * just increases the counter of the current transaction
+ * handle and doesn't try to acquire the trans_lock of
+ * the fs.
+ */
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- bytes + 2 * 1024 * 1024,
- alloc_target, 0);
+ alloc_target,
+ CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans, root);
if (ret < 0) {
if (ret != -ENOSPC)
@@ -3165,19 +3745,28 @@ alloc:
goto commit_trans;
}
- if (!data_sinfo) {
- btrfs_set_inode_space_info(root, inode);
- data_sinfo = BTRFS_I(inode)->space_info;
- }
+ if (!data_sinfo)
+ data_sinfo = fs_info->data_sinfo;
+
goto again;
}
+
+ /*
+ * If we don't have enough pinned space to deal with this
+ * allocation don't bother committing the transaction.
+ */
+ if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
+ bytes) < 0)
+ committed = 1;
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
commit_trans:
- if (!committed && !root->fs_info->open_ioctl_trans) {
+ if (!committed &&
+ !atomic_read(&root->fs_info->open_ioctl_trans)) {
committed = 1;
- trans = btrfs_join_transaction(root, 1);
+
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
@@ -3186,31 +3775,21 @@ commit_trans:
goto again;
}
-#if 0 /* I hope we never need this code again, just in case */
- printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
- "%llu bytes_reserved, " "%llu bytes_pinned, "
- "%llu bytes_readonly, %llu may use %llu total\n",
- (unsigned long long)bytes,
- (unsigned long long)data_sinfo->bytes_used,
- (unsigned long long)data_sinfo->bytes_reserved,
- (unsigned long long)data_sinfo->bytes_pinned,
- (unsigned long long)data_sinfo->bytes_readonly,
- (unsigned long long)data_sinfo->bytes_may_use,
- (unsigned long long)data_sinfo->total_bytes);
-#endif
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
- BTRFS_I(inode)->reserved_bytes += bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
return 0;
}
/*
- * called when we are clearing an delalloc extent from the
- * inode's io_tree or there was an error for whatever reason
- * after calling btrfs_check_data_free_space
+ * Called if we need to clear a data reservation for this inode.
*/
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
{
@@ -3218,12 +3797,14 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
struct btrfs_space_info *data_sinfo;
/* make sure bytes are sectorsize aligned */
- bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ bytes = ALIGN(bytes, root->sectorsize);
- data_sinfo = BTRFS_I(inode)->space_info;
+ data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
+ WARN_ON(data_sinfo->bytes_may_use < bytes);
data_sinfo->bytes_may_use -= bytes;
- BTRFS_I(inode)->reserved_bytes -= bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ data_sinfo->flags, bytes, 0);
spin_unlock(&data_sinfo->lock);
}
@@ -3235,69 +3816,159 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
- found->force_alloc = 1;
+ found->force_alloc = CHUNK_ALLOC_FORCE;
}
rcu_read_unlock();
}
+static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
+{
+ return (global->size << 1);
+}
+
static int should_alloc_chunk(struct btrfs_root *root,
- struct btrfs_space_info *sinfo, u64 alloc_bytes)
+ struct btrfs_space_info *sinfo, int force)
{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+ u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
u64 thresh;
- if (sinfo->bytes_used + sinfo->bytes_reserved +
- alloc_bytes + 256 * 1024 * 1024 < num_bytes)
- return 0;
+ if (force == CHUNK_ALLOC_FORCE)
+ return 1;
- if (sinfo->bytes_used + sinfo->bytes_reserved +
- alloc_bytes < div_factor(num_bytes, 8))
- return 0;
+ /*
+ * We need to take into account the global rsv because for all intents
+ * and purposes it's used space. Don't worry about locking the
+ * global_rsv, it doesn't change except when the transaction commits.
+ */
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+ num_allocated += calc_global_rsv_need_space(global_rsv);
- thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
- thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+ /*
+ * in limited mode, we want to have some free space up to
+ * about 1% of the FS size.
+ */
+ if (force == CHUNK_ALLOC_LIMITED) {
+ thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
+ thresh = max_t(u64, 64 * 1024 * 1024,
+ div_factor_fine(thresh, 1));
- if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
- return 0;
+ if (num_bytes - num_allocated < thresh)
+ return 1;
+ }
+ if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
+ return 0;
return 1;
}
+static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+{
+ u64 num_dev;
+
+ if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6))
+ num_dev = root->fs_info->fs_devices->rw_devices;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1)
+ num_dev = 2;
+ else
+ num_dev = 1; /* DUP or single */
+
+ /* metadata for updating devices and chunk tree */
+ return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+}
+
+static void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 type)
+{
+ struct btrfs_space_info *info;
+ u64 left;
+ u64 thresh;
+
+ info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ spin_lock(&info->lock);
+ left = info->total_bytes - info->bytes_used - info->bytes_pinned -
+ info->bytes_reserved - info->bytes_readonly;
+ spin_unlock(&info->lock);
+
+ thresh = get_system_chunk_thresh(root, type);
+ if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
+ left, thresh, type);
+ dump_space_info(info, 0, 0);
+ }
+
+ if (left < thresh) {
+ u64 flags;
+
+ flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+ btrfs_alloc_chunk(trans, root, flags);
+ }
+}
+
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 alloc_bytes,
- u64 flags, int force)
+ struct btrfs_root *extent_root, u64 flags, int force)
{
struct btrfs_space_info *space_info;
struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ int wait_for_alloc = 0;
int ret = 0;
- mutex_lock(&fs_info->chunk_mutex);
-
- flags = btrfs_reduce_alloc_profile(extent_root, flags);
+ /* Don't re-enter if we're already allocating a chunk */
+ if (trans->allocating_chunk)
+ return -ENOSPC;
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
0, 0, &space_info);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
}
- BUG_ON(!space_info);
+ BUG_ON(!space_info); /* Logic error */
+again:
spin_lock(&space_info->lock);
- if (space_info->force_alloc)
- force = 1;
+ if (force < space_info->force_alloc)
+ force = space_info->force_alloc;
if (space_info->full) {
+ if (should_alloc_chunk(extent_root, space_info, force))
+ ret = -ENOSPC;
+ else
+ ret = 0;
spin_unlock(&space_info->lock);
- goto out;
+ return ret;
}
- if (!force && !should_alloc_chunk(extent_root, space_info,
- alloc_bytes)) {
+ if (!should_alloc_chunk(extent_root, space_info, force)) {
spin_unlock(&space_info->lock);
- goto out;
+ return 0;
+ } else if (space_info->chunk_alloc) {
+ wait_for_alloc = 1;
+ } else {
+ space_info->chunk_alloc = 1;
}
+
spin_unlock(&space_info->lock);
+ mutex_lock(&fs_info->chunk_mutex);
+
+ /*
+ * The chunk_mutex is held throughout the entirety of a chunk
+ * allocation, so once we've acquired the chunk_mutex we know that the
+ * other guy is done and we need to recheck and see if we should
+ * allocate.
+ */
+ if (wait_for_alloc) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ wait_for_alloc = 0;
+ goto again;
+ }
+
+ trans->allocating_chunk = true;
+
/*
* If we have mixed data/metadata chunks we want to make sure we keep
* allocating mixed chunks instead of individual chunks.
@@ -3317,123 +3988,472 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
force_metadata_allocation(fs_info);
}
+ /*
+ * Check if we have enough space in SYSTEM chunk because we may need
+ * to update devices.
+ */
+ check_system_chunk(trans, extent_root, flags);
+
ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ trans->allocating_chunk = false;
+
spin_lock(&space_info->lock);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out;
if (ret)
space_info->full = 1;
else
ret = 1;
- space_info->force_alloc = 0;
- spin_unlock(&space_info->lock);
+
+ space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
- mutex_unlock(&extent_root->fs_info->chunk_mutex);
+ space_info->chunk_alloc = 0;
+ spin_unlock(&space_info->lock);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
+static int can_overcommit(struct btrfs_root *root,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 space_size;
+ u64 avail;
+ u64 used;
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly;
+
+ /*
+ * We only want to allow over committing if we have lots of actual space
+ * free, but if we don't have enough space to handle the global reserve
+ * space then we could end up having a real enospc problem when trying
+ * to allocate a chunk or some other such important allocation.
+ */
+ spin_lock(&global_rsv->lock);
+ space_size = calc_global_rsv_need_space(global_rsv);
+ spin_unlock(&global_rsv->lock);
+ if (used + space_size >= space_info->total_bytes)
+ return 0;
+
+ used += space_info->bytes_may_use;
+
+ spin_lock(&root->fs_info->free_chunk_lock);
+ avail = root->fs_info->free_chunk_space;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
+ /*
+ * If we have dup, raid1 or raid10 then only half of the free
+ * space is actually useable. For raid56, the space info used
+ * doesn't include the parity drive, so we don't have to
+ * change the math
+ */
+ if (profile & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ avail >>= 1;
+
+ /*
+ * If we aren't flushing all things, let us overcommit up to
+ * 1/2 of the space. If we can flush, don't let us overcommit
+ * too much, let it overcommit up to 1/8 of the space.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ avail >>= 3;
+ else
+ avail >>= 1;
+
+ if (used + bytes < space_info->total_bytes + avail)
+ return 1;
+ return 0;
+}
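(Editorial aside, not part of the patch: the heuristic above reduces to a few shifts, i.e. halve the unallocated device space for mirrored profiles, then permit overcommit of up to 1/8 of it when a full flush is possible, or 1/2 otherwise. A small self-contained sketch of that decision; the profile bits and flush modes below are stand-ins, not the real btrfs definitions.)

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the btrfs profile bits and flush modes. */
#define PROFILE_DUP	(1ULL << 0)
#define PROFILE_RAID1	(1ULL << 1)
#define PROFILE_RAID10	(1ULL << 2)

enum flush_mode { FLUSH_ALL, NO_FLUSH };

static bool can_overcommit(uint64_t used, uint64_t total, uint64_t free_chunk,
			   uint64_t bytes, uint64_t profile, enum flush_mode flush)
{
	uint64_t avail = free_chunk;

	/* Mirrored profiles only give us half of the raw free space. */
	if (profile & (PROFILE_DUP | PROFILE_RAID1 | PROFILE_RAID10))
		avail >>= 1;

	/* Allow 1/8 of the remainder when we can flush, 1/2 otherwise. */
	if (flush == FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	return used + bytes < total + avail;
}

int main(void)
{
	/* 8 GiB space_info, 7 GiB used, 4 GiB unallocated, asking for 2 GiB. */
	uint64_t g = 1024ULL * 1024 * 1024;

	printf("overcommit allowed: %d\n",
	       can_overcommit(7 * g, 8 * g, 4 * g, 2 * g,
			      PROFILE_RAID1, FLUSH_ALL));
	return 0;
}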
+
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+ unsigned long nr_pages, int nr_items)
+{
+ struct super_block *sb = root->fs_info->sb;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ } else {
+ /*
+ * We needn't worry about the filesystem going from r/w to r/o even
+ * though we don't acquire the ->s_umount mutex, because the filesystem
+ * should guarantee that the delalloc inode list is empty once the
+ * filesystem is read-only (all dirty pages have been written to disk).
+ */
+ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
+ if (!current->journal_info)
+ btrfs_wait_ordered_roots(root->fs_info, nr_items);
+ }
+}
+
+static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
+{
+ u64 bytes;
+ int nr;
+
+ bytes = btrfs_calc_trans_metadata_size(root, 1);
+ nr = (int)div64_u64(to_reclaim, bytes);
+ if (!nr)
+ nr = 1;
+ return nr;
+}
+
+#define EXTENT_SIZE_PER_ITEM (256 * 1024)
+
/*
* shrink metadata reservation for delalloc
*/
-static int shrink_delalloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 to_reclaim, int sync)
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+ bool wait_ordered)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
- u64 reserved;
+ struct btrfs_trans_handle *trans;
+ u64 delalloc_bytes;
u64 max_reclaim;
- u64 reclaimed = 0;
long time_left;
- int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
- int loops = 0;
- unsigned long progress;
+ unsigned long nr_pages;
+ int loops;
+ int items;
+ enum btrfs_reserve_flush_enum flush;
+
+ /* Calc the number of pages we need to flush for the space reservation */
+ items = calc_reclaim_items_nr(root, to_reclaim);
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
space_info = block_rsv->space_info;
- smp_mb();
- reserved = space_info->bytes_reserved;
- progress = space_info->reservation_progress;
-
- if (reserved == 0)
- return 0;
-
- max_reclaim = min(reserved, to_reclaim);
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
+ if (delalloc_bytes == 0) {
+ if (trans)
+ return;
+ if (wait_ordered)
+ btrfs_wait_ordered_roots(root->fs_info, items);
+ return;
+ }
- while (loops < 1024) {
- /* have the flusher threads jump in and do some IO */
- smp_mb();
- nr_pages = min_t(unsigned long, nr_pages,
- root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
- writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
+ loops = 0;
+ while (delalloc_bytes && loops < 3) {
+ max_reclaim = min(delalloc_bytes, to_reclaim);
+ nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+ btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
+ /*
+ * We need to wait for the async pages to actually start before
+ * we do anything.
+ */
+ max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
+ if (!max_reclaim)
+ goto skip_async;
+ if (max_reclaim <= nr_pages)
+ max_reclaim = 0;
+ else
+ max_reclaim -= nr_pages;
+
+ wait_event(root->fs_info->async_submit_wait,
+ atomic_read(&root->fs_info->async_delalloc_pages) <=
+ (int)max_reclaim);
+skip_async:
+ if (!trans)
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ else
+ flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
- if (reserved > space_info->bytes_reserved)
- reclaimed += reserved - space_info->bytes_reserved;
- reserved = space_info->bytes_reserved;
+ if (can_overcommit(root, space_info, orig, flush)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
spin_unlock(&space_info->lock);
loops++;
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_roots(root->fs_info, items);
+ } else {
+ time_left = schedule_timeout_killable(1);
+ if (time_left)
+ break;
+ }
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
+ }
+}
- if (reserved == 0 || reclaimed >= max_reclaim)
- break;
+/**
+ * may_commit_transaction - possibly commit the transaction if it's ok to
+ * @root - the root we're allocating for
+ * @space_info - the space_info we are trying to reserve from
+ * @bytes - the number of bytes we want to reserve
+ * @force - force the commit
+ *
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does. Otherwise it
+ * will return -ENOSPC.
+ */
+static int may_commit_transaction(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 bytes, int force)
+{
+ struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+ struct btrfs_trans_handle *trans;
- if (trans && trans->transaction->blocked)
- return -EAGAIN;
+ trans = (struct btrfs_trans_handle *)current->journal_info;
+ if (trans)
+ return -EAGAIN;
- time_left = schedule_timeout_interruptible(1);
+ if (force)
+ goto commit;
- /* We were interrupted, exit */
- if (time_left)
- break;
+ /* See if there is enough pinned space to make this reservation */
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes) >= 0)
+ goto commit;
- /* we've kicked the IO a few times, if anything has been freed,
- * exit. There is no sense in looping here for a long time
- * when we really need to commit the transaction, or there are
- * just too many writers without enough free space
- */
+ /*
+ * See if there is some space in the delayed insertion reservation for
+ * this reservation.
+ */
+ if (space_info != delayed_rsv->space_info)
+ return -ENOSPC;
- if (loops > 3) {
- smp_mb();
- if (progress != space_info->reservation_progress)
- break;
+ spin_lock(&delayed_rsv->lock);
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes - delayed_rsv->size) >= 0) {
+ spin_unlock(&delayed_rsv->lock);
+ return -ENOSPC;
+ }
+ spin_unlock(&delayed_rsv->lock);
+
+commit:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return -ENOSPC;
+
+ return btrfs_commit_transaction(trans, root);
+}
+
+enum flush_state {
+ FLUSH_DELAYED_ITEMS_NR = 1,
+ FLUSH_DELAYED_ITEMS = 2,
+ FLUSH_DELALLOC = 3,
+ FLUSH_DELALLOC_WAIT = 4,
+ ALLOC_CHUNK = 5,
+ COMMIT_TRANS = 6,
+};
+
+static int flush_space(struct btrfs_root *root,
+ struct btrfs_space_info *space_info, u64 num_bytes,
+ u64 orig_bytes, int state)
+{
+ struct btrfs_trans_handle *trans;
+ int nr;
+ int ret = 0;
+
+ switch (state) {
+ case FLUSH_DELAYED_ITEMS_NR:
+ case FLUSH_DELAYED_ITEMS:
+ if (state == FLUSH_DELAYED_ITEMS_NR)
+ nr = calc_reclaim_items_nr(root, num_bytes) * 2;
+ else
+ nr = -1;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_run_delayed_items_nr(trans, root, nr);
+ btrfs_end_transaction(trans, root);
+ break;
+ case FLUSH_DELALLOC:
+ case FLUSH_DELALLOC_WAIT:
+ shrink_delalloc(root, num_bytes * 2, orig_bytes,
+ state == FLUSH_DELALLOC_WAIT);
+ break;
+ case ALLOC_CHUNK:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
}
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+ btrfs_end_transaction(trans, root);
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
+ case COMMIT_TRANS:
+ ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+ break;
+ default:
+ ret = -ENOSPC;
+ break;
+ }
+ return ret;
+}
+
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
+ struct btrfs_space_info *space_info)
+{
+ u64 used;
+ u64 expected;
+ u64 to_reclaim;
+
+ to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
+ 16 * 1024 * 1024);
+ spin_lock(&space_info->lock);
+ if (can_overcommit(root, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL)) {
+ to_reclaim = 0;
+ goto out;
}
- return reclaimed >= to_reclaim;
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (can_overcommit(root, space_info, 1024 * 1024,
+ BTRFS_RESERVE_FLUSH_ALL))
+ expected = div_factor_fine(space_info->total_bytes, 95);
+ else
+ expected = div_factor_fine(space_info->total_bytes, 90);
+
+ if (used > expected)
+ to_reclaim = used - expected;
+ else
+ to_reclaim = 0;
+ to_reclaim = min(to_reclaim, space_info->bytes_may_use +
+ space_info->bytes_reserved);
+out:
+ spin_unlock(&space_info->lock);
+
+ return to_reclaim;
}
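(Editorial aside, not part of the patch: the helper above first probes whether a small reservation, min(ncpus * 1 MiB, 16 MiB), could still be overcommitted; only when it cannot does it aim to bring usage back under roughly 90-95% of the space. A rough model of that arithmetic, with a simplified stand-in for can_overcommit():)

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Equivalent of div_factor_fine(total, percent): total * percent / 100. */
static uint64_t div_factor_fine(uint64_t total, unsigned percent)
{
	return total * percent / 100;
}

/* Placeholder: here, overcommit succeeds while usage stays below 97%. */
static bool can_overcommit(uint64_t used, uint64_t total, uint64_t bytes)
{
	return used + bytes < div_factor_fine(total, 97);
}

static uint64_t calc_reclaim_size(uint64_t used, uint64_t total, unsigned ncpus)
{
	uint64_t probe = ncpus * 1024ULL * 1024;
	uint64_t expected;

	if (probe > 16ULL * 1024 * 1024)
		probe = 16ULL * 1024 * 1024;

	/* Nothing to reclaim if a small reservation would still fit. */
	if (can_overcommit(used, total, probe))
		return 0;

	/* Otherwise aim for ~95% (or 90%) occupancy, as in the code above. */
	expected = div_factor_fine(total,
				   can_overcommit(used, total, 1 << 20) ? 95 : 90);
	return used > expected ? used - expected : 0;
}

int main(void)
{
	uint64_t g = 1024ULL * 1024 * 1024;

	/* 8 GiB of metadata space, 8000 MiB in use -> reclaim ~628 MiB. */
	printf("%llu MiB\n", (unsigned long long)
	       (calc_reclaim_size(8000ULL << 20, 8 * g, 4) >> 20));
	return 0;
}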
-/*
- * Retries tells us how many times we've called reserve_metadata_bytes. The
- * idea is if this is the first call (retries == 0) then we will add to our
- * reserved count if we can't make the allocation in order to hold our place
- * while we go and try and free up space. That way for retries > 1 we don't try
- * and add space, we just check to see if the amount of unused space is >= the
- * total space, meaning that our reservation is valid.
+static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info, u64 used)
+{
+ return (used >= div_factor_fine(space_info->total_bytes, 98) &&
+ !btrfs_fs_closing(fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info)
+{
+ u64 used;
+
+ spin_lock(&space_info->lock);
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (need_do_async_reclaim(space_info, fs_info, used)) {
+ spin_unlock(&space_info->lock);
+ return 1;
+ }
+ spin_unlock(&space_info->lock);
+
+ return 0;
+}
+
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 to_reclaim;
+ int flush_state;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ if (!to_reclaim)
+ return;
+
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ do {
+ flush_space(fs_info->fs_root, space_info, to_reclaim,
+ to_reclaim, flush_state);
+ flush_state++;
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+ return;
+ } while (flush_state <= COMMIT_TRANS);
+
+ if (btrfs_need_do_async_reclaim(space_info, fs_info))
+ queue_work(system_unbound_wq, work);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+ INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
*
- * However if we don't intend to retry this reservation, pass -1 as retries so
- * that it short circuits this logic.
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
*/
-static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 orig_bytes, int flush)
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
- u64 unused;
+ u64 used;
u64 num_bytes = orig_bytes;
- int retries = 0;
+ int flush_state = FLUSH_DELAYED_ITEMS_NR;
int ret = 0;
- bool reserved = false;
- bool committed = false;
+ bool flushing = false;
again:
- ret = -ENOSPC;
- if (reserved)
- num_bytes = 0;
-
+ ret = 0;
spin_lock(&space_info->lock);
- unused = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
+ /*
+ * We only want to wait if somebody other than us is flushing and we
+ * are actually allowed to flush all things.
+ */
+ while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+ space_info->flush) {
+ spin_unlock(&space_info->lock);
+ /*
+ * If we have a trans handle we can't wait because the flusher
+ * may have to commit the transaction, which would mean we would
+ * deadlock since we are waiting for the flusher to finish, but
+ * hold the current transaction open.
+ */
+ if (current->journal_info)
+ return -EAGAIN;
+ ret = wait_event_killable(space_info->wait, !space_info->flush);
+ /* Must have been killed, return */
+ if (ret)
+ return -EINTR;
+
+ spin_lock(&space_info->lock);
+ }
+
+ ret = -ENOSPC;
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
/*
* The idea here is that we've not already over-reserved the block group
@@ -3442,11 +4462,11 @@ again:
* lets start flushing stuff first and then come back and try to make
* our reservation.
*/
- if (unused <= space_info->total_bytes) {
- unused = space_info->total_bytes - unused;
- if (unused >= num_bytes) {
- if (!reserved)
- space_info->bytes_reserved += orig_bytes;
+ if (used <= space_info->total_bytes) {
+ if (used + orig_bytes <= space_info->total_bytes) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info", space_info->flags, orig_bytes, 1);
ret = 0;
} else {
/*
@@ -3462,91 +4482,102 @@ again:
* amount plus the amount of bytes that we need for this
* reservation.
*/
- num_bytes = unused - space_info->total_bytes +
- (orig_bytes * (retries + 1));
+ num_bytes = used - space_info->total_bytes +
+ (orig_bytes * 2);
+ }
+
+ if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ space_info->flags, orig_bytes,
+ 1);
+ ret = 0;
}
/*
* Couldn't make our reservation, save our place so while we're trying
* to reclaim space we can actually use it instead of somebody else
* stealing it from us.
+ *
+ * We make the other tasks wait for the flush only when we can flush
+ * all things.
*/
- if (ret && !reserved) {
- space_info->bytes_reserved += orig_bytes;
- reserved = true;
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ flushing = true;
+ space_info->flush = 1;
+ } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ used += orig_bytes;
+ if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ !work_busy(&root->fs_info->async_reclaim_work))
+ queue_work(system_unbound_wq,
+ &root->fs_info->async_reclaim_work);
}
-
spin_unlock(&space_info->lock);
- if (!ret)
- return 0;
-
- if (!flush)
+ if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
goto out;
- /*
- * We do synchronous shrinking since we don't actually unreserve
- * metadata until after the IO is completed.
- */
- ret = shrink_delalloc(trans, root, num_bytes, 1);
- if (ret > 0)
- return 0;
- else if (ret < 0)
- goto out;
-
- /*
- * So if we were overcommitted it's possible that somebody else flushed
- * out enough space and we simply didn't have enough space to reclaim,
- * so go back around and try again.
- */
- if (retries < 2) {
- retries++;
- goto again;
- }
+ ret = flush_space(root, space_info, num_bytes, orig_bytes,
+ flush_state);
+ flush_state++;
- spin_lock(&space_info->lock);
/*
- * Not enough space to be reclaimed, don't bother committing the
- * transaction.
+ * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
+ * would happen. So skip delalloc flush.
*/
- if (space_info->bytes_pinned < orig_bytes)
- ret = -ENOSPC;
- spin_unlock(&space_info->lock);
- if (ret)
- goto out;
-
- ret = -EAGAIN;
- if (trans || committed)
- goto out;
+ if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ (flush_state == FLUSH_DELALLOC ||
+ flush_state == FLUSH_DELALLOC_WAIT))
+ flush_state = ALLOC_CHUNK;
- ret = -ENOSPC;
- trans = btrfs_join_transaction(root, 1);
- if (IS_ERR(trans))
- goto out;
- ret = btrfs_commit_transaction(trans, root);
- if (!ret) {
- trans = NULL;
- committed = true;
+ if (!ret)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ flush_state < COMMIT_TRANS)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ flush_state <= COMMIT_TRANS)
goto again;
- }
out:
- if (reserved) {
+ if (ret == -ENOSPC &&
+ unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+ struct btrfs_block_rsv *global_rsv =
+ &root->fs_info->global_block_rsv;
+
+ if (block_rsv != global_rsv &&
+ !block_rsv_use_bytes(global_rsv, orig_bytes))
+ ret = 0;
+ }
+ if (ret == -ENOSPC)
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info:enospc",
+ space_info->flags, orig_bytes, 1);
+ if (flushing) {
spin_lock(&space_info->lock);
- space_info->bytes_reserved -= orig_bytes;
+ space_info->flush = 0;
+ wake_up_all(&space_info->wait);
spin_unlock(&space_info->lock);
}
-
return ret;
}
-static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static struct btrfs_block_rsv *get_block_rsv(
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root)
{
- struct btrfs_block_rsv *block_rsv;
- if (root->ref_cows)
+ struct btrfs_block_rsv *block_rsv = NULL;
+
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
block_rsv = trans->block_rsv;
- else
+
+ if (root == root->fs_info->csum_root && trans->adding_csums)
+ block_rsv = trans->block_rsv;
+
+ if (root == root->fs_info->uuid_root)
+ block_rsv = trans->block_rsv;
+
+ if (!block_rsv)
block_rsv = root->block_rsv;
if (!block_rsv)
@@ -3582,8 +4613,34 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
-void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
- struct btrfs_block_rsv *dest, u64 num_bytes)
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (global_rsv->space_info != dest->space_info)
+ return -ENOSPC;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, min_factor);
+ if (global_rsv->reserved < min_bytes + num_bytes) {
+ spin_unlock(&global_rsv->lock);
+ return -ENOSPC;
+ }
+ global_rsv->reserved -= num_bytes;
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = 0;
+ spin_unlock(&global_rsv->lock);
+
+ block_rsv_add_bytes(dest, num_bytes, 1);
+ return 0;
+}
+
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ struct btrfs_block_rsv *dest, u64 num_bytes)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3617,8 +4674,9 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
}
if (num_bytes) {
spin_lock(&space_info->lock);
- space_info->bytes_reserved -= num_bytes;
- space_info->reservation_progress++;
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
spin_unlock(&space_info->lock);
}
}
@@ -3637,16 +4695,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
return 0;
}
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
- atomic_set(&rsv->usage, 1);
- rsv->priority = 6;
- INIT_LIST_HEAD(&rsv->list);
+ rsv->type = type;
}
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3655,7 +4712,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
if (!block_rsv)
return NULL;
- btrfs_init_block_rsv(block_rsv);
+ btrfs_init_block_rsv(block_rsv, type);
block_rsv->space_info = __find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
return block_rsv;
@@ -3664,38 +4721,22 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- if (rsv && atomic_dec_and_test(&rsv->usage)) {
- btrfs_block_rsv_release(root, rsv, (u64)-1);
- if (!rsv->durable)
- kfree(rsv);
- }
-}
-
-/*
- * make the block_rsv struct be able to capture freed space.
- * the captured space will re-add to the the block_rsv struct
- * after transaction commit
- */
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv)
-{
- block_rsv->durable = 1;
- mutex_lock(&fs_info->durable_block_rsv_mutex);
- list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
- mutex_unlock(&fs_info->durable_block_rsv_mutex);
+ if (!rsv)
+ return;
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ kfree(rsv);
}
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
int ret;
if (num_bytes == 0)
return 0;
- ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 1);
return 0;
@@ -3704,56 +4745,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int min_factor)
+int btrfs_block_rsv_check(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int min_factor)
{
u64 num_bytes = 0;
- int commit_trans = 0;
int ret = -ENOSPC;
if (!block_rsv)
return 0;
spin_lock(&block_rsv->lock);
- if (min_factor > 0)
- num_bytes = div_factor(block_rsv->size, min_factor);
- if (min_reserved > num_bytes)
- num_bytes = min_reserved;
+ num_bytes = div_factor(block_rsv->size, min_factor);
+ if (block_rsv->reserved >= num_bytes)
+ ret = 0;
+ spin_unlock(&block_rsv->lock);
- if (block_rsv->reserved >= num_bytes) {
+ return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ num_bytes = min_reserved;
+ if (block_rsv->reserved >= num_bytes)
ret = 0;
- } else {
+ else
num_bytes -= block_rsv->reserved;
- if (block_rsv->durable &&
- block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
- commit_trans = 1;
- }
spin_unlock(&block_rsv->lock);
+
if (!ret)
return 0;
- if (block_rsv->refill_used) {
- ret = reserve_metadata_bytes(trans, root, block_rsv,
- num_bytes, 0);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
- return 0;
- }
- }
-
- if (commit_trans) {
- if (trans)
- return -EAGAIN;
-
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- ret = btrfs_commit_transaction(trans, root);
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
return 0;
}
- return -ENOSPC;
+ return ret;
}
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3768,10 +4805,11 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
u64 num_bytes)
{
struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
- if (global_rsv->full || global_rsv == block_rsv ||
+ if (global_rsv == block_rsv ||
block_rsv->space_info != global_rsv->space_info)
global_rsv = NULL;
- block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+ block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+ num_bytes);
}
/*
@@ -3785,24 +4823,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
u64 num_bytes;
u64 meta_used;
u64 data_used;
- int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
-#if 0
- /*
- * per tree used space accounting can be inaccuracy, so we
- * can't rely on it.
- */
- spin_lock(&fs_info->extent_root->accounting_lock);
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
- spin_unlock(&fs_info->extent_root->accounting_lock);
-
- spin_lock(&fs_info->csum_root->accounting_lock);
- num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
- spin_unlock(&fs_info->csum_root->accounting_lock);
+ int csum_size = btrfs_super_csum_size(fs_info->super_copy);
- spin_lock(&fs_info->tree_root->accounting_lock);
- num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
- spin_unlock(&fs_info->tree_root->accounting_lock);
-#endif
sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
spin_lock(&sinfo->lock);
data_used = sinfo->bytes_used;
@@ -3833,10 +4855,10 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
num_bytes = calc_global_metadata_size(fs_info);
- spin_lock(&block_rsv->lock);
spin_lock(&sinfo->lock);
+ spin_lock(&block_rsv->lock);
- block_rsv->size = num_bytes;
+ block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
sinfo->bytes_reserved + sinfo->bytes_readonly +
@@ -3845,22 +4867,22 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (sinfo->total_bytes > num_bytes) {
num_bytes = sinfo->total_bytes - num_bytes;
block_rsv->reserved += num_bytes;
- sinfo->bytes_reserved += num_bytes;
+ sinfo->bytes_may_use += num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ sinfo->flags, num_bytes, 1);
}
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
- sinfo->bytes_reserved -= num_bytes;
- sinfo->reservation_progress++;
+ sinfo->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ sinfo->flags, num_bytes, 0);
block_rsv->reserved = block_rsv->size;
block_rsv->full = 1;
}
-#if 0
- printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
- block_rsv->size, block_rsv->reserved);
-#endif
- spin_unlock(&sinfo->lock);
+
spin_unlock(&block_rsv->lock);
+ spin_unlock(&sinfo->lock);
}
static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -3869,79 +4891,55 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
fs_info->chunk_block_rsv.space_info = space_info;
- fs_info->chunk_block_rsv.priority = 10;
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
fs_info->global_block_rsv.space_info = space_info;
- fs_info->global_block_rsv.priority = 10;
- fs_info->global_block_rsv.refill_used = 1;
fs_info->delalloc_block_rsv.space_info = space_info;
fs_info->trans_block_rsv.space_info = space_info;
fs_info->empty_block_rsv.space_info = space_info;
- fs_info->empty_block_rsv.priority = 10;
+ fs_info->delayed_block_rsv.space_info = space_info;
fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+ if (fs_info->quota_root)
+ fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
- btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
-
- btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
-
update_global_block_rsv(fs_info);
}
static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
- block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+ block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+ (u64)-1);
WARN_ON(fs_info->delalloc_block_rsv.size > 0);
WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
WARN_ON(fs_info->trans_block_rsv.size > 0);
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
-}
-
-static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
-{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
- 3 * num_items;
-}
-
-int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- int num_items)
-{
- u64 num_bytes;
- int ret;
-
- if (num_items == 0 || root->fs_info->chunk_root == root)
- return 0;
-
- num_bytes = calc_trans_metadata_size(root, num_items);
- ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
- num_bytes);
- if (!ret) {
- trans->bytes_reserved += num_bytes;
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- }
- return ret;
+ WARN_ON(fs_info->delayed_block_rsv.size > 0);
+ WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ if (!trans->block_rsv)
+ return;
+
if (!trans->bytes_reserved)
return;
- BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
- btrfs_block_rsv_release(root, trans->block_rsv,
- trans->bytes_reserved);
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trans->transid, trans->bytes_reserved, 0);
+ btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
trans->bytes_reserved = 0;
}
+/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
{
@@ -3950,115 +4948,386 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
/*
- * one for deleting orphan item, one for updating inode and
- * two for calling btrfs_truncate_inode_items.
- *
- * btrfs_truncate_inode_items is a delete operation, it frees
- * more space than it uses in most cases. So two units of
- * metadata space should be enough for calling it many times.
- * If all of the metadata space is used, we can commit
- * transaction and use space it freed.
+ * We need to hold space in order to delete our orphan item once we've
+ * added it, so this takes the reservation so we can release it later
+ * when we are truly done with the orphan item.
*/
- u64 num_bytes = calc_trans_metadata_size(root, 4);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 1);
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
void btrfs_orphan_release_metadata(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 num_bytes = calc_trans_metadata_size(root, 4);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 0);
btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending)
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need do reservation
+ * qgroup_reserved: used to return the reserved size in qgroup
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations are different from the
+ * common file/directory operations: they change two fs/file trees
+ * and the root tree, and the number of items that qgroups reserve
+ * differs from the free space reservation. So we can not use
+ * the space reservation mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int items,
+ u64 *qgroup_reserved,
+ bool use_global_rsv)
{
- struct btrfs_root *root = pending->root;
- struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
- struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
- /*
- * two for root back/forward refs, two for directory entries
- * and one for root of the snapshot.
- */
- u64 num_bytes = calc_trans_metadata_size(root, 5);
- dst_rsv->space_info = src_rsv->space_info;
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+ u64 num_bytes;
+ int ret;
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+
+ if (root->fs_info->quota_enabled) {
+ /* One for parent inode, two for dir entries */
+ num_bytes = 3 * root->leafsize;
+ ret = btrfs_qgroup_reserve(root, num_bytes);
+ if (ret)
+ return ret;
+ } else {
+ num_bytes = 0;
+ }
+
+ *qgroup_reserved = num_bytes;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, items);
+ rsv->space_info = __find_space_info(root->fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+ ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+
+ if (ret == -ENOSPC && use_global_rsv)
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+
+ if (ret) {
+ if (*qgroup_reserved)
+ btrfs_qgroup_free(root, *qgroup_reserved);
+ }
+
+ return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 qgroup_reserved)
+{
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ if (qgroup_reserved)
+ btrfs_qgroup_free(root, qgroup_reserved);
}
-static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+/**
+ * drop_outstanding_extent - drop an outstanding extent
+ * @inode: the inode we're dropping the extent for
+ *
+ * This is called when we are freeing up an outstanding extent, either called
+ * after an error or after an extent is written. This will return the number of
+ * reserved extents that need to be freed. This must be called with
+ * BTRFS_I(inode)->lock held.
+ */
+static unsigned drop_outstanding_extent(struct inode *inode)
{
- return num_bytes >>= 3;
+ unsigned drop_inode_space = 0;
+ unsigned dropped_extents = 0;
+
+ BUG_ON(!BTRFS_I(inode)->outstanding_extents);
+ BTRFS_I(inode)->outstanding_extents--;
+
+ if (BTRFS_I(inode)->outstanding_extents == 0 &&
+ test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
+ drop_inode_space = 1;
+
+ /*
+ * If we have at least as many outstanding extents as we have
+ * reserved, then we need to leave the reserved extents count alone.
+ */
+ if (BTRFS_I(inode)->outstanding_extents >=
+ BTRFS_I(inode)->reserved_extents)
+ return drop_inode_space;
+
+ dropped_extents = BTRFS_I(inode)->reserved_extents -
+ BTRFS_I(inode)->outstanding_extents;
+ BTRFS_I(inode)->reserved_extents -= dropped_extents;
+ return dropped_extents + drop_inode_space;
+}
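(Editorial aside, not part of the patch: drop_outstanding_extent() only hands reservation back once the number of outstanding extents falls below what was reserved. A tiny model of that bookkeeping follows; it ignores the extra inode-update reservation tracked via the runtime flag, and a plain struct stands in for btrfs_inode.)

#include <stdio.h>

struct inode_counts {
	unsigned outstanding_extents;
	unsigned reserved_extents;
};

/*
 * Returns how many reserved extents can be released for this completed
 * (or failed) extent; mirrors the outstanding/reserved comparison above.
 */
static unsigned drop_outstanding_extent(struct inode_counts *i)
{
	unsigned dropped;

	i->outstanding_extents--;
	if (i->outstanding_extents >= i->reserved_extents)
		return 0;

	dropped = i->reserved_extents - i->outstanding_extents;
	i->reserved_extents -= dropped;
	return dropped;
}

int main(void)
{
	struct inode_counts inode = { .outstanding_extents = 3,
				      .reserved_extents = 3 };

	for (int n = 0; n < 3; n++)
		printf("dropped %u\n", drop_outstanding_extent(&inode));
	return 0;
}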
+
+/**
+ * calc_csum_metadata_size - return the amount of metadata space that must be
+ * reserved/free'd for the given bytes.
+ * @inode: the inode we're manipulating
+ * @num_bytes: the number of bytes in question
+ * @reserve: 1 if we are reserving space, 0 if we are freeing space
+ *
+ * This adjusts the number of csum_bytes in the inode and then returns the
+ * correct amount of metadata that must either be reserved or freed. We
+ * calculate how many checksums we can fit into one leaf and then divide the
+ * number of bytes that will need to be checksummed by this value to figure out
+ * how many checksums will be required. If we are adding bytes then the number
+ * may go up and we will return the number of additional bytes that must be
+ * reserved. If it is going down we will return the number of bytes that must
+ * be freed.
+ *
+ * This must be called with BTRFS_I(inode)->lock held.
+ */
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
+ int reserve)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 csum_size;
+ int num_csums_per_leaf;
+ int num_csums;
+ int old_csums;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
+ BTRFS_I(inode)->csum_bytes == 0)
+ return 0;
+
+ old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+ if (reserve)
+ BTRFS_I(inode)->csum_bytes += num_bytes;
+ else
+ BTRFS_I(inode)->csum_bytes -= num_bytes;
+ csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+ num_csums_per_leaf = (int)div64_u64(csum_size,
+ sizeof(struct btrfs_csum_item) +
+ sizeof(struct btrfs_disk_key));
+ num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+ num_csums = num_csums + num_csums_per_leaf - 1;
+ num_csums = num_csums / num_csums_per_leaf;
+
+ old_csums = old_csums + num_csums_per_leaf - 1;
+ old_csums = old_csums / num_csums_per_leaf;
+
+ /* No change, no need to reserve more */
+ if (old_csums == num_csums)
+ return 0;
+
+ if (reserve)
+ return btrfs_calc_trans_metadata_size(root,
+ num_csums - old_csums);
+
+ return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
}
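(Editorial aside, not part of the patch: the checksum accounting above only changes the reservation when the rounded-up count of csum leaves changes. A standalone sketch of the before/after calculation; the leaf capacity and per-leaf metadata cost are made-up round numbers, not the real on-disk sizes.)

#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE	4096ULL
#define CSUMS_PER_LEAF	400		/* made-up leaf capacity */
#define BYTES_PER_LEAF	(16 * 1024ULL)	/* made-up metadata cost per leaf */

/* How many csum leaves are needed to checksum 'bytes' of data. */
static uint64_t csum_leaves(uint64_t bytes)
{
	uint64_t csums = bytes / SECTORSIZE;

	return (csums + CSUMS_PER_LEAF - 1) / CSUMS_PER_LEAF;
}

/* Extra metadata to reserve when csum_bytes grows from old_bytes to new_bytes. */
static uint64_t csum_metadata_delta(uint64_t old_bytes, uint64_t new_bytes)
{
	uint64_t old_leaves = csum_leaves(old_bytes);
	uint64_t new_leaves = csum_leaves(new_bytes);

	return new_leaves > old_leaves ?
		(new_leaves - old_leaves) * BYTES_PER_LEAF : 0;
}

int main(void)
{
	uint64_t m = 1024ULL * 1024;

	/* 1 MiB -> 1.5 MiB of dirty data stays within one csum leaf: no cost. */
	printf("%llu\n", (unsigned long long)
	       csum_metadata_delta(1 * m, 1 * m + 512 * 1024));
	/* 1 MiB -> 16 MiB needs several more leaves' worth of reservation. */
	printf("%llu\n", (unsigned long long)csum_metadata_delta(1 * m, 16 * m));
	return 0;
}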
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
- u64 to_reserve;
- int nr_extents;
- int ret;
+ u64 to_reserve = 0;
+ u64 csum_bytes;
+ unsigned nr_extents = 0;
+ int extra_reserve = 0;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+ int ret = 0;
+ bool delalloc_lock = true;
+ u64 to_free = 0;
+ unsigned dropped;
+
+ /* If we are a free space inode we need to not flush since we will be in
+ * the middle of a transaction commit. We also don't need the delalloc
+ * mutex since we won't race with anybody. We need this mostly to make
+ * lockdep shut its filthy mouth.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ delalloc_lock = false;
+ }
- if (btrfs_transaction_in_commit(root->fs_info))
+ if (flush != BTRFS_RESERVE_NO_FLUSH &&
+ btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
+ if (delalloc_lock)
+ mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
num_bytes = ALIGN(num_bytes, root->sectorsize);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
- if (nr_extents > BTRFS_I(inode)->reserved_extents) {
- nr_extents -= BTRFS_I(inode)->reserved_extents;
- to_reserve = calc_trans_metadata_size(root, nr_extents);
- } else {
- nr_extents = 0;
- to_reserve = 0;
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+
+ if (BTRFS_I(inode)->outstanding_extents >
+ BTRFS_I(inode)->reserved_extents)
+ nr_extents = BTRFS_I(inode)->outstanding_extents -
+ BTRFS_I(inode)->reserved_extents;
+
+ /*
+ * Add an item to reserve for updating the inode when we complete the
+ * delalloc io.
+ */
+ if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
+ nr_extents++;
+ extra_reserve = 1;
+ }
+
+ to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+ to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+ csum_bytes = BTRFS_I(inode)->csum_bytes;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ if (root->fs_info->quota_enabled) {
+ ret = btrfs_qgroup_reserve(root, num_bytes +
+ nr_extents * root->leafsize);
+ if (ret)
+ goto out_fail;
+ }
+
+ ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ if (unlikely(ret)) {
+ if (root->fs_info->quota_enabled)
+ btrfs_qgroup_free(root, num_bytes +
+ nr_extents * root->leafsize);
+ goto out_fail;
}
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- to_reserve += calc_csum_metadata_size(inode, num_bytes);
- ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
- if (ret)
- return ret;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (extra_reserve) {
+ set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags);
+ nr_extents--;
+ }
BTRFS_I(inode)->reserved_extents += nr_extents;
- atomic_inc(&BTRFS_I(inode)->outstanding_extents);
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ spin_unlock(&BTRFS_I(inode)->lock);
- block_rsv_add_bytes(block_rsv, to_reserve, 1);
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
- if (block_rsv->size > 512 * 1024 * 1024)
- shrink_delalloc(NULL, root, to_reserve, 0);
+ if (to_reserve)
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_reserve, 1);
+ block_rsv_add_bytes(block_rsv, to_reserve, 1);
return 0;
+
+out_fail:
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode);
+ /*
+ * If the inodes csum_bytes is the same as the original
+ * csum_bytes then we know we haven't raced with any free()ers
+ * so we can just reduce our inodes csum bytes and carry on.
+ */
+ if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
+ calc_csum_metadata_size(inode, num_bytes, 0);
+ } else {
+ u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
+ u64 bytes;
+
+ /*
+ * This is tricky, but first we need to figure out how much we
+ * free'd from any free-ers that occurred during this
+ * reservation, so we reset ->csum_bytes to the csum_bytes
+ * before we dropped our lock, and then call the free for the
+ * number of bytes that were freed while we were trying our
+ * reservation.
+ */
+ bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
+ BTRFS_I(inode)->csum_bytes = csum_bytes;
+ to_free = calc_csum_metadata_size(inode, bytes, 0);
+
+
+ /*
+ * Now we need to see how much we would have freed had we not
+ * been making this reservation and our ->csum_bytes were not
+ * artificially inflated.
+ */
+ BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
+ bytes = csum_bytes - orig_csum_bytes;
+ bytes = calc_csum_metadata_size(inode, bytes, 0);
+
+ /*
+ * Now reset ->csum_bytes to what it should be. If bytes is
+ * more than to_free then we would have free'd more space had we
+ * not had an artificially high ->csum_bytes, so we need to free
+ * the remainder. If bytes is the same or less then we don't
+ * need to do anything, the other free-ers did the correct
+ * thing.
+ */
+ BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
+ if (bytes > to_free)
+ to_free = bytes - to_free;
+ else
+ to_free = 0;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+ if (to_free) {
+ btrfs_block_rsv_release(root, block_rsv, to_free);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
+ }
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ return ret;
}
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for
+ * @num_bytes: the number of bytes we're releasing
+ *
+ * This will release the metadata reservation for an inode. This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations.
+ */
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 to_free;
- int nr_extents;
+ u64 to_free = 0;
+ unsigned dropped;
num_bytes = ALIGN(num_bytes, root->sectorsize);
- atomic_dec(&BTRFS_I(inode)->outstanding_extents);
- WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
-
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
- if (nr_extents < BTRFS_I(inode)->reserved_extents) {
- nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
- BTRFS_I(inode)->reserved_extents -= nr_extents;
- } else {
- nr_extents = 0;
- }
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode);
- to_free = calc_csum_metadata_size(inode, num_bytes);
- if (nr_extents > 0)
- to_free += calc_trans_metadata_size(root, nr_extents);
+ if (num_bytes)
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped > 0)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
+ if (root->fs_info->quota_enabled) {
+ btrfs_qgroup_free(root, num_bytes +
+ dropped * root->leafsize);
+ }
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
}
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * @inode: inode we're writing to
+ * @num_bytes: the number of bytes we want to allocate
+ *
+ * This will do the following things
+ *
+ * o reserve space in the data space info for num_bytes
+ * o reserve space in the metadata space info based on number of outstanding
+ * extents and how much csums will be needed
+ * o add to the inode's ->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *
+ * This will return 0 for success and -ENOSPC if there is no space left.
+ */
int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
{
int ret;
@@ -4076,14 +5345,26 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
return 0;
}
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @num_bytes: the number of bytes we want to free up
+ *
+ * This must be matched with a call to btrfs_delalloc_reserve_space. This is
+ * called in the case that we don't need the metadata AND data reservations
+ * anymore, for example if there is an error or we insert an inline extent.
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ */
void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
{
btrfs_delalloc_release_metadata(inode, num_bytes);
btrfs_free_reserved_data_space(inode, num_bytes);
}
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache = NULL;
@@ -4094,19 +5375,19 @@ static int update_block_group(struct btrfs_trans_handle *trans,
int factor;
/* block accounting for super block */
- spin_lock(&info->delalloc_lock);
- old_val = btrfs_super_bytes_used(&info->super_copy);
+ spin_lock(&info->delalloc_root_lock);
+ old_val = btrfs_super_bytes_used(info->super_copy);
if (alloc)
old_val += num_bytes;
else
old_val -= num_bytes;
- btrfs_set_super_bytes_used(&info->super_copy, old_val);
- spin_unlock(&info->delalloc_lock);
+ btrfs_set_super_bytes_used(info->super_copy, old_val);
+ spin_unlock(&info->delalloc_root_lock);
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache)
- return -1;
+ return -ENOENT;
if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -4120,7 +5401,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && cache->cached == BTRFS_CACHE_NO)
- cache_block_group(cache, trans, NULL, 1);
+ cache_block_group(cache, 1);
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -4128,7 +5409,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
- if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+ if (btrfs_test_opt(root, SPACE_CACHE) &&
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
@@ -4140,7 +5421,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
btrfs_set_block_group_used(&cache->item, old_val);
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
- cache->space_info->reservation_progress++;
cache->space_info->bytes_used += num_bytes;
cache->space_info->disk_used += num_bytes * factor;
spin_unlock(&cache->lock);
@@ -4171,6 +5451,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
struct btrfs_block_group_cache *cache;
u64 bytenr;
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ bytenr = root->fs_info->first_logical_byte;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+
+ if (bytenr < (u64)-1)
+ return bytenr;
+
cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
if (!cache)
return 0;
@@ -4192,13 +5479,14 @@ static int pin_down_extent(struct btrfs_root *root,
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
- cache->space_info->reservation_progress++;
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
set_extent_dirty(root->fs_info->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+ if (reserved)
+ trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
return 0;
}
@@ -4211,7 +5499,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
struct btrfs_block_group_cache *cache;
cache = btrfs_lookup_block_group(root->fs_info, bytenr);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
pin_down_extent(root, cache, bytenr, num_bytes, reserved);
@@ -4220,49 +5508,170 @@ int btrfs_pin_extent(struct btrfs_root *root,
}
/*
- * update size of reserved extents. this function may return -EAGAIN
- * if 'reserve' is true or 'sinfo' is false.
+ * this function must be called within transaction
*/
-static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int sinfo)
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes)
{
- int ret = 0;
- if (sinfo) {
- struct btrfs_space_info *space_info = cache->space_info;
- spin_lock(&space_info->lock);
- spin_lock(&cache->lock);
- if (reserve) {
- if (cache->ro) {
- ret = -EAGAIN;
- } else {
- cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
- }
+ struct btrfs_block_group_cache *cache;
+ int ret;
+
+ cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ if (!cache)
+ return -EINVAL;
+
+ /*
+ * pull in the free space cache (if any) so that our pin
+ * removes the free space from the cache. We have load_only set
+ * to one because the slow code to read in the free extents does check
+ * the pinned extents.
+ */
+ cache_block_group(cache, 1);
+
+ pin_down_extent(root, cache, bytenr, num_bytes, 0);
+
+ /* remove us from the free space cache (if we're there at all) */
+ ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_caching_control *caching_ctl;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, start);
+ if (!block_group)
+ return -EINVAL;
+
+ cache_block_group(block_group, 0);
+ caching_ctl = get_caching_control(block_group);
+
+ if (!caching_ctl) {
+ /* Logic error */
+ BUG_ON(!block_group_cache_done(block_group));
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+ } else {
+ mutex_lock(&caching_ctl->mutex);
+
+ if (start >= caching_ctl->progress) {
+ ret = add_excluded_extent(root, start, num_bytes);
+ } else if (start + num_bytes <= caching_ctl->progress) {
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
} else {
- if (cache->ro)
- space_info->bytes_readonly += num_bytes;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
- space_info->reservation_progress++;
+ num_bytes = caching_ctl->progress - start;
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ if (ret)
+ goto out_lock;
+
+ num_bytes = (start + num_bytes) -
+ caching_ctl->progress;
+ start = caching_ctl->progress;
+ ret = add_excluded_extent(root, start, num_bytes);
}
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
- } else {
- spin_lock(&cache->lock);
+out_lock:
+ mutex_unlock(&caching_ctl->mutex);
+ put_caching_control(caching_ctl);
+ }
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+int btrfs_exclude_logged_extents(struct btrfs_root *log,
+ struct extent_buffer *eb)
+{
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key key;
+ int found_type;
+ int i;
+
+ if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+ return 0;
+
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+ if (found_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+ continue;
+ key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ __exclude_logged_extent(log, key.objectid, key.offset);
+ }
+
+ return 0;
+}
+
+/**
+ * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @num_bytes: The number of bytes in question
+ * @reserve: One of the reservation enums
+ * @delalloc: The blocks are allocated for the delalloc write
+ *
+ * This is called by the allocator when it reserves space, or by somebody who is
+ * freeing space that was never actually used on disk. For example if you
+ * reserve some space for a new leaf in transaction A and before transaction A
+ * commits you free that leaf, you call this with reserve set to 0 in order to
+ * clear the reservation.
+ *
+ * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
+ * ENOSPC accounting. For data we handle the reservation through clearing the
+ * delalloc bits in the io_tree. We have to do this since we could end up
+ * allocating less disk space for the amount of data we have reserved in the
+ * case of compression.
+ *
+ * If this is a reservation and the block group has become read only we cannot
+ * make the reservation and return -EAGAIN, otherwise this function always
+ * succeeds.
+ */
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int delalloc)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve != RESERVE_FREE) {
if (cache->ro) {
ret = -EAGAIN;
} else {
- if (reserve)
- cache->reserved += num_bytes;
- else
- cache->reserved -= num_bytes;
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ if (reserve == RESERVE_ALLOC) {
+ trace_btrfs_space_reservation(cache->fs_info,
+ "space_info", space_info->flags,
+ num_bytes, 0);
+ space_info->bytes_may_use -= num_bytes;
+ }
+
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
}
- spin_unlock(&cache->lock);
+ } else {
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
}
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
return ret;
}
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4270,7 +5679,7 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
@@ -4289,25 +5698,28 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
else
fs_info->pinned_extents = &fs_info->freed_extents[0];
- up_write(&fs_info->extent_commit_sem);
+ up_write(&fs_info->commit_root_sem);
update_global_block_rsv(fs_info);
- return 0;
}
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 len;
+ bool readonly;
while (start <= end) {
+ readonly = false;
if (!cache ||
start >= cache->key.objectid + cache->key.offset) {
if (cache)
btrfs_put_block_group(cache);
cache = btrfs_lookup_block_group(fs_info, start);
- BUG_ON(!cache);
+ BUG_ON(!cache); /* Logic error */
}
len = cache->key.objectid + cache->key.offset - start;
@@ -4319,20 +5731,31 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
}
start += len;
+ space_info = cache->space_info;
- spin_lock(&cache->space_info->lock);
+ spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
- cache->space_info->bytes_pinned -= len;
+ space_info->bytes_pinned -= len;
+ percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
- cache->space_info->bytes_readonly += len;
- } else if (cache->reserved_pinned > 0) {
- len = min(len, cache->reserved_pinned);
- cache->reserved_pinned -= len;
- cache->space_info->bytes_reserved += len;
+ space_info->bytes_readonly += len;
+ readonly = true;
}
spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ if (!readonly && global_rsv->space_info == space_info) {
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ len = min(len, global_rsv->size -
+ global_rsv->reserved);
+ global_rsv->reserved += len;
+ space_info->bytes_may_use += len;
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = 1;
+ }
+ spin_unlock(&global_rsv->lock);
+ }
+ spin_unlock(&space_info->lock);
}
if (cache)
@@ -4345,13 +5768,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *unpin;
- struct btrfs_block_rsv *block_rsv;
- struct btrfs_block_rsv *next_rsv;
u64 start;
u64 end;
- int idx;
int ret;
+ if (trans->aborted)
+ return 0;
+
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
@@ -4359,50 +5782,50 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret)
break;
- ret = btrfs_discard_extent(root, start, end + 1 - start);
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_discard_extent(root, start,
+ end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
unpin_extent_range(root, start, end);
cond_resched();
}
- mutex_lock(&fs_info->durable_block_rsv_mutex);
- list_for_each_entry_safe(block_rsv, next_rsv,
- &fs_info->durable_block_rsv_list, list) {
+ return 0;
+}
- idx = trans->transid & 0x1;
- if (block_rsv->freed[idx] > 0) {
- block_rsv_add_bytes(block_rsv,
- block_rsv->freed[idx], 0);
- block_rsv->freed[idx] = 0;
- }
- if (atomic_read(&block_rsv->usage) == 0) {
- btrfs_block_rsv_release(root, block_rsv, (u64)-1);
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
+ u64 owner, u64 root_objectid)
+{
+ struct btrfs_space_info *space_info;
+ u64 flags;
- if (block_rsv->freed[0] == 0 &&
- block_rsv->freed[1] == 0) {
- list_del_init(&block_rsv->list);
- kfree(block_rsv);
- }
- } else {
- btrfs_block_rsv_release(root, block_rsv, 0);
- }
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ } else {
+ flags = BTRFS_BLOCK_GROUP_DATA;
}
- mutex_unlock(&fs_info->durable_block_rsv_mutex);
- return 0;
+ space_info = __find_space_info(fs_info, flags);
+ BUG_ON(!space_info); /* Logic bug */
+ percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
}
+
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -4418,6 +5841,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int num_to_del = 1;
u32 item_size;
u64 refs;
+ int last_ref = 0;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
+
+ if (!info->quota_enabled || !is_fstree(root_objectid))
+ no_quota = 1;
path = btrfs_alloc_path();
if (!path)
@@ -4429,6 +5859,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
BUG_ON(!is_data && refs_to_drop != 1);
+ if (is_data)
+ skinny_metadata = 0;
+
ret = lookup_extent_backref(trans, extent_root, path, &iref,
bytenr, num_bytes, parent,
root_objectid, owner_objectid,
@@ -4445,6 +5878,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
found_extent = 1;
break;
}
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ key.offset == owner_objectid) {
+ found_extent = 1;
+ break;
+ }
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
@@ -4458,36 +5896,73 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(iref);
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
- is_data);
- BUG_ON(ret);
- btrfs_release_path(extent_root, path);
+ is_data, &last_ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
path->leave_spinning = 1;
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
+ if (!is_data && skinny_metadata) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = owner_objectid;
+ }
+
ret = btrfs_search_slot(trans, extent_root,
&key, path, -1, 1);
+ if (ret > 0 && skinny_metadata && path->slots[0]) {
+ /*
+ * Couldn't find our skinny metadata item,
+ * see if we have ye olde extent item.
+ */
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes)
+ ret = 0;
+ }
+
+ if (ret > 0 && skinny_metadata) {
+ skinny_metadata = false;
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, extent_root,
+ &key, path, -1, 1);
+ }
+
if (ret) {
- printk(KERN_ERR "umm, got %d back from search"
- ", was looking for %llu\n", ret,
- (unsigned long long)bytenr);
- btrfs_print_leaf(extent_root, path->nodes[0]);
+ btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+ ret, bytenr);
+ if (ret > 0)
+ btrfs_print_leaf(extent_root,
+ path->nodes[0]);
+ }
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
}
- BUG_ON(ret);
extent_slot = path->slots[0];
}
- } else {
+ } else if (WARN_ON(ret == -ENOENT)) {
btrfs_print_leaf(extent_root, path->nodes[0]);
- WARN_ON(1);
- printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
- "parent %llu root %llu owner %llu offset %llu\n",
- (unsigned long long)bytenr,
- (unsigned long long)parent,
- (unsigned long long)root_objectid,
- (unsigned long long)owner_objectid,
- (unsigned long long)owner_offset);
+ btrfs_err(info,
+ "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
+ bytenr, parent, root_objectid, owner_objectid,
+ owner_offset);
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ } else {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
}
leaf = path->nodes[0];
@@ -4497,9 +5972,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(found_extent || extent_slot != path->slots[0]);
ret = convert_extent_item_v0(trans, extent_root, path,
owner_objectid, 0);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
- btrfs_release_path(extent_root, path);
+ btrfs_release_path(path);
path->leave_spinning = 1;
key.objectid = bytenr;
@@ -4509,12 +5987,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, extent_root, &key, path,
-1, 1);
if (ret) {
- printk(KERN_ERR "umm, got %d back from search"
- ", was looking for %llu\n", ret,
- (unsigned long long)bytenr);
+ btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+ ret, bytenr);
btrfs_print_leaf(extent_root, path->nodes[0]);
}
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
extent_slot = path->slots[0];
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -4523,7 +6004,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(leaf, extent_slot,
struct btrfs_extent_item);
- if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
+ key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
bi = (struct btrfs_tree_block_info *)(ei + 1);
@@ -4531,10 +6013,17 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
refs = btrfs_extent_refs(leaf, ei);
- BUG_ON(refs < refs_to_drop);
+ if (refs < refs_to_drop) {
+ btrfs_err(info, "trying to drop %d refs but we only have %Lu "
+ "for bytenr %Lu", refs_to_drop, refs, bytenr);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
refs -= refs_to_drop;
if (refs > 0) {
+ type = BTRFS_QGROUP_OPER_SUB_SHARED;
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
/*
@@ -4550,9 +6039,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
- is_data);
- BUG_ON(ret);
+ is_data, &last_ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
+ add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+ root_objectid);
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
@@ -4566,23 +6060,44 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ last_ref = 1;
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- BUG_ON(ret);
- btrfs_release_path(extent_root, path);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
- BUG_ON(ret);
- } else {
- invalidate_mapping_pages(info->btree_inode->i_mapping,
- bytenr >> PAGE_CACHE_SHIFT,
- (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
- ret = update_block_group(trans, root, bytenr, num_bytes, 0);
- BUG_ON(ret);
+ ret = update_block_group(root, bytenr, num_bytes, 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ }
+ btrfs_release_path(path);
+
+ /* Deal with the quota accounting */
+ if (!ret && last_ref && !no_quota) {
+ int mod_seq = 0;
+
+ if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+ type == BTRFS_QGROUP_OPER_SUB_SHARED)
+ mod_seq = 1;
+
+ ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
+ bytenr, num_bytes, type,
+ mod_seq);
}
+out:
btrfs_free_path(path);
return ret;
}
@@ -4598,30 +6113,22 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_node *ref;
- struct rb_node *node;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(trans, bytenr);
if (!head)
- goto out;
-
- node = rb_prev(&head->node.rb_node);
- if (!node)
- goto out;
-
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ goto out_delayed_unlock;
- /* there are still entries for this ref, we can't drop it */
- if (ref->bytenr == bytenr)
+ spin_lock(&head->lock);
+ if (rb_first(&head->ref_root))
goto out;
if (head->extent_op) {
if (!head->must_insert_reserved)
goto out;
- kfree(head->extent_op);
+ btrfs_free_delayed_extent_op(head->extent_op);
head->extent_op = NULL;
}
@@ -4637,19 +6144,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
* ahead and process it.
*/
head->node.in_tree = 0;
- rb_erase(&head->node.rb_node, &delayed_refs->root);
+ rb_erase(&head->href_node, &delayed_refs->href_root);
- delayed_refs->num_entries--;
+ atomic_dec(&delayed_refs->num_entries);
/*
* we don't take a ref on the node because we're removing it from the
* tree, so we just steal the ref the tree was holding.
*/
delayed_refs->num_heads--;
- if (list_empty(&head->cluster))
+ if (head->processing == 0)
delayed_refs->num_heads_ready--;
-
- list_del_init(&head->cluster);
+ head->processing = 0;
+ spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
BUG_ON(head->extent_op);
@@ -4660,6 +6167,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
btrfs_put_delayed_ref(&head->node);
return ret;
out:
+ spin_unlock(&head->lock);
+
+out_delayed_unlock:
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -4669,90 +6179,70 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
- struct btrfs_block_rsv *block_rsv;
struct btrfs_block_group_cache *cache = NULL;
+ int pin = 1;
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
- parent, root->root_key.objectid,
- btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL);
- BUG_ON(ret);
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ buf->start, buf->len,
+ parent, root->root_key.objectid,
+ btrfs_header_level(buf),
+ BTRFS_DROP_DELAYED_REF, NULL, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
if (!last_ref)
return;
- block_rsv = get_block_rsv(trans, root);
cache = btrfs_lookup_block_group(root->fs_info, buf->start);
- if (block_rsv->space_info != cache->space_info)
- goto out;
if (btrfs_header_generation(buf) == trans->transid) {
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, root, buf->start);
if (!ret)
- goto pin;
+ goto out;
}
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
pin_down_extent(root, cache, buf->start, buf->len, 1);
- goto pin;
+ goto out;
}
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- ret = update_reserved_bytes(cache, buf->len, 0, 0);
- if (ret == -EAGAIN) {
- /* block group became read-only */
- update_reserved_bytes(cache, buf->len, 0, 1);
- goto out;
- }
-
- ret = 1;
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved < block_rsv->size) {
- block_rsv->reserved += buf->len;
- ret = 0;
- }
- spin_unlock(&block_rsv->lock);
-
- if (ret) {
- spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_reserved -= buf->len;
- cache->space_info->reservation_progress++;
- spin_unlock(&cache->space_info->lock);
- }
- goto out;
- }
-pin:
- if (block_rsv->durable && !cache->ro) {
- ret = 0;
- spin_lock(&cache->lock);
- if (!cache->ro) {
- cache->reserved_pinned += buf->len;
- ret = 1;
- }
- spin_unlock(&cache->lock);
-
- if (ret) {
- spin_lock(&block_rsv->lock);
- block_rsv->freed[trans->transid & 0x1] += buf->len;
- spin_unlock(&block_rsv->lock);
- }
+ btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+ trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
+ pin = 0;
}
out:
+ if (pin)
+ add_pinned_bytes(root->fs_info, buf->len,
+ btrfs_header_level(buf),
+ root->root_key.objectid);
+
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't matter
+ * anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
btrfs_put_block_group(cache);
}
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset)
+/* Can return -ENOMEM */
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int no_quota)
{
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
+ add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
/*
* tree log blocks never actually go into the extent allocation
@@ -4764,23 +6254,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_pin_extent(root, bytenr, num_bytes, 1);
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL);
- BUG_ON(ret);
+ BTRFS_DROP_DELAYED_REF, NULL, no_quota);
} else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
- parent, root_objectid, owner,
- offset, BTRFS_DROP_DELAYED_REF, NULL);
- BUG_ON(ret);
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
+ parent, root_objectid, owner,
+ offset, BTRFS_DROP_DELAYED_REF,
+ NULL, no_quota);
}
return ret;
}
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache,
+ u64 val, u64 num_bytes)
{
- u64 mask = ((u64)root->stripesize - 1);
- u64 ret = (val + mask) & ~mask;
+ u64 ret = ALIGN(val, root->stripesize);
return ret;
}
@@ -4794,106 +6286,198 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
* for our min num_bytes. Another option is to have it go ahead
* and look in the rbtree for a free extent of a given size, but this
* is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
*/
-static noinline int
+static noinline void
wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
- DEFINE_WAIT(wait);
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
- return 0;
+ return;
wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
- (cache->free_space >= num_bytes));
+ (cache->free_space_ctl->free_space >= num_bytes));
put_caching_control(caching_ctl);
- return 0;
}
static noinline int
wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
{
struct btrfs_caching_control *caching_ctl;
- DEFINE_WAIT(wait);
+ int ret = 0;
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
- return 0;
+ return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
wait_event(caching_ctl->wait, block_group_cache_done(cache));
-
+ if (cache->cached == BTRFS_CACHE_ERROR)
+ ret = -EIO;
put_caching_control(caching_ctl);
- return 0;
+ return ret;
}
-static int get_block_group_index(struct btrfs_block_group_cache *cache)
+int __get_raid_index(u64 flags)
{
- int index;
- if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
- index = 0;
- else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
- index = 1;
- else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
- index = 2;
- else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
- index = 3;
- else
- index = 4;
- return index;
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
+ return BTRFS_RAID_RAID10;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+ return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return BTRFS_RAID_DUP;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
+
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
+int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+ return __get_raid_index(cache->flags);
+}
+
+static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = "raid10",
+ [BTRFS_RAID_RAID1] = "raid1",
+ [BTRFS_RAID_DUP] = "dup",
+ [BTRFS_RAID_RAID0] = "raid0",
+ [BTRFS_RAID_SINGLE] = "single",
+ [BTRFS_RAID_RAID5] = "raid5",
+ [BTRFS_RAID_RAID6] = "raid6",
+};
+
+static const char *get_raid_name(enum btrfs_raid_types type)
+{
+ if (type >= BTRFS_NR_RAID_TYPES)
+ return NULL;
+
+ return btrfs_raid_type_names[type];
}
enum btrfs_loop_type {
- LOOP_FIND_IDEAL = 0,
- LOOP_CACHING_NOWAIT = 1,
- LOOP_CACHING_WAIT = 2,
- LOOP_ALLOC_CHUNK = 3,
- LOOP_NO_EMPTY_SIZE = 4,
+ LOOP_CACHING_NOWAIT = 0,
+ LOOP_CACHING_WAIT = 1,
+ LOOP_ALLOC_CHUNK = 2,
+ LOOP_NO_EMPTY_SIZE = 3,
};
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ btrfs_get_block_group(cache);
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ int delalloc)
+{
+ struct btrfs_block_group_cache *used_bg;
+ bool locked = false;
+again:
+ spin_lock(&cluster->refill_lock);
+ if (locked) {
+ if (used_bg == cluster->block_group)
+ return used_bg;
+
+ up_read(&used_bg->data_rwsem);
+ btrfs_put_block_group(used_bg);
+ }
+
+ used_bg = cluster->block_group;
+ if (!used_bg)
+ return NULL;
+
+ if (used_bg == block_group)
+ return used_bg;
+
+ btrfs_get_block_group(used_bg);
+
+ if (!delalloc)
+ return used_bg;
+
+ if (down_read_trylock(&used_bg->data_rwsem))
+ return used_bg;
+
+ spin_unlock(&cluster->refill_lock);
+ down_read(&used_bg->data_rwsem);
+ locked = true;
+ goto again;
+}
+
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ up_read(&cache->data_rwsem);
+ btrfs_put_block_group(cache);
+}
+
/*
 * walks the btree of allocated extents and finds a hole of a given size.
* The key ins is changed to record the hole:
- * ins->objectid == block start
+ * ins->objectid == start position
* ins->flags = BTRFS_EXTENT_ITEM_KEY
- * ins->offset == number of blocks
+ * ins->offset == the size of the hole.
* Any available blocks before search_start are skipped.
+ *
+ * If there is no suitable free space, we record the size of the largest
+ * free space extent currently available.
*/
-static noinline int find_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *orig_root,
+static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
- u64 search_start, u64 search_end,
u64 hint_byte, struct btrfs_key *ins,
- int data)
+ u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
+ u64 search_start = 0;
+ u64 max_extent_size = 0;
int empty_cluster = 2 * 1024 * 1024;
- int allowed_chunk_alloc = 0;
- int done_chunk_alloc = 0;
struct btrfs_space_info *space_info;
- int last_ptr_loop = 0;
int loop = 0;
- int index = 0;
- bool found_uncached_bg = false;
+ int index = __get_raid_index(flags);
+ int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
+ RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
- u64 ideal_cache_percent = 0;
- u64 ideal_cache_offset = 0;
+ bool have_caching_bg = false;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
ins->objectid = 0;
ins->offset = 0;
- space_info = __find_space_info(root->fs_info, data);
+ trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
+
+ space_info = __find_space_info(root->fs_info, flags);
if (!space_info) {
- printk(KERN_ERR "No space info for %d\n", data);
+ btrfs_err(root->fs_info, "No space info for %llu", flags);
return -ENOSPC;
}
@@ -4904,16 +6488,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
if (btrfs_mixed_space_info(space_info))
use_cluster = false;
- if (orig_root->ref_cows || empty_size)
- allowed_chunk_alloc = 1;
-
- if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
+ if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
last_ptr = &root->fs_info->meta_alloc_cluster;
if (!btrfs_test_opt(root, SSD))
empty_cluster = 64 * 1024;
}
- if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+ if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
btrfs_test_opt(root, SSD)) {
last_ptr = &root->fs_info->data_alloc_cluster;
}
@@ -4932,7 +6513,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
empty_cluster = 0;
if (search_start == hint_byte) {
-ideal_cache:
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
/*
@@ -4942,9 +6522,8 @@ ideal_cache:
* However if we are re-searching with an ideal block group
* picked out then we don't care that the block group is cached.
*/
- if (block_group && block_group_bits(block_group, data) &&
- (block_group->cached != BTRFS_CACHE_NO ||
- search_start == ideal_cache_offset)) {
+ if (block_group && block_group_bits(block_group, flags) &&
+ block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
block_group->ro) {
@@ -4958,6 +6537,7 @@ ideal_cache:
up_read(&space_info->groups_sem);
} else {
index = get_block_group_index(block_group);
+ btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -4965,13 +6545,14 @@ ideal_cache:
}
}
search:
+ have_caching_bg = false;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
list) {
u64 offset;
int cached;
- btrfs_get_block_group(block_group);
+ btrfs_grab_block_group(block_group, delalloc);
search_start = block_group->key.objectid;
/*
@@ -4979,9 +6560,11 @@ search:
* raid types, but we want to make sure we only allocate
* for the proper type.
*/
- if (!block_group_bits(block_group, data)) {
+ if (!block_group_bits(block_group, flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10;
/*
@@ -4989,142 +6572,129 @@ search:
* doesn't provide them, bail. This does allow us to
* fill raid0 from raid1.
*/
- if ((data & extra) && !(block_group->flags & extra))
+ if ((flags & extra) && !(block_group->flags & extra))
goto loop;
}
have_block_group:
- if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
- u64 free_percent;
-
- ret = cache_block_group(block_group, trans,
- orig_root, 1);
- if (block_group->cached == BTRFS_CACHE_FINISHED)
- goto have_block_group;
-
- free_percent = btrfs_block_group_used(&block_group->item);
- free_percent *= 100;
- free_percent = div64_u64(free_percent,
- block_group->key.offset);
- free_percent = 100 - free_percent;
- if (free_percent > ideal_cache_percent &&
- likely(!block_group->ro)) {
- ideal_cache_offset = block_group->key.objectid;
- ideal_cache_percent = free_percent;
- }
-
- /*
- * We only want to start kthread caching if we are at
- * the point where we will wait for caching to make
- * progress, or if our ideal search is over and we've
- * found somebody to start caching.
- */
- if (loop > LOOP_CACHING_NOWAIT ||
- (loop > LOOP_FIND_IDEAL &&
- atomic_read(&space_info->caching_threads) < 2)) {
- ret = cache_block_group(block_group, trans,
- orig_root, 0);
- BUG_ON(ret);
- }
- found_uncached_bg = true;
-
- /*
- * If loop is set for cached only, try the next block
- * group.
- */
- if (loop == LOOP_FIND_IDEAL)
- goto loop;
- }
-
cached = block_group_cache_done(block_group);
- if (unlikely(!cached))
- found_uncached_bg = true;
+ if (unlikely(!cached)) {
+ ret = cache_block_group(block_group, 0);
+ BUG_ON(ret < 0);
+ ret = 0;
+ }
+ if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+ goto loop;
if (unlikely(block_group->ro))
goto loop;
/*
- * Ok we want to try and use the cluster allocator, so lets look
- * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
- * have tried the cluster allocator plenty of times at this
- * point and not have found anything, so we are likely way too
- * fragmented for the clustering stuff to find anything, so lets
- * just skip it and let the allocator find whatever block it can
- * find
+ * Ok we want to try and use the cluster allocator, so
+		 * let's look there
*/
- if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+ if (last_ptr) {
+ struct btrfs_block_group_cache *used_block_group;
+ unsigned long aligned_cluster;
/*
* the refill lock keeps out other
* people trying to start a new cluster
*/
- spin_lock(&last_ptr->refill_lock);
- if (last_ptr->block_group &&
- (last_ptr->block_group->ro ||
- !block_group_bits(last_ptr->block_group, data))) {
- offset = 0;
+ used_block_group = btrfs_lock_cluster(block_group,
+ last_ptr,
+ delalloc);
+ if (!used_block_group)
goto refill_cluster;
- }
- offset = btrfs_alloc_from_cluster(block_group, last_ptr,
- num_bytes, search_start);
+ if (used_block_group != block_group &&
+ (used_block_group->ro ||
+ !block_group_bits(used_block_group, flags)))
+ goto release_cluster;
+
+ offset = btrfs_alloc_from_cluster(used_block_group,
+ last_ptr,
+ num_bytes,
+ used_block_group->key.objectid,
+ &max_extent_size);
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ used_block_group,
+ search_start, num_bytes);
+ if (used_block_group != block_group) {
+ btrfs_release_block_group(block_group,
+ delalloc);
+ block_group = used_block_group;
+ }
goto checks;
}
- spin_lock(&last_ptr->lock);
- /*
- * whoops, this cluster doesn't actually point to
- * this block group. Get a ref on the block
- * group is does point to and try again
- */
- if (!last_ptr_loop && last_ptr->block_group &&
- last_ptr->block_group != block_group) {
-
- btrfs_put_block_group(block_group);
- block_group = last_ptr->block_group;
- btrfs_get_block_group(block_group);
- spin_unlock(&last_ptr->lock);
+ WARN_ON(last_ptr->block_group != used_block_group);
+release_cluster:
+ /* If we are on LOOP_NO_EMPTY_SIZE, we can't
+			 * set up new clusters, so let's just skip it
+ * and let the allocator find whatever block
+ * it can find. If we reach this point, we
+ * will have tried the cluster allocator
+ * plenty of times and not have found
+ * anything, so we are likely way too
+ * fragmented for the clustering stuff to find
+ * anything.
+ *
+ * However, if the cluster is taken from the
+ * current block group, release the cluster
+ * first, so that we stand a better chance of
+ * succeeding in the unclustered
+ * allocation. */
+ if (loop >= LOOP_NO_EMPTY_SIZE &&
+ used_block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
-
- last_ptr_loop = 1;
- search_start = block_group->key.objectid;
- /*
- * we know this block group is properly
- * in the list because
- * btrfs_remove_block_group, drops the
- * cluster before it removes the block
- * group from the list
- */
- goto have_block_group;
+ btrfs_release_block_group(used_block_group,
+ delalloc);
+ goto unclustered_alloc;
}
- spin_unlock(&last_ptr->lock);
-refill_cluster:
+
/*
* this cluster didn't work out, free it and
* start over
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
- last_ptr_loop = 0;
+ if (used_block_group != block_group)
+ btrfs_release_block_group(used_block_group,
+ delalloc);
+refill_cluster:
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
+ aligned_cluster = max_t(unsigned long,
+ empty_cluster + empty_size,
+ block_group->full_stripe_len);
/* allocate a cluster in this block group */
- ret = btrfs_find_space_cluster(trans, root,
- block_group, last_ptr,
- offset, num_bytes,
- empty_cluster + empty_size);
+ ret = btrfs_find_space_cluster(root, block_group,
+ last_ptr, search_start,
+ num_bytes,
+ aligned_cluster);
if (ret == 0) {
/*
* now pull our allocation out of this
* cluster
*/
offset = btrfs_alloc_from_cluster(block_group,
- last_ptr, num_bytes,
- search_start);
+ last_ptr,
+ num_bytes,
+ search_start,
+ &max_extent_size);
if (offset) {
/* we found one, proceed */
spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ block_group, search_start,
+ num_bytes);
goto checks;
}
} else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5148,8 +6718,23 @@ refill_cluster:
goto loop;
}
+unclustered_alloc:
+ spin_lock(&block_group->free_space_ctl->tree_lock);
+ if (cached &&
+ block_group->free_space_ctl->free_space <
+ num_bytes + empty_cluster + empty_size) {
+ if (block_group->free_space_ctl->free_space >
+ max_extent_size)
+ max_extent_size =
+ block_group->free_space_ctl->free_space;
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+ goto loop;
+ }
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+
offset = btrfs_find_space_for_alloc(block_group, search_start,
- num_bytes, empty_size);
+ num_bytes, empty_size,
+ &max_extent_size);
/*
* If we didn't find a chunk, and we haven't failed on this
* block group before, and this block group is in the middle of
@@ -5166,15 +6751,13 @@ refill_cluster:
failed_alloc = true;
goto have_block_group;
} else if (!offset) {
+ if (!cached)
+ have_caching_bg = true;
goto loop;
}
checks:
- search_start = stripe_align(root, offset);
- /* move on to the next group */
- if (search_start + num_bytes >= search_end) {
- btrfs_add_free_space(block_group, offset, num_bytes);
- goto loop;
- }
+ search_start = stripe_align(root, block_group,
+ offset, num_bytes);
/* move on to the next group */
if (search_start + num_bytes >
@@ -5183,16 +6766,13 @@ checks:
goto loop;
}
- ins->objectid = search_start;
- ins->offset = num_bytes;
-
if (offset < search_start)
btrfs_add_free_space(block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
- ret = update_reserved_bytes(block_group, num_bytes, 1,
- (data & BTRFS_BLOCK_GROUP_DATA));
+ ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+ alloc_type, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -5202,25 +6782,25 @@ checks:
ins->objectid = search_start;
ins->offset = num_bytes;
- if (offset < search_start)
- btrfs_add_free_space(block_group, offset,
- search_start - offset);
- BUG_ON(offset > search_start);
+ trace_btrfs_reserve_extent(orig_root, block_group,
+ search_start, num_bytes);
+ btrfs_release_block_group(block_group, delalloc);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
}
up_read(&space_info->groups_sem);
+ if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+ goto search;
+
if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
goto search;
- /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
- * for them to make caching progress. Also
- * determine the best possible bg to cache
+ /*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
@@ -5228,85 +6808,55 @@ loop:
* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
* again
*/
- if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
- (found_uncached_bg || empty_size || empty_cluster ||
- allowed_chunk_alloc)) {
+ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
- found_uncached_bg = false;
- loop++;
- if (!ideal_cache_percent &&
- atomic_read(&space_info->caching_threads))
- goto search;
+ loop++;
+ if (loop == LOOP_ALLOC_CHUNK) {
+ struct btrfs_trans_handle *trans;
+ int exist = 0;
+ trans = current->journal_info;
+ if (trans)
+ exist = 1;
+ else
+ trans = btrfs_join_transaction(root);
+
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = do_chunk_alloc(trans, root, flags,
+ CHUNK_ALLOC_FORCE);
/*
- * 1 of the following 2 things have happened so far
- *
- * 1) We found an ideal block group for caching that
- * is mostly full and will cache quickly, so we might
- * as well wait for it.
- *
- * 2) We searched for cached only and we didn't find
- * anything, and we didn't start any caching kthreads
- * either, so chances are we will loop through and
- * start a couple caching kthreads, and then come back
- * around and just wait for them. This will be slower
- * because we will have 2 caching kthreads reading at
- * the same time when we could have just started one
- * and waited for it to get far enough to give us an
- * allocation, so go ahead and go to the wait caching
- * loop.
- */
- loop = LOOP_CACHING_WAIT;
- search_start = ideal_cache_offset;
- ideal_cache_percent = 0;
- goto ideal_cache;
- } else if (loop == LOOP_FIND_IDEAL) {
- /*
- * Didn't find a uncached bg, wait on anything we find
- * next.
+ * Do not bail out on ENOSPC since we
+ * can do more things.
*/
- loop = LOOP_CACHING_WAIT;
- goto search;
- }
-
- if (loop < LOOP_CACHING_WAIT) {
- loop++;
- goto search;
+ if (ret < 0 && ret != -ENOSPC)
+ btrfs_abort_transaction(trans,
+ root, ret);
+ else
+ ret = 0;
+ if (!exist)
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
}
- if (loop == LOOP_ALLOC_CHUNK) {
+ if (loop == LOOP_NO_EMPTY_SIZE) {
empty_size = 0;
empty_cluster = 0;
}
- if (allowed_chunk_alloc) {
- ret = do_chunk_alloc(trans, root, num_bytes +
- 2 * 1024 * 1024, data, 1);
- allowed_chunk_alloc = 0;
- done_chunk_alloc = 1;
- } else if (!done_chunk_alloc) {
- space_info->force_alloc = 1;
- }
-
- if (loop < LOOP_NO_EMPTY_SIZE) {
- loop++;
- goto search;
- }
- ret = -ENOSPC;
+ goto search;
} else if (!ins->objectid) {
ret = -ENOSPC;
- }
-
- /* we found what we needed */
- if (ins->objectid) {
- if (!(data & BTRFS_BLOCK_GROUP_DATA))
- trans->block_group = block_group->key.objectid;
-
- btrfs_put_block_group(block_group);
+ } else if (ins->objectid) {
ret = 0;
}
-
+out:
+ if (ret == -ENOSPC)
+ ins->offset = max_extent_size;
return ret;
}
@@ -5317,19 +6867,16 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int index = 0;
spin_lock(&info->lock);
- printk(KERN_INFO "space_info has %llu free, is %sfull\n",
- (unsigned long long)(info->total_bytes - info->bytes_used -
- info->bytes_pinned - info->bytes_reserved -
- info->bytes_readonly),
+ printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
+ info->flags,
+ info->total_bytes - info->bytes_used - info->bytes_pinned -
+ info->bytes_reserved - info->bytes_readonly,
(info->full) ? "" : "not ");
- printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
+ printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
"reserved=%llu, may_use=%llu, readonly=%llu\n",
- (unsigned long long)info->total_bytes,
- (unsigned long long)info->bytes_used,
- (unsigned long long)info->bytes_pinned,
- (unsigned long long)info->bytes_reserved,
- (unsigned long long)info->bytes_may_use,
- (unsigned long long)info->bytes_readonly);
+ info->total_bytes, info->bytes_used, info->bytes_pinned,
+ info->bytes_reserved, info->bytes_may_use,
+ info->bytes_readonly);
spin_unlock(&info->lock);
if (!dump_block_groups)
@@ -5339,13 +6886,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
again:
list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
- printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
- "%llu pinned %llu reserved\n",
- (unsigned long long)cache->key.objectid,
- (unsigned long long)cache->key.offset,
- (unsigned long long)btrfs_block_group_used(&cache->item),
- (unsigned long long)cache->pinned,
- (unsigned long long)cache->reserved);
+ printk(KERN_INFO "BTRFS: "
+ "block group %llu has %llu bytes, "
+ "%llu used %llu pinned %llu reserved %s\n",
+ cache->key.objectid, cache->key.offset,
+ btrfs_block_group_used(&cache->item), cache->pinned,
+ cache->reserved, cache->ro ? "[readonly]" : "");
btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
}
@@ -5354,73 +6900,85 @@ again:
up_read(&info->groups_sem);
}
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_reserve_extent(struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
+ struct btrfs_key *ins, int is_data, int delalloc)
{
+ bool final_tried = false;
+ u64 flags;
int ret;
- u64 search_start = 0;
- data = btrfs_get_alloc_profile(root, data);
+ flags = btrfs_get_alloc_profile(root, is_data);
again:
- /*
- * the only place that sets empty_size is btrfs_realloc_node, which
- * is not called recursively on allocations
- */
- if (empty_size || root->ref_cows)
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes + 2 * 1024 * 1024, data, 0);
-
WARN_ON(num_bytes < root->sectorsize);
- ret = find_free_extent(trans, root, num_bytes, empty_size,
- search_start, search_end, hint_byte,
- ins, data);
-
- if (ret == -ENOSPC && num_bytes > min_alloc_size) {
- num_bytes = num_bytes >> 1;
- num_bytes = num_bytes & ~(root->sectorsize - 1);
- num_bytes = max(num_bytes, min_alloc_size);
- do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes, data, 1);
- goto again;
- }
- if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
- struct btrfs_space_info *sinfo;
+ ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
+ flags, delalloc);
+
+ if (ret == -ENOSPC) {
+ if (!final_tried && ins->offset) {
+ num_bytes = min(num_bytes >> 1, ins->offset);
+ num_bytes = round_down(num_bytes, root->sectorsize);
+ num_bytes = max(num_bytes, min_alloc_size);
+ if (num_bytes == min_alloc_size)
+ final_tried = true;
+ goto again;
+ } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ struct btrfs_space_info *sinfo;
- sinfo = __find_space_info(root->fs_info, data);
- printk(KERN_ERR "btrfs allocation failed flags %llu, "
- "wanted %llu\n", (unsigned long long)data,
- (unsigned long long)num_bytes);
- dump_space_info(sinfo, num_bytes, 1);
+ sinfo = __find_space_info(root->fs_info, flags);
+ btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
+ flags, num_bytes);
+ if (sinfo)
+ dump_space_info(sinfo, num_bytes, 1);
+ }
}
return ret;
}
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len,
+ int pin, int delalloc)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
cache = btrfs_lookup_block_group(root->fs_info, start);
if (!cache) {
- printk(KERN_ERR "Unable to find block group for %llu\n",
- (unsigned long long)start);
+ btrfs_err(root->fs_info, "Unable to find block group for %llu",
+ start);
return -ENOSPC;
}
- ret = btrfs_discard_extent(root, start, len);
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_discard_extent(root, start, len, NULL);
- btrfs_add_free_space(cache, start, len);
- update_reserved_bytes(cache, len, 0, 1);
+ if (pin)
+ pin_down_extent(root, cache, start, len, 1);
+ else {
+ btrfs_add_free_space(cache, start, len);
+ btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+ }
btrfs_put_block_group(cache);
+ trace_btrfs_reserved_extent_free(root, start, len);
+
return ret;
}
+int btrfs_free_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len, int delalloc)
+{
+ return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
+}
+
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ return __btrfs_free_reserved_extent(root, start, len, 1, 0);
+}
+
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -5444,12 +7002,16 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5478,13 +7040,20 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
- if (ret) {
- printk(KERN_ERR "btrfs update block group failed for %llu "
- "%llu\n", (unsigned long long)ins->objectid,
- (unsigned long long)ins->offset);
+	/* Always set parent to 0 here since it's exclusive anyway. */
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, ins->offset,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+
+ ret = update_block_group(root, ins->objectid, ins->offset, 1);
+ if (ret) { /* -ENOENT, logic error */
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ ins->objectid, ins->offset);
BUG();
}
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
return ret;
}
@@ -5492,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins)
+ int level, struct btrfs_key *ins,
+ int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -5501,15 +7071,30 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
- u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
+ u32 size = sizeof(*extent_item) + sizeof(*iref);
+ u64 num_bytes = ins->offset;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
+
+ if (!skinny_metadata)
+ size += sizeof(*block_info);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+ root->leafsize);
+ return -ENOMEM;
+ }
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+ root->leafsize);
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5518,12 +7103,17 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_set_extent_generation(leaf, extent_item, trans->transid);
btrfs_set_extent_flags(leaf, extent_item,
flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
- block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
- btrfs_set_tree_block_key(leaf, block_info, key);
- btrfs_set_tree_block_level(leaf, block_info, level);
+ if (skinny_metadata) {
+ iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ num_bytes = root->leafsize;
+ } else {
+ block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+ btrfs_set_tree_block_key(leaf, block_info, key);
+ btrfs_set_tree_block_level(leaf, block_info, level);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ }
- iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
if (parent > 0) {
BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
btrfs_set_extent_inline_ref_type(leaf, iref,
@@ -5538,13 +7128,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
- if (ret) {
- printk(KERN_ERR "btrfs update block group failed for %llu "
- "%llu\n", (unsigned long long)ins->objectid,
- (unsigned long long)ins->offset);
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, num_bytes,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+ }
+
+ ret = update_block_group(root, ins->objectid, root->leafsize, 1);
+ if (ret) { /* -ENOENT, logic error */
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ ins->objectid, ins->offset);
BUG();
}
+
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
return ret;
}
@@ -5557,9 +7156,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
- 0, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT, NULL);
+ ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+ ins->offset, 0,
+ root_objectid, owner, offset,
+ BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
return ret;
}
@@ -5575,57 +7175,33 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
- struct btrfs_caching_control *caching_ctl;
- u64 start = ins->objectid;
- u64 num_bytes = ins->offset;
-
- block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- cache_block_group(block_group, trans, NULL, 0);
- caching_ctl = get_caching_control(block_group);
- if (!caching_ctl) {
- BUG_ON(!block_group_cache_done(block_group));
- ret = btrfs_remove_free_space(block_group, start, num_bytes);
- BUG_ON(ret);
- } else {
- mutex_lock(&caching_ctl->mutex);
-
- if (start >= caching_ctl->progress) {
- ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret);
- } else if (start + num_bytes <= caching_ctl->progress) {
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- BUG_ON(ret);
- } else {
- num_bytes = caching_ctl->progress - start;
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- BUG_ON(ret);
-
- start = caching_ctl->progress;
- num_bytes = ins->objectid + ins->offset -
- caching_ctl->progress;
- ret = add_excluded_extent(root, start, num_bytes);
- BUG_ON(ret);
- }
-
- mutex_unlock(&caching_ctl->mutex);
- put_caching_control(caching_ctl);
+ /*
+ * Mixed block groups will exclude before processing the log so we only
+	 * need to do the exclude dance if this fs isn't mixed.
+ */
+ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
+ ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
+ if (ret)
+ return ret;
}
- ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
- BUG_ON(ret);
- btrfs_put_block_group(block_group);
+ block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+ if (!block_group)
+ return -EINVAL;
+
+ ret = btrfs_update_reserved_bytes(block_group, ins->offset,
+ RESERVE_ALLOC_NO_ACCOUNT, 0);
+ BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
+ btrfs_put_block_group(block_group);
return ret;
}
-struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- int level)
+static struct extent_buffer *
+btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 bytenr, u32 blocksize, int level)
{
struct extent_buffer *buf;
@@ -5633,9 +7209,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
if (!buf)
return ERR_PTR(-ENOMEM);
btrfs_set_header_generation(buf, trans->transid);
- btrfs_set_buffer_lockdep_class(buf, level);
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
clean_tree_block(trans, root, buf);
+ clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
btrfs_set_buffer_uptodate(buf);
@@ -5667,53 +7244,58 @@ use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
int ret;
+ bool global_updated = false;
block_rsv = get_block_rsv(trans, root);
- if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(trans, root, block_rsv,
- blocksize, 0);
- /*
- * If we couldn't reserve metadata bytes try and use some from
- * the global reserve.
- */
- if (ret && block_rsv != global_rsv) {
- ret = block_rsv_use_bytes(global_rsv, blocksize);
- if (!ret)
- return global_rsv;
- return ERR_PTR(ret);
- } else if (ret) {
- return ERR_PTR(ret);
- }
+ if (unlikely(block_rsv->size == 0))
+ goto try_reserve;
+again:
+ ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (!ret)
return block_rsv;
+
+ if (block_rsv->failfast)
+ return ERR_PTR(ret);
+
+ if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+ global_updated = true;
+ update_global_block_rsv(root->fs_info);
+ goto again;
}
- ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL * 10,
+ /*DEFAULT_RATELIMIT_BURST*/ 1);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG
+ "BTRFS: block rsv returned %d\n", ret);
+ }
+try_reserve:
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
- if (ret) {
- WARN_ON(1);
- ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
- 0);
- if (!ret) {
- spin_lock(&block_rsv->lock);
- block_rsv->size += blocksize;
- spin_unlock(&block_rsv->lock);
- return block_rsv;
- } else if (ret && block_rsv != global_rsv) {
- ret = block_rsv_use_bytes(global_rsv, blocksize);
- if (!ret)
- return global_rsv;
- }
+ /*
+ * If we couldn't reserve metadata bytes try and use some from
+ * the global reserve if its space type is the same as the global
+ * reservation.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+ block_rsv->space_info == global_rsv->space_info) {
+ ret = block_rsv_use_bytes(global_rsv, blocksize);
+ if (!ret)
+ return global_rsv;
}
-
- return ERR_PTR(-ENOSPC);
+ return ERR_PTR(ret);
}
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
block_rsv_add_bytes(block_rsv, blocksize, 0);
- block_rsv_release_bytes(block_rsv, NULL, 0);
+ block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
}
/*
@@ -5734,22 +7316,32 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf;
u64 flags = 0;
int ret;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
-
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+ buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
+ blocksize, level);
+ if (!IS_ERR(buf))
+ root->alloc_bytenr += blocksize;
+ return buf;
+ }
+#endif
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
- ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
- empty_size, hint, (u64)-1, &ins, 0);
+ ret = btrfs_reserve_extent(root, blocksize, blocksize,
+ empty_size, hint, &ins, 0, 0);
if (ret) {
- unuse_block_rsv(block_rsv, blocksize);
+ unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
}
buf = btrfs_init_new_buffer(trans, root, ins.objectid,
blocksize, level);
- BUG_ON(IS_ERR(buf));
+ BUG_ON(IS_ERR(buf)); /* -ENOMEM */
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
@@ -5760,22 +7352,27 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
+ extent_op = btrfs_alloc_delayed_extent_op();
+ BUG_ON(!extent_op); /* -ENOMEM */
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
else
memset(&extent_op->key, 0, sizeof(extent_op->key));
extent_op->flags_to_set = flags;
- extent_op->update_key = 1;
+ if (skinny_metadata)
+ extent_op->update_key = 0;
+ else
+ extent_op->update_key = 1;
extent_op->update_flags = 1;
extent_op->is_data = 0;
+ extent_op->level = level;
- ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ ins.objectid,
ins.offset, parent, root_objectid,
level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op);
- BUG_ON(ret);
+ extent_op, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
return buf;
}
@@ -5791,6 +7388,7 @@ struct walk_control {
int keep_locks;
int reada_slot;
int reada_count;
+ int for_reloc;
};
#define DROP_REFERENCE 1
@@ -5842,9 +7440,12 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
/* We don't lock the tree block, it's OK to be racy here */
- ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
- &refs, &flags);
- BUG_ON(ret);
+ ret = btrfs_lookup_extent_info(trans, root, bytenr,
+ wc->level - 1, 1, &refs,
+ &flags);
+ /* We don't care about errors in readahead. */
+ if (ret < 0)
+ continue;
BUG_ON(refs == 0);
if (wc->stage == DROP_REFERENCE) {
@@ -5878,7 +7479,7 @@ reada:
}
/*
- * hepler to process tree block while walking down the tree.
+ * helper to process tree block while walking down the tree.
*
* when wc->stage == UPDATE_BACKREF, this function updates
* back refs for pointers in the block.
@@ -5908,10 +7509,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
BUG_ON(!path->locks[level]);
ret = btrfs_lookup_extent_info(trans, root,
- eb->start, eb->len,
+ eb->start, level, 1,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ BUG_ON(ret == -ENOMEM);
+ if (ret)
+ return ret;
BUG_ON(wc->refs[level] == 0);
}
@@ -5920,7 +7523,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
return 1;
if (path->locks[level] && !wc->keep_locks) {
- btrfs_tree_unlock(eb);
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
}
return 0;
@@ -5929,13 +7532,14 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
- ret = btrfs_inc_ref(trans, root, eb, 1);
- BUG_ON(ret);
- ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret);
+ ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
- eb->len, flag, 0);
- BUG_ON(ret);
+ eb->len, flag,
+ btrfs_header_level(eb), 0);
+ BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -5944,14 +7548,14 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
* keep the tree lock
*/
if (path->locks[level] && level > 0) {
- btrfs_tree_unlock(eb);
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
}
return 0;
}
/*
- * hepler to process tree block pointer.
+ * helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
* reference count of the block pointed to. if the block
@@ -5999,16 +7603,25 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!next)
return -ENOMEM;
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
+ level - 1);
reada = 1;
}
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
&wc->refs[level - 1],
&wc->flags[level - 1]);
- BUG_ON(ret);
- BUG_ON(wc->refs[level - 1] == 0);
+ if (ret < 0) {
+ btrfs_tree_unlock(next);
+ return ret;
+ }
+
+ if (unlikely(wc->refs[level - 1] == 0)) {
+ btrfs_err(root->fs_info, "Missing references.");
+ BUG();
+ }
*lookup_info = 0;
if (wc->stage == DROP_REFERENCE) {
@@ -6036,7 +7649,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
goto skip;
}
- if (!btrfs_buffer_uptodate(next, generation)) {
+ if (!btrfs_buffer_uptodate(next, generation, 0)) {
btrfs_tree_unlock(next);
free_extent_buffer(next);
next = NULL;
@@ -6047,6 +7660,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(root, bytenr, blocksize, generation);
+ if (!next || !extent_buffer_uptodate(next)) {
+ free_extent_buffer(next);
+ return -EIO;
+ }
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
}
@@ -6055,7 +7672,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
BUG_ON(level != btrfs_header_level(next));
path->nodes[level] = next;
path->slots[level] = 0;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
wc->level = level;
if (wc->level == 1)
wc->reada_slot = 0;
@@ -6073,8 +7690,8 @@ skip:
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0);
- BUG_ON(ret);
+ root->root_key.objectid, level - 1, 0, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
btrfs_tree_unlock(next);
free_extent_buffer(next);
@@ -6083,7 +7700,7 @@ skip:
}
/*
- * hepler to process tree block while walking up the tree.
+ * helper to process tree block while walking up the tree.
*
* when wc->stage == DROP_REFERENCE, this function drops
* reference count on the block.
@@ -6126,16 +7743,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
BUG_ON(level == 0);
btrfs_tree_lock(eb);
btrfs_set_lock_blocking(eb);
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, root,
- eb->start, eb->len,
+ eb->start, level, 1,
&wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+ return ret;
+ }
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
- btrfs_tree_unlock(eb);
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
return 1;
}
@@ -6148,17 +7769,19 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = btrfs_dec_ref(trans, root, eb, 1);
+ ret = btrfs_dec_ref(trans, root, eb, 1,
+ wc->for_reloc);
else
- ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret);
+ ret = btrfs_dec_ref(trans, root, eb, 0,
+ wc->for_reloc);
+ BUG_ON(ret); /* -ENOMEM */
}
/* make block locked assertion in clean_tree_block happy */
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
btrfs_set_lock_blocking(eb);
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
clean_tree_block(trans, root, eb);
}
@@ -6237,7 +7860,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
return 0;
if (path->locks[level]) {
- btrfs_tree_unlock(path->nodes[level]);
+ btrfs_tree_unlock_rw(path->nodes[level],
+ path->locks[level]);
path->locks[level] = 0;
}
free_extent_buffer(path->nodes[level]);
@@ -6258,9 +7882,12 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* reference count by one. if update_ref is true, this function
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
+ *
+ * If called with for_reloc == 0, may exit early with -EAGAIN
*/
int btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref)
+ struct btrfs_block_rsv *block_rsv, int update_ref,
+ int for_reloc)
{
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -6271,15 +7898,26 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int err = 0;
int ret;
int level;
+ bool root_dropped = false;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
wc = kzalloc(sizeof(*wc), GFP_NOFS);
- BUG_ON(!wc);
+ if (!wc) {
+ btrfs_free_path(path);
+ err = -ENOMEM;
+ goto out;
+ }
trans = btrfs_start_transaction(tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
if (block_rsv)
trans->block_rsv = block_rsv;
@@ -6289,7 +7927,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
path->nodes[level] = btrfs_lock_root_node(root);
btrfs_set_lock_blocking(path->nodes[level]);
path->slots[level] = 0;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
memset(&wc->update_progress, 0,
sizeof(wc->update_progress));
} else {
@@ -6304,7 +7942,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
path->lowest_level = 0;
if (ret < 0) {
err = ret;
- goto out;
+ goto out_end_trans;
}
WARN_ON(ret > 0);
@@ -6318,19 +7956,23 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
while (1) {
btrfs_tree_lock(path->nodes[level]);
btrfs_set_lock_blocking(path->nodes[level]);
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, root,
path->nodes[level]->start,
- path->nodes[level]->len,
- &wc->refs[level],
+ level, 1, &wc->refs[level],
&wc->flags[level]);
- BUG_ON(ret);
+ if (ret < 0) {
+ err = ret;
+ goto out_end_trans;
+ }
BUG_ON(wc->refs[level] == 0);
if (level == root_item->drop_level)
break;
btrfs_tree_unlock(path->nodes[level]);
+ path->locks[level] = 0;
WARN_ON(wc->refs[level] != 1);
level--;
}
@@ -6341,9 +7983,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
wc->stage = DROP_REFERENCE;
wc->update_ref = update_ref;
wc->keep_locks = 0;
+ wc->for_reloc = for_reloc;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
+
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
err = ret;
@@ -6370,30 +8014,51 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
BUG_ON(wc->level == 0);
- if (btrfs_should_end_transaction(trans, tree_root)) {
+ if (btrfs_should_end_transaction(trans, tree_root) ||
+ (!for_reloc && btrfs_need_cleaner_sleep(root))) {
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
btrfs_end_transaction_throttle(trans, tree_root);
+ if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+ pr_debug("BTRFS: drop snapshot early exit\n");
+ err = -EAGAIN;
+ goto out_free;
+ }
+
trans = btrfs_start_transaction(tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
if (block_rsv)
trans->block_rsv = block_rsv;
}
}
- btrfs_release_path(root, path);
- BUG_ON(err);
+ btrfs_release_path(path);
+ if (err)
+ goto out_end_trans;
ret = btrfs_del_root(trans, tree_root, &root->root_key);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ goto out_end_trans;
+ }
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
- NULL, NULL);
- BUG_ON(ret < 0);
- if (ret > 0) {
+ ret = btrfs_find_root(tree_root, &root->root_key, path,
+ NULL, NULL);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ } else if (ret > 0) {
/* if we fail to delete the orphan item this time
* around, it'll get picked up the next time.
*
@@ -6404,17 +8069,31 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
- if (root->in_radix) {
- btrfs_free_fs_root(tree_root->fs_info, root);
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
+ btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
} else {
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
- kfree(root);
+ btrfs_put_fs_root(root);
}
-out:
+ root_dropped = true;
+out_end_trans:
btrfs_end_transaction_throttle(trans, tree_root);
+out_free:
kfree(wc);
btrfs_free_path(path);
+out:
+ /*
+ * So if we need to stop dropping the snapshot for whatever reason we
+ * need to make sure to add it back to the dead root list so that we
+	 * keep trying to do the work later. This also cleans up roots that we
+	 * don't have in the radix (like when we recover after a power fail
+	 * or unmount) so we don't leak memory.
+ */
+ if (!for_reloc && root_dropped == false)
+ btrfs_add_dead_root(root);
+ if (err && err != -EAGAIN)
+ btrfs_std_error(root->fs_info, err);
return err;
}
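
A minimal sketch of how a cleaner-thread caller is expected to treat the new
-EAGAIN early exit; the helper name here is illustrative and not part of this
patch:

static int clean_one_dead_root(struct btrfs_root *root)
{
	/* snapshot deletion: no extra block rsv, no backref update, not reloc */
	int ret = btrfs_drop_snapshot(root, NULL, 0, 0);

	/*
	 * -EAGAIN means the drop stopped early so the cleaner can sleep;
	 * btrfs_drop_snapshot() has already re-queued the root on the dead
	 * root list (see the out: label above), so the next cleaner pass
	 * simply retries.  Other errors were already reported through
	 * btrfs_std_error().
	 */
	return ret == -EAGAIN ? 0 : ret;
}
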
@@ -6422,6 +8101,7 @@ out:
* drop subtree rooted at tree block 'node'.
*
* NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
*/
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -6438,10 +8118,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
wc = kzalloc(sizeof(*wc), GFP_NOFS);
- BUG_ON(!wc);
+ if (!wc) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
btrfs_assert_tree_locked(parent);
parent_level = btrfs_header_level(parent);
@@ -6453,7 +8137,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
- path->locks[level] = 1;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
wc->refs[parent_level] = 1;
wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -6462,6 +8146,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->stage = DROP_REFERENCE;
wc->update_ref = 0;
wc->keep_locks = 1;
+ wc->for_reloc = 1;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
@@ -6483,1500 +8168,18 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
-#if 0
-static unsigned long calc_ra(unsigned long start, unsigned long last,
- unsigned long nr)
-{
- return min(last, start + nr - 1);
-}
-
-static noinline int relocate_inode_pages(struct inode *inode, u64 start,
- u64 len)
-{
- u64 page_start;
- u64 page_end;
- unsigned long first_index;
- unsigned long last_index;
- unsigned long i;
- struct page *page;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct file_ra_state *ra;
- struct btrfs_ordered_extent *ordered;
- unsigned int total_read = 0;
- unsigned int total_dirty = 0;
- int ret = 0;
-
- ra = kzalloc(sizeof(*ra), GFP_NOFS);
- if (!ra)
- return -ENOMEM;
-
- mutex_lock(&inode->i_mutex);
- first_index = start >> PAGE_CACHE_SHIFT;
- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
-
- /* make sure the dirty trick played by the caller work */
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- first_index, last_index);
- if (ret)
- goto out_unlock;
-
- file_ra_state_init(ra, inode->i_mapping);
-
- for (i = first_index ; i <= last_index; i++) {
- if (total_read % ra->ra_pages == 0) {
- btrfs_force_ra(inode->i_mapping, ra, NULL, i,
- calc_ra(i, last_index, ra->ra_pages));
- }
- total_read++;
-again:
- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
- BUG_ON(1);
- page = grab_cache_page(inode->i_mapping, i);
- if (!page) {
- ret = -ENOMEM;
- goto out_unlock;
- }
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- page_cache_release(page);
- ret = -EIO;
- goto out_unlock;
- }
- }
- wait_on_page_writeback(page);
-
- page_start = (u64)page->index << PAGE_CACHE_SHIFT;
- page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
- set_page_extent_mapped(page);
-
- if (i == first_index)
- set_extent_bits(io_tree, page_start, page_end,
- EXTENT_BOUNDARY, GFP_NOFS);
- btrfs_set_extent_delalloc(inode, page_start, page_end);
-
- set_page_dirty(page);
- total_dirty++;
-
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- }
-
-out_unlock:
- kfree(ra);
- mutex_unlock(&inode->i_mutex);
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
- return ret;
-}
-
-static noinline int relocate_data_extent(struct inode *reloc_inode,
- struct btrfs_key *extent_key,
- u64 offset)
-{
- struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
- struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
- struct extent_map *em;
- u64 start = extent_key->objectid - offset;
- u64 end = start + extent_key->offset - 1;
-
- em = alloc_extent_map(GFP_NOFS);
- BUG_ON(!em);
-
- em->start = start;
- em->len = extent_key->offset;
- em->block_len = extent_key->offset;
- em->block_start = extent_key->objectid;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
-
- /* setup extent map to cheat btrfs_readpage */
- lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
- while (1) {
- int ret;
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
- }
- btrfs_drop_extent_cache(reloc_inode, start, end, 0);
- }
- unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
-
- return relocate_inode_pages(reloc_inode, start, extent_key->offset);
-}
-
-struct btrfs_ref_path {
- u64 extent_start;
- u64 nodes[BTRFS_MAX_LEVEL];
- u64 root_objectid;
- u64 root_generation;
- u64 owner_objectid;
- u32 num_refs;
- int lowest_level;
- int current_level;
- int shared_level;
-
- struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
- u64 new_nodes[BTRFS_MAX_LEVEL];
-};
-
-struct disk_extent {
- u64 ram_bytes;
- u64 disk_bytenr;
- u64 disk_num_bytes;
- u64 offset;
- u64 num_bytes;
- u8 compression;
- u8 encryption;
- u16 other_encoding;
-};
-
-static int is_cowonly_root(u64 root_objectid)
-{
- if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
- root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
- root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
- root_objectid == BTRFS_DEV_TREE_OBJECTID ||
- root_objectid == BTRFS_TREE_LOG_OBJECTID ||
- root_objectid == BTRFS_CSUM_TREE_OBJECTID)
- return 1;
- return 0;
-}
-
-static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_ref_path *ref_path,
- int first_time)
-{
- struct extent_buffer *leaf;
- struct btrfs_path *path;
- struct btrfs_extent_ref *ref;
- struct btrfs_key key;
- struct btrfs_key found_key;
- u64 bytenr;
- u32 nritems;
- int level;
- int ret = 1;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- if (first_time) {
- ref_path->lowest_level = -1;
- ref_path->current_level = -1;
- ref_path->shared_level = -1;
- goto walk_up;
- }
-walk_down:
- level = ref_path->current_level - 1;
- while (level >= -1) {
- u64 parent;
- if (level < ref_path->lowest_level)
- break;
-
- if (level >= 0)
- bytenr = ref_path->nodes[level];
- else
- bytenr = ref_path->extent_start;
- BUG_ON(bytenr == 0);
-
- parent = ref_path->nodes[level + 1];
- ref_path->nodes[level + 1] = 0;
- ref_path->current_level = level;
- BUG_ON(parent == 0);
-
- key.objectid = bytenr;
- key.offset = parent + 1;
- key.type = BTRFS_EXTENT_REF_KEY;
-
- ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- BUG_ON(ret == 0);
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(extent_root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- goto next;
- leaf = path->nodes[0];
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid == bytenr &&
- found_key.type == BTRFS_EXTENT_REF_KEY) {
- if (level < ref_path->shared_level)
- ref_path->shared_level = level;
- goto found;
- }
-next:
- level--;
- btrfs_release_path(extent_root, path);
- cond_resched();
- }
- /* reached lowest level */
- ret = 1;
- goto out;
-walk_up:
- level = ref_path->current_level;
- while (level < BTRFS_MAX_LEVEL - 1) {
- u64 ref_objectid;
-
- if (level >= 0)
- bytenr = ref_path->nodes[level];
- else
- bytenr = ref_path->extent_start;
-
- BUG_ON(bytenr == 0);
-
- key.objectid = bytenr;
- key.offset = 0;
- key.type = BTRFS_EXTENT_REF_KEY;
-
- ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(extent_root, path);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- /* the extent was freed by someone */
- if (ref_path->lowest_level == level)
- goto out;
- btrfs_release_path(extent_root, path);
- goto walk_down;
- }
- leaf = path->nodes[0];
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid != bytenr ||
- found_key.type != BTRFS_EXTENT_REF_KEY) {
- /* the extent was freed by someone */
- if (ref_path->lowest_level == level) {
- ret = 1;
- goto out;
- }
- btrfs_release_path(extent_root, path);
- goto walk_down;
- }
-found:
- ref = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_ref);
- ref_objectid = btrfs_ref_objectid(leaf, ref);
- if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
- if (first_time) {
- level = (int)ref_objectid;
- BUG_ON(level >= BTRFS_MAX_LEVEL);
- ref_path->lowest_level = level;
- ref_path->current_level = level;
- ref_path->nodes[level] = bytenr;
- } else {
- WARN_ON(ref_objectid != level);
- }
- } else {
- WARN_ON(level != -1);
- }
- first_time = 0;
-
- if (ref_path->lowest_level == level) {
- ref_path->owner_objectid = ref_objectid;
- ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
- }
-
- /*
- * the block is tree root or the block isn't in reference
- * counted tree.
- */
- if (found_key.objectid == found_key.offset ||
- is_cowonly_root(btrfs_ref_root(leaf, ref))) {
- ref_path->root_objectid = btrfs_ref_root(leaf, ref);
- ref_path->root_generation =
- btrfs_ref_generation(leaf, ref);
- if (level < 0) {
- /* special reference from the tree log */
- ref_path->nodes[0] = found_key.offset;
- ref_path->current_level = 0;
- }
- ret = 0;
- goto out;
- }
-
- level++;
- BUG_ON(ref_path->nodes[level] != 0);
- ref_path->nodes[level] = found_key.offset;
- ref_path->current_level = level;
-
- /*
- * the reference was created in the running transaction,
- * no need to continue walking up.
- */
- if (btrfs_ref_generation(leaf, ref) == trans->transid) {
- ref_path->root_objectid = btrfs_ref_root(leaf, ref);
- ref_path->root_generation =
- btrfs_ref_generation(leaf, ref);
- ret = 0;
- goto out;
- }
-
- btrfs_release_path(extent_root, path);
- cond_resched();
- }
- /* reached max tree level, but no tree root found. */
- BUG();
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_ref_path *ref_path,
- u64 extent_start)
-{
- memset(ref_path, 0, sizeof(*ref_path));
- ref_path->extent_start = extent_start;
-
- return __next_ref_path(trans, extent_root, ref_path, 1);
-}
-
-static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_ref_path *ref_path)
-{
- return __next_ref_path(trans, extent_root, ref_path, 0);
-}
-
-static noinline int get_new_locations(struct inode *reloc_inode,
- struct btrfs_key *extent_key,
- u64 offset, int no_fragment,
- struct disk_extent **extents,
- int *nr_extents)
-{
- struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
- struct btrfs_path *path;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
- struct disk_extent *exts = *extents;
- struct btrfs_key found_key;
- u64 cur_pos;
- u64 last_byte;
- u32 nritems;
- int nr = 0;
- int max = *nr_extents;
- int ret;
-
- WARN_ON(!no_fragment && *extents);
- if (!exts) {
- max = 1;
- exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
- if (!exts)
- return -ENOMEM;
- }
-
- path = btrfs_alloc_path();
- BUG_ON(!path);
-
- cur_pos = extent_key->objectid - offset;
- last_byte = extent_key->objectid + extent_key->offset;
- ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
- cur_pos, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
-
- while (1) {
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- leaf = path->nodes[0];
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.offset != cur_pos ||
- found_key.type != BTRFS_EXTENT_DATA_KEY ||
- found_key.objectid != reloc_inode->i_ino)
- break;
-
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) !=
- BTRFS_FILE_EXTENT_REG ||
- btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
- break;
-
- if (nr == max) {
- struct disk_extent *old = exts;
- max *= 2;
- exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
- memcpy(exts, old, sizeof(*exts) * nr);
- if (old != *extents)
- kfree(old);
- }
-
- exts[nr].disk_bytenr =
- btrfs_file_extent_disk_bytenr(leaf, fi);
- exts[nr].disk_num_bytes =
- btrfs_file_extent_disk_num_bytes(leaf, fi);
- exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
- exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
- exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
- exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
- exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
- fi);
- BUG_ON(exts[nr].offset > 0);
- BUG_ON(exts[nr].compression || exts[nr].encryption);
- BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
-
- cur_pos += exts[nr].num_bytes;
- nr++;
-
- if (cur_pos + offset >= last_byte)
- break;
-
- if (no_fragment) {
- ret = 1;
- goto out;
- }
- path->slots[0]++;
- }
-
- BUG_ON(cur_pos + offset > last_byte);
- if (cur_pos + offset < last_byte) {
- ret = -ENOENT;
- goto out;
- }
- ret = 0;
-out:
- btrfs_free_path(path);
- if (ret) {
- if (exts != *extents)
- kfree(exts);
- } else {
- *extents = exts;
- *nr_extents = nr;
- }
- return ret;
-}
-
-static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *extent_key,
- struct btrfs_key *leaf_key,
- struct btrfs_ref_path *ref_path,
- struct disk_extent *new_extents,
- int nr_extents)
-{
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *fi;
- struct inode *inode = NULL;
- struct btrfs_key key;
- u64 lock_start = 0;
- u64 lock_end = 0;
- u64 num_bytes;
- u64 ext_offset;
- u64 search_end = (u64)-1;
- u32 nritems;
- int nr_scaned = 0;
- int extent_locked = 0;
- int extent_type;
- int ret;
-
- memcpy(&key, leaf_key, sizeof(key));
- if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
- if (key.objectid < ref_path->owner_objectid ||
- (key.objectid == ref_path->owner_objectid &&
- key.type < BTRFS_EXTENT_DATA_KEY)) {
- key.objectid = ref_path->owner_objectid;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
- }
- }
-
- while (1) {
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0)
- goto out;
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
-next:
- if (extent_locked && ret > 0) {
- /*
- * the file extent item was modified by someone
- * before the extent got locked.
- */
- unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
- lock_end, GFP_NOFS);
- extent_locked = 0;
- }
-
- if (path->slots[0] >= nritems) {
- if (++nr_scaned > 2)
- break;
-
- BUG_ON(extent_locked);
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
- if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
- if ((key.objectid > ref_path->owner_objectid) ||
- (key.objectid == ref_path->owner_objectid &&
- key.type > BTRFS_EXTENT_DATA_KEY) ||
- key.offset >= search_end)
- break;
- }
-
- if (inode && key.objectid != inode->i_ino) {
- BUG_ON(extent_locked);
- btrfs_release_path(root, path);
- mutex_unlock(&inode->i_mutex);
- iput(inode);
- inode = NULL;
- continue;
- }
-
- if (key.type != BTRFS_EXTENT_DATA_KEY) {
- path->slots[0]++;
- ret = 1;
- goto next;
- }
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(leaf, fi);
- if ((extent_type != BTRFS_FILE_EXTENT_REG &&
- extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
- (btrfs_file_extent_disk_bytenr(leaf, fi) !=
- extent_key->objectid)) {
- path->slots[0]++;
- ret = 1;
- goto next;
- }
-
- num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
- ext_offset = btrfs_file_extent_offset(leaf, fi);
-
- if (search_end == (u64)-1) {
- search_end = key.offset - ext_offset +
- btrfs_file_extent_ram_bytes(leaf, fi);
- }
-
- if (!extent_locked) {
- lock_start = key.offset;
- lock_end = lock_start + num_bytes - 1;
- } else {
- if (lock_start > key.offset ||
- lock_end + 1 < key.offset + num_bytes) {
- unlock_extent(&BTRFS_I(inode)->io_tree,
- lock_start, lock_end, GFP_NOFS);
- extent_locked = 0;
- }
- }
-
- if (!inode) {
- btrfs_release_path(root, path);
-
- inode = btrfs_iget_locked(root->fs_info->sb,
- key.objectid, root);
- if (inode->i_state & I_NEW) {
- BTRFS_I(inode)->root = root;
- BTRFS_I(inode)->location.objectid =
- key.objectid;
- BTRFS_I(inode)->location.type =
- BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
- btrfs_read_locked_inode(inode);
- unlock_new_inode(inode);
- }
- /*
- * some code call btrfs_commit_transaction while
- * holding the i_mutex, so we can't use mutex_lock
- * here.
- */
- if (is_bad_inode(inode) ||
- !mutex_trylock(&inode->i_mutex)) {
- iput(inode);
- inode = NULL;
- key.offset = (u64)-1;
- goto skip;
- }
- }
-
- if (!extent_locked) {
- struct btrfs_ordered_extent *ordered;
-
- btrfs_release_path(root, path);
-
- lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
- lock_end, GFP_NOFS);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- lock_end);
- if (ordered &&
- ordered->file_offset <= lock_end &&
- ordered->file_offset + ordered->len > lock_start) {
- unlock_extent(&BTRFS_I(inode)->io_tree,
- lock_start, lock_end, GFP_NOFS);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- key.offset += num_bytes;
- goto skip;
- }
- if (ordered)
- btrfs_put_ordered_extent(ordered);
-
- extent_locked = 1;
- continue;
- }
-
- if (nr_extents == 1) {
- /* update extent pointer in place */
- btrfs_set_file_extent_disk_bytenr(leaf, fi,
- new_extents[0].disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi,
- new_extents[0].disk_num_bytes);
- btrfs_mark_buffer_dirty(leaf);
-
- btrfs_drop_extent_cache(inode, key.offset,
- key.offset + num_bytes - 1, 0);
-
- ret = btrfs_inc_extent_ref(trans, root,
- new_extents[0].disk_bytenr,
- new_extents[0].disk_num_bytes,
- leaf->start,
- root->root_key.objectid,
- trans->transid,
- key.objectid);
- BUG_ON(ret);
-
- ret = btrfs_free_extent(trans, root,
- extent_key->objectid,
- extent_key->offset,
- leaf->start,
- btrfs_header_owner(leaf),
- btrfs_header_generation(leaf),
- key.objectid, 0);
- BUG_ON(ret);
-
- btrfs_release_path(root, path);
- key.offset += num_bytes;
- } else {
- BUG_ON(1);
-#if 0
- u64 alloc_hint;
- u64 extent_len;
- int i;
- /*
- * drop old extent pointer at first, then insert the
- * new pointers one bye one
- */
- btrfs_release_path(root, path);
- ret = btrfs_drop_extents(trans, root, inode, key.offset,
- key.offset + num_bytes,
- key.offset, &alloc_hint);
- BUG_ON(ret);
-
- for (i = 0; i < nr_extents; i++) {
- if (ext_offset >= new_extents[i].num_bytes) {
- ext_offset -= new_extents[i].num_bytes;
- continue;
- }
- extent_len = min(new_extents[i].num_bytes -
- ext_offset, num_bytes);
-
- ret = btrfs_insert_empty_item(trans, root,
- path, &key,
- sizeof(*fi));
- BUG_ON(ret);
-
- leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_generation(leaf, fi,
- trans->transid);
- btrfs_set_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_disk_bytenr(leaf, fi,
- new_extents[i].disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi,
- new_extents[i].disk_num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, fi,
- new_extents[i].ram_bytes);
-
- btrfs_set_file_extent_compression(leaf, fi,
- new_extents[i].compression);
- btrfs_set_file_extent_encryption(leaf, fi,
- new_extents[i].encryption);
- btrfs_set_file_extent_other_encoding(leaf, fi,
- new_extents[i].other_encoding);
-
- btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_len);
- ext_offset += new_extents[i].offset;
- btrfs_set_file_extent_offset(leaf, fi,
- ext_offset);
- btrfs_mark_buffer_dirty(leaf);
-
- btrfs_drop_extent_cache(inode, key.offset,
- key.offset + extent_len - 1, 0);
-
- ret = btrfs_inc_extent_ref(trans, root,
- new_extents[i].disk_bytenr,
- new_extents[i].disk_num_bytes,
- leaf->start,
- root->root_key.objectid,
- trans->transid, key.objectid);
- BUG_ON(ret);
- btrfs_release_path(root, path);
-
- inode_add_bytes(inode, extent_len);
-
- ext_offset = 0;
- num_bytes -= extent_len;
- key.offset += extent_len;
-
- if (num_bytes == 0)
- break;
- }
- BUG_ON(i >= nr_extents);
-#endif
- }
-
- if (extent_locked) {
- unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
- lock_end, GFP_NOFS);
- extent_locked = 0;
- }
-skip:
- if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
- key.offset >= search_end)
- break;
-
- cond_resched();
- }
- ret = 0;
-out:
- btrfs_release_path(root, path);
- if (inode) {
- mutex_unlock(&inode->i_mutex);
- if (extent_locked) {
- unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
- lock_end, GFP_NOFS);
- }
- iput(inode);
- }
- return ret;
-}
-
-int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf, u64 orig_start)
-{
- int level;
- int ret;
-
- BUG_ON(btrfs_header_generation(buf) != trans->transid);
- BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
-
- level = btrfs_header_level(buf);
- if (level == 0) {
- struct btrfs_leaf_ref *ref;
- struct btrfs_leaf_ref *orig_ref;
-
- orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
- if (!orig_ref)
- return -ENOENT;
-
- ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
- if (!ref) {
- btrfs_free_leaf_ref(root, orig_ref);
- return -ENOMEM;
- }
-
- ref->nritems = orig_ref->nritems;
- memcpy(ref->extents, orig_ref->extents,
- sizeof(ref->extents[0]) * ref->nritems);
-
- btrfs_free_leaf_ref(root, orig_ref);
-
- ref->root_gen = trans->transid;
- ref->bytenr = buf->start;
- ref->owner = btrfs_header_owner(buf);
- ref->generation = btrfs_header_generation(buf);
-
- ret = btrfs_add_leaf_ref(root, ref, 0);
- WARN_ON(ret);
- btrfs_free_leaf_ref(root, ref);
- }
- return 0;
-}
-
-static noinline int invalidate_extent_cache(struct btrfs_root *root,
- struct extent_buffer *leaf,
- struct btrfs_block_group_cache *group,
- struct btrfs_root *target_root)
-{
- struct btrfs_key key;
- struct inode *inode = NULL;
- struct btrfs_file_extent_item *fi;
- struct extent_state *cached_state = NULL;
- u64 num_bytes;
- u64 skip_objectid = 0;
- u32 nritems;
- u32 i;
-
- nritems = btrfs_header_nritems(leaf);
- for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(leaf, &key, i);
- if (key.objectid == skip_objectid ||
- key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
- if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
- continue;
- if (!inode || inode->i_ino != key.objectid) {
- iput(inode);
- inode = btrfs_ilookup(target_root->fs_info->sb,
- key.objectid, target_root, 1);
- }
- if (!inode) {
- skip_objectid = key.objectid;
- continue;
- }
- num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
- key.offset + num_bytes - 1, 0, &cached_state,
- GFP_NOFS);
- btrfs_drop_extent_cache(inode, key.offset,
- key.offset + num_bytes - 1, 1);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
- key.offset + num_bytes - 1, &cached_state,
- GFP_NOFS);
- cond_resched();
- }
- iput(inode);
- return 0;
-}
-
-static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *leaf,
- struct btrfs_block_group_cache *group,
- struct inode *reloc_inode)
-{
- struct btrfs_key key;
- struct btrfs_key extent_key;
- struct btrfs_file_extent_item *fi;
- struct btrfs_leaf_ref *ref;
- struct disk_extent *new_extent;
- u64 bytenr;
- u64 num_bytes;
- u32 nritems;
- u32 i;
- int ext_index;
- int nr_extent;
- int ret;
-
- new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
- BUG_ON(!new_extent);
-
- ref = btrfs_lookup_leaf_ref(root, leaf->start);
- BUG_ON(!ref);
-
- ext_index = -1;
- nritems = btrfs_header_nritems(leaf);
- for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(leaf, &key, i);
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
- bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
- if (bytenr == 0)
- continue;
-
- ext_index++;
- if (bytenr >= group->key.objectid + group->key.offset ||
- bytenr + num_bytes <= group->key.objectid)
- continue;
-
- extent_key.objectid = bytenr;
- extent_key.offset = num_bytes;
- extent_key.type = BTRFS_EXTENT_ITEM_KEY;
- nr_extent = 1;
- ret = get_new_locations(reloc_inode, &extent_key,
- group->key.objectid, 1,
- &new_extent, &nr_extent);
- if (ret > 0)
- continue;
- BUG_ON(ret < 0);
-
- BUG_ON(ref->extents[ext_index].bytenr != bytenr);
- BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
- ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
- ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
-
- btrfs_set_file_extent_disk_bytenr(leaf, fi,
- new_extent->disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi,
- new_extent->disk_num_bytes);
- btrfs_mark_buffer_dirty(leaf);
-
- ret = btrfs_inc_extent_ref(trans, root,
- new_extent->disk_bytenr,
- new_extent->disk_num_bytes,
- leaf->start,
- root->root_key.objectid,
- trans->transid, key.objectid);
- BUG_ON(ret);
-
- ret = btrfs_free_extent(trans, root,
- bytenr, num_bytes, leaf->start,
- btrfs_header_owner(leaf),
- btrfs_header_generation(leaf),
- key.objectid, 0);
- BUG_ON(ret);
- cond_resched();
- }
- kfree(new_extent);
- BUG_ON(ext_index + 1 != ref->nritems);
- btrfs_free_leaf_ref(root, ref);
- return 0;
-}
-
-int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- struct btrfs_root *reloc_root;
- int ret;
-
- if (root->reloc_root) {
- reloc_root = root->reloc_root;
- root->reloc_root = NULL;
- list_add(&reloc_root->dead_list,
- &root->fs_info->dead_reloc_roots);
-
- btrfs_set_root_bytenr(&reloc_root->root_item,
- reloc_root->node->start);
- btrfs_set_root_level(&root->root_item,
- btrfs_header_level(reloc_root->node));
- memset(&reloc_root->root_item.drop_progress, 0,
- sizeof(struct btrfs_disk_key));
- reloc_root->root_item.drop_level = 0;
-
- ret = btrfs_update_root(trans, root->fs_info->tree_root,
- &reloc_root->root_key,
- &reloc_root->root_item);
- BUG_ON(ret);
- }
- return 0;
-}
-
-int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *reloc_root;
- struct btrfs_root *prev_root = NULL;
- struct list_head dead_roots;
- int ret;
- unsigned long nr;
-
- INIT_LIST_HEAD(&dead_roots);
- list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
-
- while (!list_empty(&dead_roots)) {
- reloc_root = list_entry(dead_roots.prev,
- struct btrfs_root, dead_list);
- list_del_init(&reloc_root->dead_list);
-
- BUG_ON(reloc_root->commit_root != NULL);
- while (1) {
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
-
- mutex_lock(&root->fs_info->drop_mutex);
- ret = btrfs_drop_snapshot(trans, reloc_root);
- if (ret != -EAGAIN)
- break;
- mutex_unlock(&root->fs_info->drop_mutex);
-
- nr = trans->blocks_used;
- ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
- btrfs_btree_balance_dirty(root, nr);
- }
-
- free_extent_buffer(reloc_root->node);
-
- ret = btrfs_del_root(trans, root->fs_info->tree_root,
- &reloc_root->root_key);
- BUG_ON(ret);
- mutex_unlock(&root->fs_info->drop_mutex);
-
- nr = trans->blocks_used;
- ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
- btrfs_btree_balance_dirty(root, nr);
-
- kfree(prev_root);
- prev_root = reloc_root;
- }
- if (prev_root) {
- btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
- kfree(prev_root);
- }
- return 0;
-}
-
-int btrfs_add_dead_reloc_root(struct btrfs_root *root)
-{
- list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
- return 0;
-}
-
-int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
-{
- struct btrfs_root *reloc_root;
- struct btrfs_trans_handle *trans;
- struct btrfs_key location;
- int found;
- int ret;
-
- mutex_lock(&root->fs_info->tree_reloc_mutex);
- ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
- BUG_ON(ret);
- found = !list_empty(&root->fs_info->dead_reloc_roots);
- mutex_unlock(&root->fs_info->tree_reloc_mutex);
-
- if (found) {
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
- }
-
- location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
- location.offset = (u64)-1;
- location.type = BTRFS_ROOT_ITEM_KEY;
-
- reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
- BUG_ON(!reloc_root);
- btrfs_orphan_cleanup(reloc_root);
- return 0;
-}
-
-static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
{
- struct btrfs_root *reloc_root;
- struct extent_buffer *eb;
- struct btrfs_root_item *root_item;
- struct btrfs_key root_key;
- int ret;
-
- BUG_ON(!root->ref_cows);
- if (root->reloc_root)
- return 0;
-
- root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
- BUG_ON(!root_item);
-
- ret = btrfs_copy_root(trans, root, root->commit_root,
- &eb, BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
-
- root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
- root_key.offset = root->root_key.objectid;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
-
- memcpy(root_item, &root->root_item, sizeof(root_item));
- btrfs_set_root_refs(root_item, 0);
- btrfs_set_root_bytenr(root_item, eb->start);
- btrfs_set_root_level(root_item, btrfs_header_level(eb));
- btrfs_set_root_generation(root_item, trans->transid);
-
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
-
- ret = btrfs_insert_root(trans, root->fs_info->tree_root,
- &root_key, root_item);
- BUG_ON(ret);
- kfree(root_item);
-
- reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &root_key);
- BUG_ON(!reloc_root);
- reloc_root->last_trans = trans->transid;
- reloc_root->commit_root = NULL;
- reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
-
- root->reloc_root = reloc_root;
- return 0;
-}
-
-/*
- * Core function of space balance.
- *
- * The idea is using reloc trees to relocate tree blocks in reference
- * counted roots. There is one reloc tree for each subvol, and all
- * reloc trees share same root key objectid. Reloc trees are snapshots
- * of the latest committed roots of subvols (root->commit_root).
- *
- * To relocate a tree block referenced by a subvol, there are two steps.
- * COW the block through subvol's reloc tree, then update block pointer
- * in the subvol to point to the new block. Since all reloc trees share
- * same root key objectid, doing special handing for tree blocks owned
- * by them is easy. Once a tree block has been COWed in one reloc tree,
- * we can use the resulting new block directly when the same block is
- * required to COW again through other reloc trees. By this way, relocated
- * tree blocks are shared between reloc trees, so they are also shared
- * between subvols.
- */
-static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *first_key,
- struct btrfs_ref_path *ref_path,
- struct btrfs_block_group_cache *group,
- struct inode *reloc_inode)
-{
- struct btrfs_root *reloc_root;
- struct extent_buffer *eb = NULL;
- struct btrfs_key *keys;
- u64 *nodes;
- int level;
- int shared_level;
- int lowest_level = 0;
- int ret;
-
- if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
- lowest_level = ref_path->owner_objectid;
-
- if (!root->ref_cows) {
- path->lowest_level = lowest_level;
- ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
- BUG_ON(ret < 0);
- path->lowest_level = 0;
- btrfs_release_path(root, path);
- return 0;
- }
-
- mutex_lock(&root->fs_info->tree_reloc_mutex);
- ret = init_reloc_tree(trans, root);
- BUG_ON(ret);
- reloc_root = root->reloc_root;
-
- shared_level = ref_path->shared_level;
- ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
-
- keys = ref_path->node_keys;
- nodes = ref_path->new_nodes;
- memset(&keys[shared_level + 1], 0,
- sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
- memset(&nodes[shared_level + 1], 0,
- sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
-
- if (nodes[lowest_level] == 0) {
- path->lowest_level = lowest_level;
- ret = btrfs_search_slot(trans, reloc_root, first_key, path,
- 0, 1);
- BUG_ON(ret);
- for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
- eb = path->nodes[level];
- if (!eb || eb == reloc_root->node)
- break;
- nodes[level] = eb->start;
- if (level == 0)
- btrfs_item_key_to_cpu(eb, &keys[level], 0);
- else
- btrfs_node_key_to_cpu(eb, &keys[level], 0);
- }
- if (nodes[0] &&
- ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- eb = path->nodes[0];
- ret = replace_extents_in_leaf(trans, reloc_root, eb,
- group, reloc_inode);
- BUG_ON(ret);
- }
- btrfs_release_path(reloc_root, path);
- } else {
- ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
- lowest_level);
- BUG_ON(ret);
- }
+ u64 num_devices;
+ u64 stripped;
/*
- * replace tree blocks in the fs tree with tree blocks in
- * the reloc tree.
+	 * if a restripe for this chunk type is underway, pick the target
+	 * profile and return it; otherwise do the usual balance
*/
- ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
- BUG_ON(ret < 0);
-
- if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_search_slot(trans, reloc_root, first_key, path,
- 0, 0);
- BUG_ON(ret);
- extent_buffer_get(path->nodes[0]);
- eb = path->nodes[0];
- btrfs_release_path(reloc_root, path);
- ret = invalidate_extent_cache(reloc_root, eb, group, root);
- BUG_ON(ret);
- free_extent_buffer(eb);
- }
-
- mutex_unlock(&root->fs_info->tree_reloc_mutex);
- path->lowest_level = 0;
- return 0;
-}
-
-static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *first_key,
- struct btrfs_ref_path *ref_path)
-{
- int ret;
-
- ret = relocate_one_path(trans, root, path, first_key,
- ref_path, NULL, NULL);
- BUG_ON(ret);
-
- return 0;
-}
-
-static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct btrfs_key *extent_key)
-{
- int ret;
-
- ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
- if (ret)
- goto out;
- ret = btrfs_del_item(trans, extent_root, path);
-out:
- btrfs_release_path(extent_root, path);
- return ret;
-}
-
-static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
- struct btrfs_ref_path *ref_path)
-{
- struct btrfs_key root_key;
-
- root_key.objectid = ref_path->root_objectid;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- if (is_cowonly_root(ref_path->root_objectid))
- root_key.offset = 0;
- else
- root_key.offset = (u64)-1;
-
- return btrfs_read_fs_root_no_name(fs_info, &root_key);
-}
-
-static noinline int relocate_one_extent(struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct btrfs_key *extent_key,
- struct btrfs_block_group_cache *group,
- struct inode *reloc_inode, int pass)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *found_root;
- struct btrfs_ref_path *ref_path = NULL;
- struct disk_extent *new_extents = NULL;
- int nr_extents = 0;
- int loops;
- int ret;
- int level;
- struct btrfs_key first_key;
- u64 prev_block = 0;
-
-
- trans = btrfs_start_transaction(extent_root, 1);
- BUG_ON(IS_ERR(trans));
-
- if (extent_key->objectid == 0) {
- ret = del_extent_zero(trans, extent_root, path, extent_key);
- goto out;
- }
-
- ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
- if (!ref_path) {
- ret = -ENOMEM;
- goto out;
- }
-
- for (loops = 0; ; loops++) {
- if (loops == 0) {
- ret = btrfs_first_ref_path(trans, extent_root, ref_path,
- extent_key->objectid);
- } else {
- ret = btrfs_next_ref_path(trans, extent_root, ref_path);
- }
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
-
- if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
- ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
- continue;
-
- found_root = read_ref_root(extent_root->fs_info, ref_path);
- BUG_ON(!found_root);
- /*
- * for reference counted tree, only process reference paths
- * rooted at the latest committed root.
- */
- if (found_root->ref_cows &&
- ref_path->root_generation != found_root->root_key.offset)
- continue;
-
- if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- if (pass == 0) {
- /*
- * copy data extents to new locations
- */
- u64 group_start = group->key.objectid;
- ret = relocate_data_extent(reloc_inode,
- extent_key,
- group_start);
- if (ret < 0)
- goto out;
- break;
- }
- level = 0;
- } else {
- level = ref_path->owner_objectid;
- }
-
- if (prev_block != ref_path->nodes[level]) {
- struct extent_buffer *eb;
- u64 block_start = ref_path->nodes[level];
- u64 block_size = btrfs_level_size(found_root, level);
-
- eb = read_tree_block(found_root, block_start,
- block_size, 0);
- btrfs_tree_lock(eb);
- BUG_ON(level != btrfs_header_level(eb));
-
- if (level == 0)
- btrfs_item_key_to_cpu(eb, &first_key, 0);
- else
- btrfs_node_key_to_cpu(eb, &first_key, 0);
-
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- prev_block = block_start;
- }
-
- mutex_lock(&extent_root->fs_info->trans_mutex);
- btrfs_record_root_in_trans(found_root);
- mutex_unlock(&extent_root->fs_info->trans_mutex);
- if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- /*
- * try to update data extent references while
- * keeping metadata shared between snapshots.
- */
- if (pass == 1) {
- ret = relocate_one_path(trans, found_root,
- path, &first_key, ref_path,
- group, reloc_inode);
- if (ret < 0)
- goto out;
- continue;
- }
- /*
- * use fallback method to process the remaining
- * references.
- */
- if (!new_extents) {
- u64 group_start = group->key.objectid;
- new_extents = kmalloc(sizeof(*new_extents),
- GFP_NOFS);
- nr_extents = 1;
- ret = get_new_locations(reloc_inode,
- extent_key,
- group_start, 1,
- &new_extents,
- &nr_extents);
- if (ret)
- goto out;
- }
- ret = replace_one_extent(trans, found_root,
- path, extent_key,
- &first_key, ref_path,
- new_extents, nr_extents);
- } else {
- ret = relocate_tree_block(trans, found_root, path,
- &first_key, ref_path);
- }
- if (ret < 0)
- goto out;
- }
- ret = 0;
-out:
- btrfs_end_transaction(trans, extent_root);
- kfree(new_extents);
- kfree(ref_path);
- return ret;
-}
-#endif
-
-static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
-{
- u64 num_devices;
- u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+ stripped = get_restripe_target(root->fs_info, flags);
+ if (stripped)
+ return extended_to_chunk(stripped);
/*
* we add in the count of missing devices because we want
@@ -7986,6 +8189,10 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
if (num_devices == 1) {
stripped |= BTRFS_BLOCK_GROUP_DUP;
stripped = flags & ~stripped;
@@ -7998,7 +8205,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
return stripped | BTRFS_BLOCK_GROUP_DUP;
- return flags;
} else {
/* they already had raid on here, just return */
if (flags & stripped)
@@ -8011,36 +8217,51 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (flags & BTRFS_BLOCK_GROUP_DUP)
return stripped | BTRFS_BLOCK_GROUP_RAID1;
- /* turn single device chunks into raid0 */
- return stripped | BTRFS_BLOCK_GROUP_RAID0;
+ /* this is drive concat, leave it alone */
}
+
return flags;
}
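
To make the restripe short-circuit above concrete (the balance invocation is
only an example): while btrfs balance start -mconvert=raid1 <mnt> is in
progress, get_restripe_target() reports a RAID1 target for metadata chunks, so
update_block_group_flags() returns that profile directly instead of deriving
one from the device count. Without a pending restripe the old behaviour still
applies, e.g. a RAID1 block group on a filesystem that is down to one rw
device degrades to DUP, while a DUP group on a multi-device filesystem is
promoted to RAID1.
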
-static int set_block_group_ro(struct btrfs_block_group_cache *cache)
+static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
+ u64 min_allocable_bytes;
int ret = -ENOSPC;
- if (cache->ro)
- return 0;
+
+ /*
+	 * We need some metadata space and system metadata space for
+	 * allocating chunks in some corner cases, so keep a small margin
+	 * allocatable unless we are forced to set the group read-only.
+ */
+ if ((sinfo->flags &
+ (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
+ !force)
+ min_allocable_bytes = 1 * 1024 * 1024;
+ else
+ min_allocable_bytes = 0;
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
+
+ if (cache->ro) {
+ ret = 0;
+ goto out;
+ }
+
num_bytes = cache->key.offset - cache->reserved - cache->pinned -
cache->bytes_super - btrfs_block_group_used(&cache->item);
if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
- sinfo->bytes_may_use + sinfo->bytes_readonly +
- cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
+ sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
+ min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
- sinfo->bytes_reserved += cache->reserved_pinned;
- cache->reserved_pinned = 0;
cache->ro = 1;
ret = 0;
}
-
+out:
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
return ret;
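
Restating the read-only check above as a single inequality (no new logic):
with num_bytes being the still-unallocated space in this group, i.e.
key.offset - reserved - pinned - bytes_super - used, the group may be flipped
read-only only while

	bytes_used + bytes_reserved + bytes_pinned + bytes_may_use +
	bytes_readonly + num_bytes + min_allocable_bytes <= total_bytes

holds for its space_info, where min_allocable_bytes is 1 MiB for metadata and
system groups unless force is set, and 0 otherwise.
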
@@ -8056,21 +8277,27 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
BUG_ON(cache->ro);
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
alloc_flags = update_block_group_flags(root, cache->flags);
- if (alloc_flags != cache->flags)
- do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ if (alloc_flags != cache->flags) {
+ ret = do_chunk_alloc(trans, root, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ if (ret < 0)
+ goto out;
+ }
- ret = set_block_group_ro(cache);
+ ret = set_block_group_ro(cache, 0);
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
- ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ ret = do_chunk_alloc(trans, root, alloc_flags,
+ CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
- ret = set_block_group_ro(cache);
+ ret = set_block_group_ro(cache, 0);
out:
btrfs_end_transaction(trans, root);
return ret;
@@ -8080,7 +8307,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 type)
{
u64 alloc_flags = get_alloc_profile(root, type);
- return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ return do_chunk_alloc(trans, root, alloc_flags,
+ CHUNK_ALLOC_FORCE);
}
/*
@@ -8129,7 +8357,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
spin_lock(&sinfo->lock);
- for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
if (!list_empty(&sinfo->block_groups[i]))
free_bytes += __btrfs_get_ro_block_group_free_space(
&sinfo->block_groups[i]);
@@ -8139,7 +8367,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-int btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
@@ -8155,7 +8383,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root,
cache->ro = 0;
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
- return 0;
}
/*
@@ -8170,6 +8397,12 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
struct btrfs_space_info *space_info;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
+ struct btrfs_trans_handle *trans;
+ u64 min_free;
+ u64 dev_min = 1;
+ u64 dev_nr = 0;
+ u64 target;
+ int index;
int full = 0;
int ret = 0;
@@ -8179,8 +8412,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
if (!block_group)
return -1;
+ min_free = btrfs_block_group_used(&block_group->item);
+
/* no bytes used, we're good */
- if (!btrfs_block_group_used(&block_group->item))
+ if (!min_free)
goto out;
space_info = block_group->space_info;
@@ -8196,10 +8431,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* all of the extents from this block group. If we can, we're good
*/
if ((space_info->total_bytes != block_group->key.offset) &&
- (space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- btrfs_block_group_used(&block_group->item) <
- space_info->total_bytes)) {
+ (space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ min_free < space_info->total_bytes)) {
spin_unlock(&space_info->lock);
goto out;
}
@@ -8208,32 +8442,78 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
/*
* ok we don't have enough space, but maybe we have free space on our
* devices to allocate new chunks for relocation, so loop through our
- * alloc devices and guess if we have enough space. However, if we
- * were marked as full, then we know there aren't enough chunks, and we
- * can just return.
+ * alloc devices and guess if we have enough space. If this block
+ * group is going to be restriped, run checks against the target
+ * profile instead of the current one.
*/
ret = -1;
- if (full)
+
+ /*
+ * index:
+ * 0: raid10
+ * 1: raid1
+ * 2: dup
+ * 3: raid0
+ * 4: single
+ */
+ target = get_restripe_target(root->fs_info, block_group->flags);
+ if (target) {
+ index = __get_raid_index(extended_to_chunk(target));
+ } else {
+ /*
+ * this is just a balance, so if we were marked as full
+ * we know there is no space for a new chunk
+ */
+ if (full)
+ goto out;
+
+ index = get_block_group_index(block_group);
+ }
+
+ if (index == BTRFS_RAID_RAID10) {
+ dev_min = 4;
+ /* Divide by 2 */
+ min_free >>= 1;
+ } else if (index == BTRFS_RAID_RAID1) {
+ dev_min = 2;
+ } else if (index == BTRFS_RAID_DUP) {
+ /* Multiply by 2 */
+ min_free <<= 1;
+ } else if (index == BTRFS_RAID_RAID0) {
+ dev_min = fs_devices->rw_devices;
+ do_div(min_free, dev_min);
+ }
+
+ /* We need to do this so that we can look at pending chunks */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto out;
+ }
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
- u64 min_free = btrfs_block_group_used(&block_group->item);
u64 dev_offset;
/*
* check to make sure we can actually find a chunk with enough
* space to fit our block group in.
*/
- if (device->total_bytes > device->bytes_used + min_free) {
- ret = find_free_dev_extent(NULL, device, min_free,
+ if (device->total_bytes > device->bytes_used + min_free &&
+ !device->is_tgtdev_for_dev_replace) {
+ ret = find_free_dev_extent(trans, device, min_free,
&dev_offset, NULL);
if (!ret)
+ dev_nr++;
+
+ if (dev_nr >= dev_min)
break;
+
ret = -1;
}
}
mutex_unlock(&root->fs_info->chunk_mutex);
+ btrfs_end_transaction(trans, root);
out:
btrfs_put_block_group(block_group);
return ret;
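The per-profile scaling above decides how many devices must each offer a free extent, and how large that extent must be, before the block group can be relocated. A standalone sketch of that mapping, with made-up enum constants standing in for the BTRFS_RAID_* indices (not part of the patch):

#include <stdint.h>
#include <stdio.h>

enum { RAID10, RAID1, DUP, RAID0, SINGLE };

static void relocation_requirements(int index, uint64_t used,
				    uint64_t rw_devices,
				    uint64_t *min_free, uint64_t *dev_min)
{
	*min_free = used;
	*dev_min = 1;

	switch (index) {
	case RAID10:		/* striped + mirrored: 4 devices, half the bytes each */
		*dev_min = 4;
		*min_free >>= 1;
		break;
	case RAID1:		/* two full copies on two devices */
		*dev_min = 2;
		break;
	case DUP:		/* two copies on a single device */
		*min_free <<= 1;
		break;
	case RAID0:		/* striped across every writable device */
		*dev_min = rw_devices;
		*min_free /= rw_devices;
		break;
	default:		/* single: one copy, one device */
		break;
	}
}

int main(void)
{
	uint64_t min_free, dev_min;

	relocation_requirements(RAID10, 1ULL << 30, 6, &min_free, &dev_min);
	printf("need %llu devices with %llu free bytes each\n",
	       (unsigned long long)dev_min, (unsigned long long)min_free);
	return 0;
}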
@@ -8316,14 +8596,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- down_write(&info->extent_commit_sem);
+ down_write(&info->commit_root_sem);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
put_caching_control(caching_ctl);
}
- up_write(&info->extent_commit_sem);
+ up_write(&info->commit_root_sem);
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
@@ -8344,7 +8624,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
* We haven't cached this block group, which means we could
* possibly have excluded extents on this block group.
*/
- if (block_group->cached == BTRFS_CACHE_NO)
+ if (block_group->cached == BTRFS_CACHE_NO ||
+ block_group->cached == BTRFS_CACHE_ERROR)
free_excluded_extents(info->extent_root, block_group);
btrfs_remove_free_space_cache(block_group);
@@ -8364,17 +8645,31 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
release_global_block_rsv(info);
- while(!list_empty(&info->space_info)) {
+ while (!list_empty(&info->space_info)) {
+ int i;
+
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
- if (space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0) {
- WARN_ON(1);
- dump_space_info(space_info, 0, 0);
+ if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+ if (WARN_ON(space_info->bytes_pinned > 0 ||
+ space_info->bytes_reserved > 0 ||
+ space_info->bytes_may_use > 0)) {
+ dump_space_info(space_info, 0, 0);
+ }
}
list_del(&space_info->list);
- kfree(space_info);
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ struct kobject *kobj;
+ kobj = space_info->block_group_kobjs[i];
+ space_info->block_group_kobjs[i] = NULL;
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+ }
+ kobject_del(&space_info->kobj);
+ kobject_put(&space_info->kobj);
}
return 0;
}
@@ -8383,10 +8678,71 @@ static void __link_block_group(struct btrfs_space_info *space_info,
struct btrfs_block_group_cache *cache)
{
int index = get_block_group_index(cache);
+ bool first = false;
down_write(&space_info->groups_sem);
+ if (list_empty(&space_info->block_groups[index]))
+ first = true;
list_add_tail(&cache->list, &space_info->block_groups[index]);
up_write(&space_info->groups_sem);
+
+ if (first) {
+ struct raid_kobject *rkobj;
+ int ret;
+
+ rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj)
+ goto out_err;
+ rkobj->raid_type = index;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+ "%s", get_raid_name(index));
+ if (ret) {
+ kobject_put(&rkobj->kobj);
+ goto out_err;
+ }
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+
+ return;
+out_err:
+ pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
+}
+
+static struct btrfs_block_group_cache *
+btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->key.objectid = start;
+ cache->key.offset = size;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+
+ cache->sectorsize = root->sectorsize;
+ cache->fs_info = root->fs_info;
+ cache->full_stripe_len = btrfs_full_stripe_len(root,
+ &root->fs_info->mapping_tree,
+ start);
+ atomic_set(&cache->count, 1);
+ spin_lock_init(&cache->lock);
+ init_rwsem(&cache->data_rwsem);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->new_bg_list);
+ btrfs_init_free_space_ctl(cache);
+
+ return cache;
}
int btrfs_read_block_groups(struct btrfs_root *root)
@@ -8409,15 +8765,14 @@ int btrfs_read_block_groups(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = 1;
- cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
- if (cache_gen != 0 &&
- btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+ cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+ if (btrfs_test_opt(root, SPACE_CACHE) &&
+ btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
need_clear = 1;
if (btrfs_test_opt(root, CLEAR_CACHE))
need_clear = 1;
- if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
- printk(KERN_INFO "btrfs: disk space caching is enabled\n");
while (1) {
ret = find_first_block_group(root, path, &key);
@@ -8425,48 +8780,56 @@ int btrfs_read_block_groups(struct btrfs_root *root)
break;
if (ret != 0)
goto error;
+
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
+
+ cache = btrfs_create_block_group_cache(root, found_key.objectid,
+ found_key.offset);
if (!cache) {
ret = -ENOMEM;
goto error;
}
- atomic_set(&cache->count, 1);
- spin_lock_init(&cache->lock);
- spin_lock_init(&cache->tree_lock);
- cache->fs_info = info;
- INIT_LIST_HEAD(&cache->list);
- INIT_LIST_HEAD(&cache->cluster_list);
-
- if (need_clear)
+ if (need_clear) {
+ /*
+ * When we mount with an old space cache, we need to
+ * set BTRFS_DC_CLEAR and set the dirty flag.
+ *
+ * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+ * truncate the old free space cache inode and
+ * set up a new one.
+ * b) Setting 'dirty flag' makes sure that we flush
+ * the new space cache info onto disk.
+ */
cache->disk_cache_state = BTRFS_DC_CLEAR;
-
- /*
- * we only want to have 32k of ram per block group for keeping
- * track of free space, and if we pass 1/2 of that we want to
- * start converting things over to using bitmaps
- */
- cache->extents_thresh = ((1024 * 32) / 2) /
- sizeof(struct btrfs_free_space);
+ if (btrfs_test_opt(root, SPACE_CACHE))
+ cache->dirty = 1;
+ }
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(cache->item));
- memcpy(&cache->key, &found_key, sizeof(found_key));
+ cache->flags = btrfs_block_group_flags(&cache->item);
key.objectid = found_key.objectid + found_key.offset;
- btrfs_release_path(root, path);
- cache->flags = btrfs_block_group_flags(&cache->item);
- cache->sectorsize = root->sectorsize;
+ btrfs_release_path(path);
/*
* We need to exclude the super stripes now so that the space
* info has super bytes accounted for, otherwise we'll think
* we have more space than we actually do.
*/
- exclude_super_stripes(root, cache);
+ ret = exclude_super_stripes(root, cache);
+ if (ret) {
+ /*
+ * We may have excluded something, so call this just in
+ * case.
+ */
+ free_excluded_extents(root, cache);
+ btrfs_put_block_group(cache);
+ goto error;
+ }
/*
* check for two cases, either we are full, and therefore
@@ -8489,10 +8852,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
free_excluded_extents(root, cache);
}
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ goto error;
+ }
+
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
&space_info);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ spin_lock(&info->block_group_cache_lock);
+ rb_erase(&cache->cache_node,
+ &info->block_group_cache_tree);
+ spin_unlock(&info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ goto error;
+ }
+
cache->space_info = space_info;
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_readonly += cache->bytes_super;
@@ -8500,28 +8879,31 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
- ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret);
-
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid))
- set_block_group_ro(cache);
+ set_block_group_ro(cache, 1);
}
list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
if (!(get_alloc_profile(root, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_DUP)))
continue;
/*
* avoid allocating from un-mirrored block group if there are
* mirrored block groups.
*/
- list_for_each_entry(cache, &space_info->block_groups[3], list)
- set_block_group_ro(cache);
- list_for_each_entry(cache, &space_info->block_groups[4], list)
- set_block_group_ro(cache);
+ list_for_each_entry(cache,
+ &space_info->block_groups[BTRFS_RAID_RAID0],
+ list)
+ set_block_group_ro(cache, 1);
+ list_for_each_entry(cache,
+ &space_info->block_groups[BTRFS_RAID_SINGLE],
+ list)
+ set_block_group_ro(cache, 1);
}
init_global_block_rsv(info);
@@ -8531,6 +8913,38 @@ error:
return ret;
}
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_block_group_item item;
+ struct btrfs_key key;
+ int ret = 0;
+
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+ new_bg_list) {
+ list_del_init(&block_group->new_bg_list);
+
+ if (ret)
+ continue;
+
+ spin_lock(&block_group->lock);
+ memcpy(&item, &block_group->item, sizeof(item));
+ memcpy(&key, &block_group->key, sizeof(key));
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_insert_item(trans, extent_root, &key, &item,
+ sizeof(item));
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
+ ret = btrfs_finish_chunk_alloc(trans, extent_root,
+ key.objectid, key.offset);
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
+ }
+}
+
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -8542,48 +8956,54 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
extent_root = root->fs_info->extent_root;
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ cache = btrfs_create_block_group_cache(root, chunk_offset, size);
if (!cache)
return -ENOMEM;
- cache->key.objectid = chunk_offset;
- cache->key.offset = size;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = root->sectorsize;
- cache->fs_info = root->fs_info;
-
- /*
- * we only want to have 32k of ram per block group for keeping track
- * of free space, and if we pass 1/2 of that we want to start
- * converting things over to using bitmaps
- */
- cache->extents_thresh = ((1024 * 32) / 2) /
- sizeof(struct btrfs_free_space);
- atomic_set(&cache->count, 1);
- spin_lock_init(&cache->lock);
- spin_lock_init(&cache->tree_lock);
- INIT_LIST_HEAD(&cache->list);
- INIT_LIST_HEAD(&cache->cluster_list);
-
btrfs_set_block_group_used(&cache->item, bytes_used);
btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
- cache->flags = type;
btrfs_set_block_group_flags(&cache->item, type);
+ cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- exclude_super_stripes(root, cache);
+ ret = exclude_super_stripes(root, cache);
+ if (ret) {
+ /*
+ * We may have excluded something, so call this just in
+ * case.
+ */
+ free_excluded_extents(root, cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
add_new_free_space(cache, root->fs_info, chunk_offset,
chunk_offset + size);
free_excluded_extents(root, cache);
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ rb_erase(&cache->cache_node,
+ &root->fs_info->block_group_cache_tree);
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+ update_global_block_rsv(root->fs_info);
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_readonly += cache->bytes_super;
@@ -8591,18 +9011,28 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
- ret = btrfs_add_block_group_cache(root->fs_info, cache);
- BUG_ON(ret);
-
- ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
- sizeof(cache->item));
- BUG_ON(ret);
+ list_add_tail(&cache->new_bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
return 0;
}
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ write_seqlock(&fs_info->profiles_lock);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
+}
+
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start)
{
@@ -8612,7 +9042,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = root->fs_info->tree_root;
struct btrfs_key key;
struct inode *inode;
+ struct kobject *kobj = NULL;
int ret;
+ int index;
int factor;
root = root->fs_info->extent_root;
@@ -8621,7 +9053,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
BUG_ON(!block_group);
BUG_ON(!block_group->ro);
+ /*
+ * Free the reserved super bytes from this block group before
+ * removing it.
+ */
+ free_excluded_extents(root, block_group);
+
memcpy(&key, &block_group->key, sizeof(key));
+ index = get_block_group_index(block_group);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -8645,11 +9084,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cluster->refill_lock);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
- inode = lookup_free_space_inode(root, block_group, path);
+ inode = lookup_free_space_inode(tree_root, block_group, path);
if (!IS_ERR(inode)) {
- btrfs_orphan_add(trans, inode);
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
clear_nlink(inode);
/* One for the block groups ref */
spin_lock(&block_group->lock);
@@ -8662,7 +9108,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->lock);
}
/* One for our lookup ref */
- iput(inode);
+ btrfs_add_delayed_iput(inode);
}
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -8673,17 +9119,20 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
if (ret > 0)
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
if (ret == 0) {
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
goto out;
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
}
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
+
+ if (root->fs_info->first_logical_byte == block_group->key.objectid)
+ root->fs_info->first_logical_byte = (u64)-1;
spin_unlock(&root->fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
@@ -8692,7 +9141,16 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index])) {
+ kobj = block_group->space_info->block_group_kobjs[index];
+ block_group->space_info->block_group_kobjs[index] = NULL;
+ clear_avail_alloc_bits(root->fs_info, block_group->flags);
+ }
up_write(&block_group->space_info->groups_sem);
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_block_group_cache_done(block_group);
@@ -8724,13 +9182,148 @@ out:
return ret;
}
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+ struct btrfs_super_block *disk_super;
+ u64 features;
+ u64 flags;
+ int mixed = 0;
+ int ret;
+
+ disk_super = fs_info->super_copy;
+ if (!btrfs_super_root(disk_super))
+ return 1;
+
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ if (ret)
+ goto out;
+
+ if (mixed) {
+ flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ } else {
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ if (ret)
+ goto out;
+
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ }
+out:
+ return ret;
+}
+
int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
return unpin_extent_range(root, start, end);
}
int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes)
+ u64 num_bytes, u64 *actual_bytes)
+{
+ return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
+}
+
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 group_trimmed;
+ u64 start;
+ u64 end;
+ u64 trimmed = 0;
+ u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ int ret = 0;
+
+ /*
+ * Try to trim all FS space; our block group may start at a non-zero offset.
+ */
+ if (range->len == total_bytes)
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ else
+ cache = btrfs_lookup_block_group(fs_info, range->start);
+
+ while (cache) {
+ if (cache->key.objectid >= (range->start + range->len)) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+
+ start = max(range->start, cache->key.objectid);
+ end = min(range->start + range->len,
+ cache->key.objectid + cache->key.offset);
+
+ if (end - start >= range->minlen) {
+ if (!block_group_cache_done(cache)) {
+ ret = cache_block_group(cache, 0);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ ret = wait_block_group_cache_done(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+ ret = btrfs_trim_block_group(cache,
+ &group_trimmed,
+ start,
+ end,
+ range->minlen);
+
+ trimmed += group_trimmed;
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+
+ cache = next_block_group(fs_info->tree_root, cache);
+ }
+
+ range->len = trimmed;
+ return ret;
+}
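btrfs_trim_fs() takes the same struct fstrim_range that the generic FITRIM ioctl passes down, which is how tools such as fstrim reach it; on return, range.len is overwritten with the number of bytes actually trimmed. A minimal user-space caller (the mount point is hypothetical, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start  = 0,
		.len    = (__u64)-1,	/* whole filesystem */
		.minlen = 0,
	};
	int fd = open("/mnt/btrfs", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}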
+
+/*
+ * btrfs_{start, end}_nocow_write() are similar to mnt_{want, drop}_write():
+ * they are used to keep tasks from writing data into the page cache via
+ * nocow before the subvolume is snapshotted, and to flush that data to
+ * disk after the snapshot is created.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
{
- return btrfs_discard_extent(root, bytenr, num_bytes);
+ percpu_counter_dec(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we wake up
+ * waiters.
+ */
+ smp_mb();
+ if (waitqueue_active(&root->subv_writers->wait))
+ wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+ if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ return 0;
+
+ percpu_counter_inc(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we check for snapshot creation.
+ */
+ smp_mb();
+ if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+ btrfs_end_nocow_write(root);
+ return 0;
+ }
+ return 1;
}
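The start/end pair above is a lightweight writer-count scheme: a nocow writer announces itself only when no snapshot is pending and re-checks the flag after the increment, while the snapshot path is expected to set will_be_snapshoted and wait for the counter to drain. A simplified standalone sketch of that idea, with C11 atomics standing in for the percpu counter and wait queue (not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int writers;
static atomic_bool snapshot_pending;

static bool start_nocow_write(void)
{
	if (atomic_load(&snapshot_pending))
		return false;			/* snapshot imminent: back off */

	atomic_fetch_add(&writers, 1);		/* publish ourselves as a writer */
	if (atomic_load(&snapshot_pending)) {	/* re-check: a snapshot raced with us */
		atomic_fetch_sub(&writers, 1);
		return false;
	}
	return true;
}

static void end_nocow_write(void)
{
	atomic_fetch_sub(&writers, 1);		/* the snapshot side waits for writers == 0 */
}

int main(void)
{
	if (start_nocow_write()) {
		/* ... perform the nocow write ... */
		end_nocow_write();
	}
	return 0;
}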
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 714adc4ac4c..a389820d158 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4,27 +4,103 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
-#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/prefetch.h>
+#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
-#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"
+#include "volumes.h"
+#include "check-integrity.h"
+#include "locking.h"
+#include "rcu-string.h"
+#include "backref.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
+static struct bio_set *btrfs_bioset;
+#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);
-#define LEAK_DEBUG 0
-#if LEAK_DEBUG
static DEFINE_SPINLOCK(leak_lock);
+
+static inline
+void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(new, head);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_del(struct list_head *entry)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(entry);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_check(void)
+{
+ struct extent_state *state;
+ struct extent_buffer *eb;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
+ "state %lu in tree %p refs %d\n",
+ state->start, state->end, state->state, state->tree,
+ atomic_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+ }
+
+ while (!list_empty(&buffers)) {
+ eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+ printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
+ "refs %d\n",
+ eb->start, eb->len, atomic_read(&eb->refs));
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct extent_io_tree *tree, u64 start, u64 end)
+{
+ struct inode *inode;
+ u64 isize;
+
+ if (!tree->mapping)
+ return;
+
+ inode = tree->mapping->host;
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ printk_ratelimited(KERN_DEBUG
+ "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ caller, btrfs_ino(inode), isize, start, end);
+ }
+}
+#else
+#define btrfs_leak_debug_add(new, head) do {} while (0)
+#define btrfs_leak_debug_del(entry) do {} while (0)
+#define btrfs_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
#define BUFFER_LRU_MAX 64
@@ -39,6 +115,7 @@ struct extent_page_data {
struct bio *bio;
struct extent_io_tree *tree;
get_extent_t *get_extent;
+ unsigned long bio_flags;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
@@ -49,75 +126,83 @@ struct extent_page_data {
unsigned int sync_io:1;
};
+static noinline void flush_write_bio(void *data);
+static inline struct btrfs_fs_info *
+tree_fs_info(struct extent_io_tree *tree)
+{
+ if (!tree->mapping)
+ return NULL;
+ return btrfs_sb(tree->mapping->host->i_sb);
+}
+
int __init extent_io_init(void)
{
- extent_state_cache = kmem_cache_create("extent_state",
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
- extent_buffer_cache = kmem_cache_create("extent_buffers",
+ extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
goto free_state_cache;
+
+ btrfs_bioset = bioset_create(BIO_POOL_SIZE,
+ offsetof(struct btrfs_io_bio, bio));
+ if (!btrfs_bioset)
+ goto free_buffer_cache;
+
+ if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
+ goto free_bioset;
+
return 0;
+free_bioset:
+ bioset_free(btrfs_bioset);
+ btrfs_bioset = NULL;
+
+free_buffer_cache:
+ kmem_cache_destroy(extent_buffer_cache);
+ extent_buffer_cache = NULL;
+
free_state_cache:
kmem_cache_destroy(extent_state_cache);
+ extent_state_cache = NULL;
return -ENOMEM;
}
void extent_io_exit(void)
{
- struct extent_state *state;
- struct extent_buffer *eb;
+ btrfs_leak_debug_check();
- while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- printk(KERN_ERR "btrfs state leak: start %llu end %llu "
- "state %lu in tree %p refs %d\n",
- (unsigned long long)state->start,
- (unsigned long long)state->end,
- state->state, state->tree, atomic_read(&state->refs));
- list_del(&state->leak_list);
- kmem_cache_free(extent_state_cache, state);
-
- }
-
- while (!list_empty(&buffers)) {
- eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
- "refs %d\n", (unsigned long long)eb->start,
- eb->len, atomic_read(&eb->refs));
- list_del(&eb->leak_list);
- kmem_cache_free(extent_buffer_cache, eb);
- }
+ /*
+ * Make sure all delayed RCU frees are flushed before we
+ * destroy caches.
+ */
+ rcu_barrier();
if (extent_state_cache)
kmem_cache_destroy(extent_state_cache);
if (extent_buffer_cache)
kmem_cache_destroy(extent_buffer_cache);
+ if (btrfs_bioset)
+ bioset_free(btrfs_bioset);
}
void extent_io_tree_init(struct extent_io_tree *tree,
- struct address_space *mapping, gfp_t mask)
+ struct address_space *mapping)
{
tree->state = RB_ROOT;
- INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
- spin_lock_init(&tree->buffer_lock);
tree->mapping = mapping;
}
static struct extent_state *alloc_extent_state(gfp_t mask)
{
struct extent_state *state;
-#if LEAK_DEBUG
- unsigned long flags;
-#endif
state = kmem_cache_alloc(extent_state_cache, mask);
if (!state)
@@ -125,13 +210,10 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
state->state = 0;
state->private = 0;
state->tree = NULL;
-#if LEAK_DEBUG
- spin_lock_irqsave(&leak_lock, flags);
- list_add(&state->leak_list, &states);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+ btrfs_leak_debug_add(&state->leak_list, &states);
atomic_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
return state;
}
@@ -140,26 +222,31 @@ void free_extent_state(struct extent_state *state)
if (!state)
return;
if (atomic_dec_and_test(&state->refs)) {
-#if LEAK_DEBUG
- unsigned long flags;
-#endif
WARN_ON(state->tree);
-#if LEAK_DEBUG
- spin_lock_irqsave(&leak_lock, flags);
- list_del(&state->leak_list);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+ btrfs_leak_debug_del(&state->leak_list);
+ trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
}
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
- struct rb_node *node)
+static struct rb_node *tree_insert(struct rb_root *root,
+ struct rb_node *search_start,
+ u64 offset,
+ struct rb_node *node,
+ struct rb_node ***p_in,
+ struct rb_node **parent_in)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p;
struct rb_node *parent = NULL;
struct tree_entry *entry;
+ if (p_in && parent_in) {
+ p = *p_in;
+ parent = *parent_in;
+ goto do_insert;
+ }
+
+ p = search_start ? &search_start : &root->rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -172,36 +259,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
return parent;
}
- entry = rb_entry(node, struct tree_entry, rb_node);
+do_insert:
rb_link_node(node, parent, p);
rb_insert_color(node, root);
return NULL;
}
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **prev_ret,
- struct rb_node **next_ret)
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret,
+ struct rb_node ***p_ret,
+ struct rb_node **parent_ret)
{
struct rb_root *root = &tree->state;
- struct rb_node *n = root->rb_node;
+ struct rb_node **n = &root->rb_node;
struct rb_node *prev = NULL;
struct rb_node *orig_prev = NULL;
struct tree_entry *entry;
struct tree_entry *prev_entry = NULL;
- while (n) {
- entry = rb_entry(n, struct tree_entry, rb_node);
- prev = n;
+ while (*n) {
+ prev = *n;
+ entry = rb_entry(prev, struct tree_entry, rb_node);
prev_entry = entry;
if (offset < entry->start)
- n = n->rb_left;
+ n = &(*n)->rb_left;
else if (offset > entry->end)
- n = n->rb_right;
+ n = &(*n)->rb_right;
else
- return n;
+ return *n;
}
+ if (p_ret)
+ *p_ret = n;
+ if (parent_ret)
+ *parent_ret = prev;
+
if (prev_ret) {
orig_prev = prev;
while (prev && offset > prev_entry->end) {
@@ -223,18 +317,27 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
return NULL;
}
-static inline struct rb_node *tree_search(struct extent_io_tree *tree,
- u64 offset)
+static inline struct rb_node *
+tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***p_ret,
+ struct rb_node **parent_ret)
{
struct rb_node *prev = NULL;
struct rb_node *ret;
- ret = __etree_search(tree, offset, &prev, NULL);
+ ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
if (!ret)
return prev;
return ret;
}
+static inline struct rb_node *tree_search(struct extent_io_tree *tree,
+ u64 offset)
+{
+ return tree_search_for_insert(tree, offset, NULL, NULL);
+}
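tree_search_for_insert() now hands back the link and parent where a missing offset would be attached, so insert_state() can link the new node without walking the tree a second time. A standalone sketch of the same "search once, insert with the remembered link" idea on a plain binary search tree (made-up types, not the kernel rbtree):

#include <stdio.h>
#include <stdlib.h>

struct node { long key; struct node *left, *right; };

/* Returns the node if found; otherwise *linkp points at the empty child
 * pointer where a node with this key should be attached. */
static struct node *search(struct node **root, long key, struct node ***linkp)
{
	struct node **p = root;

	while (*p) {
		if (key < (*p)->key)
			p = &(*p)->left;
		else if (key > (*p)->key)
			p = &(*p)->right;
		else
			return *p;
	}
	*linkp = p;
	return NULL;
}

int main(void)
{
	struct node *root = NULL, **link;
	long keys[] = { 42, 17, 99 };

	for (int i = 0; i < 3; i++) {
		if (!search(&root, keys[i], &link)) {
			struct node *n = calloc(1, sizeof(*n));

			n->key = keys[i];
			*link = n;	/* no second walk needed */
		}
	}
	printf("root key: %ld\n", root->key);
	return 0;
}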
+
static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
struct extent_state *other)
{
@@ -252,14 +355,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
*
* This should be called with the tree lock held.
*/
-static int merge_state(struct extent_io_tree *tree,
- struct extent_state *state)
+static void merge_state(struct extent_io_tree *tree,
+ struct extent_state *state)
{
struct extent_state *other;
struct rb_node *other_node;
if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
- return 0;
+ return;
other_node = rb_prev(&state->rb_node);
if (other_node) {
@@ -279,35 +382,31 @@ static int merge_state(struct extent_io_tree *tree,
if (other->start == state->end + 1 &&
other->state == state->state) {
merge_cb(tree, state, other);
- other->start = state->start;
- state->tree = NULL;
- rb_erase(&state->rb_node, &tree->state);
- free_extent_state(state);
- state = NULL;
+ state->end = other->end;
+ other->tree = NULL;
+ rb_erase(&other->rb_node, &tree->state);
+ free_extent_state(other);
}
}
-
- return 0;
}
-static int set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, int *bits)
+static void set_state_cb(struct extent_io_tree *tree,
+ struct extent_state *state, unsigned long *bits)
{
- if (tree->ops && tree->ops->set_bit_hook) {
- return tree->ops->set_bit_hook(tree->mapping->host,
- state, bits);
- }
-
- return 0;
+ if (tree->ops && tree->ops->set_bit_hook)
+ tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, int *bits)
+ struct extent_state *state, unsigned long *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state, unsigned long *bits);
+
/*
* insert an extent_state struct into the tree. 'bits' are set on the
* struct before it is inserted.
@@ -320,36 +419,27 @@ static void clear_state_cb(struct extent_io_tree *tree,
*/
static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
- int *bits)
+ struct rb_node ***p,
+ struct rb_node **parent,
+ unsigned long *bits)
{
struct rb_node *node;
- int bits_to_set = *bits & ~EXTENT_CTLBITS;
- int ret;
- if (end < start) {
- printk(KERN_ERR "btrfs end < start %llu %llu\n",
- (unsigned long long)end,
- (unsigned long long)start);
- WARN_ON(1);
- }
+ if (end < start)
+ WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
+ end, start);
state->start = start;
state->end = end;
- ret = set_state_cb(tree, state, bits);
- if (ret)
- return ret;
- if (bits_to_set & EXTENT_DIRTY)
- tree->dirty_bytes += end - start + 1;
- state->state |= bits_to_set;
- node = tree_insert(&tree->state, end, &state->rb_node);
+ set_state_bits(tree, state, bits);
+
+ node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
struct extent_state *found;
found = rb_entry(node, struct extent_state, rb_node);
- printk(KERN_ERR "btrfs found node %llu %llu on insert of "
- "%llu %llu\n", (unsigned long long)found->start,
- (unsigned long long)found->end,
- (unsigned long long)start, (unsigned long long)end);
- free_extent_state(state);
+ printk(KERN_ERR "BTRFS: found node %llu %llu on insert of "
+ "%llu %llu\n",
+ found->start, found->end, start, end);
return -EEXIST;
}
state->tree = tree;
@@ -357,13 +447,11 @@ static int insert_state(struct extent_io_tree *tree,
return 0;
}
-static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
u64 split)
{
if (tree->ops && tree->ops->split_extent_hook)
- return tree->ops->split_extent_hook(tree->mapping->host,
- orig, split);
- return 0;
+ tree->ops->split_extent_hook(tree->mapping->host, orig, split);
}
/*
@@ -392,7 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
prealloc->state = orig->state;
orig->start = split;
- node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
+ node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+ &prealloc->rb_node, NULL, NULL);
if (node) {
free_extent_state(prealloc);
return -EEXIST;
@@ -401,20 +490,28 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
return 0;
}
+static struct extent_state *next_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_next(&state->rb_node);
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
/*
* utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1), or
- * forcibly remove the state from the tree (delete == 1).
+ * it will optionally wake up anyone waiting on this state (wake == 1).
*
* If no bits are set on the state struct after clearing things, the
* struct is freed and removed from the tree
*/
-static int clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state,
- int *bits, int wake)
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state,
+ unsigned long *bits, int wake)
{
- int bits_to_clear = *bits & ~EXTENT_CTLBITS;
- int ret = state->state & bits_to_clear;
+ struct extent_state *next;
+ unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -426,6 +523,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
if (wake)
wake_up(&state->wq);
if (state->state == 0) {
+ next = next_state(state);
if (state->tree) {
rb_erase(&state->rb_node, &tree->state);
state->tree = NULL;
@@ -435,8 +533,25 @@ static int clear_state_bit(struct extent_io_tree *tree,
}
} else {
merge_state(tree, state);
+ next = next_state(state);
}
- return ret;
+ return next;
+}
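clear_state_bit() may free the state it was handed, so it now computes the successor first and returns it, and callers iterate with that return value instead of re-searching the tree. A minimal standalone sketch of the pattern, with a tiny linked structure standing in for the extent state tree (not part of the patch):

#include <stdio.h>
#include <stdlib.h>

struct state { unsigned long bits; struct state *next; };

/* Clears @mask in @s. If no bits remain, @s is freed, so the caller must not
 * touch it again; the successor is grabbed first and returned instead. */
static struct state *clear_bits(struct state *s, unsigned long mask)
{
	struct state *next = s->next;	/* grab the successor before a possible free */

	s->bits &= ~mask;
	if (!s->bits)
		free(s);		/* sketch: a real tree would also unlink it */
	return next;
}

int main(void)
{
	struct state *a = calloc(1, sizeof(*a));
	struct state *b = calloc(1, sizeof(*b));

	a->bits = 0x2; a->next = b;
	b->bits = 0x3; b->next = NULL;

	/* iterate using the returned successor, never re-reading a freed node */
	for (struct state *s = a; s; s = clear_bits(s, 0x2))
		printf("visiting state with bits %lx\n", s->bits);
	return 0;
}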
+
+static struct extent_state *
+alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+ "Extent tree was modified by another "
+ "thread while locked.");
}
/*
@@ -449,24 +564,26 @@ static int clear_state_bit(struct extent_io_tree *tree,
*
* the range [start, end] is inclusive.
*
- * This takes the tree lock, and returns < 0 on error, > 0 if any of the
- * bits were already set, or zero if none of the bits were already set.
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete,
+ unsigned long bits, int wake, int delete,
struct extent_state **cached_state,
gfp_t mask)
{
struct extent_state *state;
struct extent_state *cached;
struct extent_state *prealloc = NULL;
- struct rb_node *next_node;
struct rb_node *node;
u64 last_end;
int err;
- int set = 0;
int clear = 0;
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
if (delete)
bits |= ~EXTENT_CTLBITS;
bits |= EXTENT_FIRST_DELALLOC;
@@ -489,7 +606,8 @@ again:
cached_state = NULL;
}
- if (cached && cached->tree && cached->start == start) {
+ if (cached && cached->tree && cached->start <= start &&
+ cached->end > start) {
if (clear)
atomic_dec(&cached->refs);
state = cached;
@@ -512,6 +630,12 @@ hit_next:
WARN_ON(state->end < start);
last_end = state->end;
+ /* the state doesn't have the wanted bits, go ahead */
+ if (!(state->state & bits)) {
+ state = next_state(state);
+ goto next;
+ }
+
/*
* | ---- desired range ---- |
* | state | or
@@ -529,18 +653,18 @@ hit_next:
*/
if (state->start < start) {
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, &bits, wake);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
+ state = clear_state_bit(tree, state, &bits, wake);
+ goto next;
}
goto search_again;
}
@@ -551,34 +675,28 @@ hit_next:
* on the first half
*/
if (state->start <= end && state->end > end) {
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, &bits, wake);
+ clear_state_bit(tree, prealloc, &bits, wake);
prealloc = NULL;
goto out;
}
- if (state->end < end && prealloc && !need_resched())
- next_node = rb_next(&state->rb_node);
- else
- next_node = NULL;
-
- set |= clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake);
+next:
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
- if (start <= end && next_node) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
+ if (start <= end && state && !need_resched())
+ goto hit_next;
goto search_again;
out:
@@ -586,7 +704,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return set;
+ return 0;
search_again:
if (start > end)
@@ -597,8 +715,8 @@ search_again:
goto again;
}
-static int wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
__releases(tree->lock)
__acquires(tree->lock)
{
@@ -608,7 +726,6 @@ static int wait_on_state(struct extent_io_tree *tree,
schedule();
spin_lock(&tree->lock);
finish_wait(&state->wq, &wait);
- return 0;
}
/*
@@ -616,11 +733,14 @@ static int wait_on_state(struct extent_io_tree *tree,
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned long bits)
{
struct extent_state *state;
struct rb_node *node;
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
spin_lock(&tree->lock);
again:
while (1) {
@@ -629,6 +749,7 @@ again:
* our range starts
*/
node = tree_search(tree, start);
+process_node:
if (!node)
break;
@@ -649,34 +770,27 @@ again:
if (start > end)
break;
- if (need_resched()) {
- spin_unlock(&tree->lock);
- cond_resched();
- spin_lock(&tree->lock);
+ if (!cond_resched_lock(&tree->lock)) {
+ node = rb_next(node);
+ goto process_node;
}
}
out:
spin_unlock(&tree->lock);
- return 0;
}
-static int set_state_bits(struct extent_io_tree *tree,
+static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- int *bits)
+ unsigned long *bits)
{
- int ret;
- int bits_to_set = *bits & ~EXTENT_CTLBITS;
+ unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
- ret = set_state_cb(tree, state, bits);
- if (ret)
- return ret;
+ set_state_cb(tree, state, bits);
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
state->state |= bits_to_set;
-
- return 0;
}
static void cache_state(struct extent_state *state,
@@ -701,29 +815,35 @@ static void cache_state(struct extent_state *state,
* [start, end] is inclusive This takes the tree lock.
*/
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask)
+static int __must_check
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned long bits, unsigned long exclusive_bits,
+ u64 *failed_start, struct extent_state **cached_state,
+ gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
+ struct rb_node **p;
+ struct rb_node *parent;
int err = 0;
u64 last_start;
u64 last_end;
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
- if (!prealloc)
- return -ENOMEM;
+ BUG_ON(!prealloc);
}
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
- if (state->start == start && state->tree) {
+ if (state->start <= start && state->end > start &&
+ state->tree) {
node = &state->rb_node;
goto hit_next;
}
@@ -732,11 +852,17 @@ again:
* this search will find all the extents that end after
* our range starts.
*/
- node = tree_search(tree, start);
+ node = tree_search_for_insert(tree, start, &p, &parent);
if (!node) {
- err = insert_state(tree, prealloc, start, end, &bits);
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = insert_state(tree, prealloc, start, end,
+ &p, &parent, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ cache_state(prealloc, cached_state);
prealloc = NULL;
- BUG_ON(err == -EEXIST);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
@@ -751,32 +877,22 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- struct rb_node *next_node;
if (state->state & exclusive_bits) {
*failed_start = state->start;
err = -EEXIST;
goto out;
}
- err = set_state_bits(tree, state, &bits);
- if (err)
- goto out;
-
+ set_state_bits(tree, state, &bits);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
-
start = last_end + 1;
- if (start < end && prealloc && !need_resched()) {
- next_node = rb_next(node);
- if (next_node) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
- }
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
goto search_again;
}
@@ -802,20 +918,27 @@ hit_next:
err = -EEXIST;
goto out;
}
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
- err = set_state_bits(tree, state, &bits);
- if (err)
- goto out;
+ set_state_bits(tree, state, &bits);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
}
goto search_again;
}
@@ -832,13 +955,19 @@ hit_next:
this_end = end;
else
this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+
+ /*
+ * Avoid freeing 'prealloc' if it can be merged with
+ * the later extent.
+ */
err = insert_state(tree, prealloc, start, this_end,
- &bits);
- BUG_ON(err == -EEXIST);
- if (err) {
- prealloc = NULL;
- goto out;
- }
+ NULL, NULL, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
@@ -856,16 +985,233 @@ hit_next:
err = -EEXIST;
goto out;
}
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
- BUG_ON(err == -EEXIST);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, &bits);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned long bits, u64 * failed_start,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+ cached_state, mask);
+}
+
+
+/**
+ * convert_extent_bit - convert all bits in a given range from one bit to
+ * another
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
+ * @mask: the allocation mask
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable,
+ * i.e. converting from, say, DELALLOC to DIRTY. This is not meant to be
+ * used with boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned long bits, unsigned long clear_bits,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc)
+ return -ENOMEM;
+ }
- err = set_state_bits(tree, prealloc, &bits);
- if (err) {
- prealloc = NULL;
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ state->tree) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
+
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search_for_insert(tree, start, &p, &parent);
+ if (!node) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
goto out;
}
+ err = insert_state(tree, prealloc, start, end,
+ &p, &parent, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
- merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on
+ * second half.
+ *
+ * If the extent we found extends past our
+ * range, we just split and search again. It'll get split
+ * again the next time though.
+ *
+ * If the extent we found is inside our range, we set the
+ * desired bit on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and
+ * ignore the extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid freeing 'prealloc' if it can be merged with
+ * the later extent.
+ */
+ err = insert_state(tree, prealloc, start, this_end,
+ NULL, NULL, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and set the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, &bits);
+ cache_state(prealloc, cached_state);
+ clear_state_bit(tree, prealloc, &clear_bits, 0);
prealloc = NULL;
goto out;
}
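Conceptually, convert_extent_bit() walks the range once and, on every state it touches, sets one group of bits while clearing another (for example turning DELALLOC into DIRTY); the real function additionally splits and merges extent states as the hunks above show. A standalone toy version over a flat array of flags, with made-up bit names (not part of the patch):

#include <stdio.h>

#define DELALLOC	(1u << 0)
#define DIRTY		(1u << 1)
#define UPTODATE	(1u << 2)

/* set @set_bits and clear @clear_bits on every slot in the inclusive range */
static void convert_range(unsigned *flags, int start, int end,
			  unsigned set_bits, unsigned clear_bits)
{
	for (int i = start; i <= end; i++) {
		flags[i] |= set_bits;
		flags[i] &= ~clear_bits;
	}
}

int main(void)
{
	unsigned flags[8];

	for (int i = 0; i < 8; i++)
		flags[i] = DELALLOC | UPTODATE;

	convert_range(flags, 2, 5, DIRTY, DELALLOC);	/* DELALLOC -> DIRTY */
	for (int i = 0; i < 8; i++)
		printf("%d: %#x\n", i, flags[i]);
	return 0;
}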
@@ -892,19 +1238,19 @@ search_again:
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
NULL, mask);
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, gfp_t mask)
+ unsigned long bits, gfp_t mask)
{
- return set_extent_bit(tree, start, end, bits, 0, NULL,
+ return set_extent_bit(tree, start, end, bits, NULL,
NULL, mask);
}
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, gfp_t mask)
+ unsigned long bits, gfp_t mask)
{
return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
@@ -913,8 +1259,16 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
- 0, NULL, cached_state, mask);
+ EXTENT_DELALLOC | EXTENT_UPTODATE,
+ NULL, cached_state, mask);
+}
+
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, mask);
}
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -928,78 +1282,63 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
NULL, mask);
}
-static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
- NULL, mask);
-}
-
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+ struct extent_state **cached_state, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
- NULL, mask);
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+ cached_state, mask);
}
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state,
- gfp_t mask)
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
cached_state, mask);
}
-int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
-}
-
/*
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached_state, gfp_t mask)
+ unsigned long bits, struct extent_state **cached_state)
{
int err;
u64 failed_start;
while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
- EXTENT_LOCKED, &failed_start,
- cached_state, mask);
- if (err == -EEXIST && (mask & __GFP_WAIT)) {
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, GFP_NOFS);
+ if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
- } else {
+ } else
break;
- }
WARN_ON(start > end);
}
return err;
}
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
- return lock_extent_bits(tree, start, end, 0, NULL, mask);
+ return lock_extent_bits(tree, start, end, 0, NULL);
}
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
int err;
u64 failed_start;
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, mask);
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, GFP_NOFS);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL, mask);
+ EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
return 0;
}
return 1;
@@ -1012,25 +1351,38 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
mask);
}
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- mask);
+ GFP_NOFS);
}
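
For context, a minimal caller sketch of the reworked locking API (the gfp mask is gone and GFP_NOFS is implied; the helper name and the way the range is obtained here are illustrative only, not taken from this patch):

/* Sketch: lock a byte range, work on it, then unlock (new signatures). */
static void example_with_range_locked(struct extent_io_tree *tree,
				      u64 start, u64 end)
{
	if (!try_lock_extent(tree, start, end)) {
		/* somebody else holds part of the range; block for it */
		lock_extent(tree, start, end);
	}

	/* ... operate on [start, end] while the range is locked ... */

	unlock_extent(tree, start, end);
}
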
-/*
- * helper function to set pages and extents in the tree dirty
- */
-int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
struct page *page;
while (index <= end_index) {
- page = find_get_page(tree->mapping, index);
- BUG_ON(!page);
+ page = find_get_page(inode->i_mapping, index);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
+ clear_page_dirty_for_io(page);
+ page_cache_release(page);
+ index++;
+ }
+ return 0;
+}
+
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(inode->i_mapping, index);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
+ account_page_redirty(page);
__set_page_dirty_nobuffers(page);
page_cache_release(page);
index++;
@@ -1049,7 +1401,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
while (index <= end_index) {
page = find_get_page(tree->mapping, index);
- BUG_ON(!page);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
set_page_writeback(page);
page_cache_release(page);
index++;
@@ -1057,21 +1409,17 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
return 0;
}
-/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
- *
- * If nothing was found, 1 is returned, < 0 on error
+/* find the first state struct with 'bits' set after 'start', and
+ * return it. tree->lock must be held. NULL will be returned if
+ * nothing was found after 'start'
*/
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits)
+static struct extent_state *
+find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, unsigned long bits)
{
struct rb_node *node;
struct extent_state *state;
- int ret = 1;
- spin_lock(&tree->lock);
/*
* this search will find all the extents that end after
* our range starts.
@@ -1082,50 +1430,63 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
while (1) {
state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && (state->state & bits)) {
- *start_ret = state->start;
- *end_ret = state->end;
- ret = 0;
- break;
- }
+ if (state->end >= start && (state->state & bits))
+ return state;
+
node = rb_next(node);
if (!node)
break;
}
out:
- spin_unlock(&tree->lock);
- return ret;
+ return NULL;
}
-/* find the first state struct with 'bits' set after 'start', and
- * return it. tree->lock must be held. NULL will returned if
- * nothing was found after 'start'
+/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned; if something was found, 0 is returned.
*/
-struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, int bits)
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned long bits,
+ struct extent_state **cached_state)
{
- struct rb_node *node;
struct extent_state *state;
+ struct rb_node *n;
+ int ret = 1;
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && (state->state & bits))
- return state;
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && state->tree) {
+ n = rb_next(&state->rb_node);
+ while (n) {
+ state = rb_entry(n, struct extent_state,
+ rb_node);
+ if (state->state & bits)
+ goto got_it;
+ n = rb_next(n);
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
- node = rb_next(node);
- if (!node)
- break;
+ state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+ if (state) {
+ cache_state(state, cached_state);
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
}
out:
- return NULL;
+ spin_unlock(&tree->lock);
+ return ret;
}
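
As a usage sketch of the reworked find_first_extent_bit() (the caller below is illustrative): the new optional cached_state argument lets a forward scan resume from the previously returned state instead of re-searching the tree from the root on every call:

/* Sketch: walk every range with 'bits' set, resuming via the cached state. */
static void example_scan_bits(struct extent_io_tree *tree, unsigned long bits)
{
	struct extent_state *cached = NULL;
	u64 start = 0;
	u64 found_start;
	u64 found_end;

	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
				      bits, &cached)) {
		/* ... process [found_start, found_end] ... */
		start = found_end + 1;	/* next call fast-paths via 'cached' */
	}
	free_extent_state(cached);	/* safe even if already NULL */
}
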
/*
@@ -1177,20 +1538,20 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
*end = state->end;
cur_start = state->end + 1;
node = rb_next(node);
- if (!node)
- break;
total_bytes += state->end - state->start + 1;
if (total_bytes >= max_bytes)
break;
+ if (!node)
+ break;
}
out:
spin_unlock(&tree->lock);
return found;
}
-static noinline int __unlock_for_delalloc(struct inode *inode,
- struct page *locked_page,
- u64 start, u64 end)
+static noinline void __unlock_for_delalloc(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end)
{
int ret;
struct page *pages[16];
@@ -1200,7 +1561,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
int i;
if (index == locked_page->index && end_index == index)
- return 0;
+ return;
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1215,7 +1576,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
index += ret;
cond_resched();
}
- return 0;
}
static noinline int lock_delalloc_pages(struct inode *inode,
@@ -1286,11 +1646,10 @@ done:
*
* 1 is returned if we find something, 0 if nothing was in the tree
*/
-static noinline u64 find_lock_delalloc_range(struct inode *inode,
- struct extent_io_tree *tree,
- struct page *locked_page,
- u64 *start, u64 *end,
- u64 max_bytes)
+STATIC u64 find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes)
{
u64 delalloc_start;
u64 delalloc_end;
@@ -1309,7 +1668,7 @@ again:
*start = delalloc_start;
*end = delalloc_end;
free_extent_state(cached_state);
- return found;
+ return 0;
}
/*
@@ -1322,10 +1681,9 @@ again:
/*
* make sure to limit the number of pages we try to lock down
- * if we're looping.
*/
- if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
- delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+ if (delalloc_end + 1 - delalloc_start > max_bytes)
+ delalloc_end = delalloc_start + max_bytes - 1;
/* step two, lock all the pages after the page that has start */
ret = lock_delalloc_pages(inode, locked_page,
@@ -1335,9 +1693,9 @@ again:
* shortening the size of the delalloc range we're searching
*/
free_extent_state(cached_state);
+ cached_state = NULL;
if (!loops) {
- unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
- max_bytes = PAGE_CACHE_SIZE - offset;
+ max_bytes = PAGE_CACHE_SIZE;
loops = 1;
goto again;
} else {
@@ -1345,11 +1703,10 @@ again:
goto out_failed;
}
}
- BUG_ON(ret);
+ BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end,
- 0, &cached_state, GFP_NOFS);
+ lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1369,31 +1726,21 @@ out_failed:
return found;
}
-int extent_clear_unlock_delalloc(struct inode *inode,
- struct extent_io_tree *tree,
- u64 start, u64 end, struct page *locked_page,
- unsigned long op)
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+ struct page *locked_page,
+ unsigned long clear_bits,
+ unsigned long page_ops)
{
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
int ret;
struct page *pages[16];
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
unsigned long nr_pages = end_index - index + 1;
int i;
- int clear_bits = 0;
-
- if (op & EXTENT_CLEAR_UNLOCK)
- clear_bits |= EXTENT_LOCKED;
- if (op & EXTENT_CLEAR_DIRTY)
- clear_bits |= EXTENT_DIRTY;
-
- if (op & EXTENT_CLEAR_DELALLOC)
- clear_bits |= EXTENT_DELALLOC;
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
- if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
- EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
- EXTENT_SET_PRIVATE2)))
+ if (page_ops == 0)
return 0;
while (nr_pages > 0) {
@@ -1402,20 +1749,20 @@ int extent_clear_unlock_delalloc(struct inode *inode,
nr_pages, ARRAY_SIZE(pages)), pages);
for (i = 0; i < ret; i++) {
- if (op & EXTENT_SET_PRIVATE2)
+ if (page_ops & PAGE_SET_PRIVATE2)
SetPagePrivate2(pages[i]);
if (pages[i] == locked_page) {
page_cache_release(pages[i]);
continue;
}
- if (op & EXTENT_CLEAR_DIRTY)
+ if (page_ops & PAGE_CLEAR_DIRTY)
clear_page_dirty_for_io(pages[i]);
- if (op & EXTENT_SET_WRITEBACK)
+ if (page_ops & PAGE_SET_WRITEBACK)
set_page_writeback(pages[i]);
- if (op & EXTENT_END_WRITEBACK)
+ if (page_ops & PAGE_END_WRITEBACK)
end_page_writeback(pages[i]);
- if (op & EXTENT_CLEAR_UNLOCK_PAGE)
+ if (page_ops & PAGE_UNLOCK)
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
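
A hedged sketch of a caller under the new interface above: the extent bits to clear and the page operations are now two separate masks instead of one EXTENT_CLEAR_*/EXTENT_SET_* 'op' word, and the io_tree is derived from the inode internally. The particular bit combination below is illustrative of a typical cleanup path, not taken from this patch:

/* Sketch only: clear delalloc state and finish the pages in one call. */
static void example_cleanup_range(struct inode *inode, struct page *locked_page,
				  u64 start, u64 end)
{
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DIRTY,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
}
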
@@ -1442,10 +1789,8 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 last = 0;
int found = 0;
- if (search_end <= cur_start) {
- WARN_ON(1);
+ if (WARN_ON(search_end <= cur_start))
return 0;
- }
spin_lock(&tree->lock);
if (cur_start == 0 && bits == EXTENT_DIRTY) {
@@ -1472,7 +1817,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
if (total_bytes >= max_bytes)
break;
if (!found) {
- *start = state->start;
+ *start = max(cur_start, state->start);
found = 1;
}
last = state->end;
@@ -1492,7 +1837,7 @@ out:
* set the private field for a given byte offset in the tree. If there isn't
* an extent_state there already, this does nothing.
*/
-int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
struct rb_node *node;
struct extent_state *state;
@@ -1553,14 +1898,15 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled, struct extent_state *cached)
+ unsigned long bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
int bitset = 0;
spin_lock(&tree->lock);
- if (cached && cached->tree && cached->start == start)
+ if (cached && cached->tree && cached->start <= start &&
+ cached->end > start)
node = &cached->rb_node;
else
node = tree_search(tree, start);
@@ -1605,43 +1951,429 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
* helper function to set a given page up to date if all the
* extents in the tree for that page are up to date
*/
-static int check_page_uptodate(struct extent_io_tree *tree,
- struct page *page)
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
- return 0;
}
/*
- * helper function to unlock a page if all the extents in the tree
- * for that page are unlocked
+ * When IO fails, either with EIO or with a csum verification failure, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
*/
-static int check_page_locked(struct extent_io_tree *tree,
- struct page *page)
+struct io_failure_record {
+ struct page *page;
+ u64 start;
+ u64 len;
+ u64 logical;
+ unsigned long bio_flags;
+ int this_mirror;
+ int failed_mirror;
+ int in_validation;
+};
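
The record is kept alive by stashing its address as the 64-bit private value of the matching range in the per-inode io_failure_tree, which is how the code below round-trips it through set_state_private()/get_state_private(); a minimal sketch of that round trip (the helper names are illustrative):

/* Sketch: store and look up an io_failure_record via the range's private. */
static void example_store_failrec(struct extent_io_tree *failure_tree,
				  u64 start, struct io_failure_record *rec)
{
	set_state_private(failure_tree, start, (u64)(unsigned long)rec);
}

static struct io_failure_record *
example_find_failrec(struct extent_io_tree *failure_tree, u64 start)
{
	u64 private;

	if (get_state_private(failure_tree, start, &private))
		return NULL;	/* no record for this offset */
	return (struct io_failure_record *)(unsigned long)private;
}
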
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+ int did_repair)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
- u64 end = start + PAGE_CACHE_SIZE - 1;
- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
- unlock_page(page);
- return 0;
+ int ret;
+ int err = 0;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+ set_state_private(failure_tree, rec->start, 0);
+ ret = clear_extent_bits(failure_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ if (ret)
+ err = ret;
+
+ ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_DAMAGED, GFP_NOFS);
+ if (ret && !err)
+ err = ret;
+
+ kfree(rec);
+ return err;
}
/*
- * helper function to end page writeback if all the extents
- * in the tree for that page are done with writeback
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchronization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
*/
-static int check_page_writeback(struct extent_io_tree *tree,
- struct page *page)
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
+ u64 length, u64 logical, struct page *page,
+ int mirror_num)
{
- end_page_writeback(page);
+ struct bio *bio;
+ struct btrfs_device *dev;
+ u64 map_length = 0;
+ u64 sector;
+ struct btrfs_bio *bbio = NULL;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ int ret;
+
+ ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
+ BUG_ON(!mirror_num);
+
+ /* we can't repair anything in raid56 yet */
+ if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+ return 0;
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio)
+ return -EIO;
+ bio->bi_iter.bi_size = 0;
+ map_length = length;
+
+ ret = btrfs_map_block(fs_info, WRITE, logical,
+ &map_length, &bbio, mirror_num);
+ if (ret) {
+ bio_put(bio);
+ return -EIO;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
+ sector = bbio->stripes[mirror_num-1].physical >> 9;
+ bio->bi_iter.bi_sector = sector;
+ dev = bbio->stripes[mirror_num-1].dev;
+ kfree(bbio);
+ if (!dev || !dev->bdev || !dev->writeable) {
+ bio_put(bio);
+ return -EIO;
+ }
+ bio->bi_bdev = dev->bdev;
+ bio_add_page(bio, page, length, start - page_offset(page));
+
+ if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
+ /* try to remap that extent elsewhere? */
+ bio_put(bio);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ return -EIO;
+ }
+
+ printk_ratelimited_in_rcu(KERN_INFO
+ "BTRFS: read error corrected: ino %lu off %llu "
+ "(dev %s sector %llu)\n", page->mapping->host->i_ino,
+ start, rcu_str_deref(dev->name), sector);
+
+ bio_put(bio);
return 0;
}
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num)
+{
+ u64 start = eb->start;
+ unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+ int ret = 0;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
+ start, p, mirror_num);
+ if (ret)
+ break;
+ start += PAGE_CACHE_SIZE;
+ }
+
+ return ret;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+ u64 private;
+ u64 private_failure;
+ struct io_failure_record *failrec;
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct extent_state *state;
+ int num_copies;
+ int did_repair = 0;
+ int ret;
+
+ private = 0;
+ ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+ (u64)-1, 1, EXTENT_DIRTY, 0);
+ if (!ret)
+ return 0;
+
+ ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+ &private_failure);
+ if (ret)
+ return 0;
+
+ failrec = (struct io_failure_record *)(unsigned long) private_failure;
+ BUG_ON(!failrec->this_mirror);
+
+ if (failrec->in_validation) {
+ /* there was no real error, just free the record */
+ pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+ failrec->start);
+ did_repair = 1;
+ goto out;
+ }
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ goto out;
+
+ spin_lock(&BTRFS_I(inode)->io_tree.lock);
+ state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+ failrec->start,
+ EXTENT_LOCKED);
+ spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+ if (state && state->start <= failrec->start &&
+ state->end >= failrec->start + failrec->len - 1) {
+ num_copies = btrfs_num_copies(fs_info, failrec->logical,
+ failrec->len);
+ if (num_copies > 1) {
+ ret = repair_io_failure(fs_info, start, failrec->len,
+ failrec->logical, page,
+ failrec->failed_mirror);
+ did_repair = !ret;
+ }
+ ret = 0;
+ }
+
+out:
+ if (!ret)
+ ret = free_io_failure(inode, failrec, did_repair);
+
+ return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int failed_mirror)
+{
+ struct io_failure_record *failrec = NULL;
+ u64 private;
+ struct extent_map *em;
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct bio *bio;
+ struct btrfs_io_bio *btrfs_failed_bio;
+ struct btrfs_io_bio *btrfs_bio;
+ int num_copies;
+ int ret;
+ int read_mode;
+ u64 logical;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = get_state_private(failure_tree, start, &private);
+ if (ret) {
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return -ENOMEM;
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->this_mirror = 0;
+ failrec->bio_flags = 0;
+ failrec->in_validation = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ kfree(failrec);
+ return -EIO;
+ }
+
+ if (em->start > start || em->start + em->len <= start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ kfree(failrec);
+ return -EIO;
+ }
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags,
+ em->compress_type);
+ }
+ pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+ "len=%llu\n", logical, start, failrec->len);
+ failrec->logical = logical;
+ free_extent_map(em);
+
+ /* set the bits in the private failure tree */
+ ret = set_extent_bits(failure_tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ if (ret >= 0)
+ ret = set_state_private(failure_tree, start,
+ (u64)(unsigned long)failrec);
+ /* set the bits in the inode's tree */
+ if (ret >= 0)
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+ GFP_NOFS);
+ if (ret < 0) {
+ kfree(failrec);
+ return ret;
+ }
+ } else {
+ failrec = (struct io_failure_record *)(unsigned long)private;
+ pr_debug("bio_readpage_error: (found) logical=%llu, "
+ "start=%llu, len=%llu, validation=%d\n",
+ failrec->logical, failrec->start, failrec->len,
+ failrec->in_validation);
+ /*
+ * when data can be on disk more than twice, add to failrec here
+ * (e.g. with a list for failed_mirror) to make
+ * clean_io_failure() clean all those errors at once.
+ */
+ }
+ num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+ failrec->logical, failrec->len);
+ if (num_copies == 1) {
+ /*
+ * we only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. no
+ * matter what the error is, it is very likely to persist.
+ */
+ pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ free_io_failure(inode, failrec, 0);
+ return -EIO;
+ }
+
+ /*
+ * there are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ */
+ if (failed_bio->bi_vcnt > 1) {
+ /*
+ * to fulfill b), we need to know the exact failing sectors, as
+ * we don't want to rewrite any more than the failed ones. thus,
+ * we need separate read requests for the failed bio
+ *
+ * if the following BUG_ON triggers, our validation request got
+ * merged. we need separate requests for our algorithm to work.
+ */
+ BUG_ON(failrec->in_validation);
+ failrec->in_validation = 1;
+ failrec->this_mirror = failed_mirror;
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ } else {
+ /*
+ * we're ready to fulfill a) and b) alongside. get a good copy
+ * of the failed sector and if we succeed, we have setup
+ * everything for repair_io_failure to do the rest for us.
+ */
+ if (failrec->in_validation) {
+ BUG_ON(failrec->this_mirror != failed_mirror);
+ failrec->in_validation = 0;
+ failrec->this_mirror = 0;
+ }
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
+ failrec->this_mirror++;
+ read_mode = READ_SYNC;
+ }
+
+ if (failrec->this_mirror > num_copies) {
+ pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ free_io_failure(inode, failrec, 0);
+ return -EIO;
+ }
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio) {
+ free_io_failure(inode, failrec, 0);
+ return -EIO;
+ }
+ bio->bi_end_io = failed_bio->bi_end_io;
+ bio->bi_iter.bi_sector = failrec->logical >> 9;
+ bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ bio->bi_iter.bi_size = 0;
+
+ btrfs_failed_bio = btrfs_io_bio(failed_bio);
+ if (btrfs_failed_bio->csum) {
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+ btrfs_bio = btrfs_io_bio(bio);
+ btrfs_bio->csum = btrfs_bio->csum_inline;
+ phy_offset >>= inode->i_sb->s_blocksize_bits;
+ phy_offset *= csum_size;
+ memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
+ csum_size);
+ }
+
+ bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+ pr_debug("bio_readpage_error: submitting new read[%#x] to "
+ "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+ failrec->this_mirror, num_copies, failrec->in_validation);
+
+ ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
+ failrec->this_mirror,
+ failrec->bio_flags, 0);
+ return ret;
+}
+
/* lots and lots of room for performance fixes in the end_bio funcs */
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+{
+ int uptodate = (err == 0);
+ struct extent_io_tree *tree;
+ int ret = 0;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ if (tree->ops && tree->ops->writepage_end_io_hook) {
+ ret = tree->ops->writepage_end_io_hook(page, start,
+ end, NULL, uptodate);
+ if (ret)
+ uptodate = 0;
+ }
+
+ if (!uptodate) {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ ret = ret < 0 ? ret : -EIO;
+ mapping_set_error(page->mapping, ret);
+ }
+ return 0;
+}
+
/*
* after a writepage IO is done, we need to:
* clear the uptodate bits on error
@@ -1653,61 +2385,55 @@ static int check_page_writeback(struct extent_io_tree *tree,
*/
static void end_bio_extent_writepage(struct bio *bio, int err)
{
- int uptodate = err == 0;
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct extent_io_tree *tree;
+ struct bio_vec *bvec;
u64 start;
u64 end;
- int whole_page;
- int ret;
+ int i;
- do {
+ bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
-
- start = ((u64)page->index << PAGE_CACHE_SHIFT) +
- bvec->bv_offset;
- end = start + bvec->bv_len - 1;
-
- if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
- whole_page = 1;
- else
- whole_page = 0;
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
- if (tree->ops && tree->ops->writepage_end_io_hook) {
- ret = tree->ops->writepage_end_io_hook(page, start,
- end, NULL, uptodate);
- if (ret)
- uptodate = 0;
+		/* We always issue full-page writes, but if some block
+		 * in a page fails to write, blk_update_request() will
+ * advance bv_offset and adjust bv_len to compensate.
+ * Print a warning for nonzero offsets, and an error
+ * if they don't add up to a full page. */
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
+ "partial page write in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else
+ btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
+ "incomplete page write in btrfs with offset %u and "
+ "length %u",
+ bvec->bv_offset, bvec->bv_len);
}
- if (!uptodate && tree->ops &&
- tree->ops->writepage_io_failed_hook) {
- ret = tree->ops->writepage_io_failed_hook(bio, page,
- start, end, NULL);
- if (ret == 0) {
- uptodate = (err == 0);
- continue;
- }
- }
+ start = page_offset(page);
+ end = start + bvec->bv_offset + bvec->bv_len - 1;
- if (!uptodate) {
- clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
- ClearPageUptodate(page);
- SetPageError(page);
- }
+ if (end_extent_writepage(page, err, start, end))
+ continue;
- if (whole_page)
- end_page_writeback(page);
- else
- check_page_writeback(tree, page);
- } while (bvec >= bio->bi_io_vec);
+ end_page_writeback(page);
+ }
bio_put(bio);
}
+static void
+endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
+ int uptodate)
+{
+ struct extent_state *cached = NULL;
+ u64 end = start + len - 1;
+
+ if (uptodate && tree->track_uptodate)
+ set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
+ unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
+}
+
/*
* after a readpage IO is done, we need to:
* clear the uptodate bits on error
@@ -1721,44 +2447,86 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
*/
static void end_bio_extent_readpage(struct bio *bio, int err)
{
+ struct bio_vec *bvec;
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree;
+ u64 offset = 0;
u64 start;
u64 end;
- int whole_page;
+ u64 len;
+ u64 extent_start = 0;
+ u64 extent_len = 0;
+ int mirror;
int ret;
+ int i;
if (err)
uptodate = 0;
- do {
+ bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
-
- start = ((u64)page->index << PAGE_CACHE_SHIFT) +
- bvec->bv_offset;
- end = start + bvec->bv_len - 1;
-
- if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
- whole_page = 1;
- else
- whole_page = 0;
+ struct inode *inode = page->mapping->host;
+
+ pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+ "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
+ io_bio->mirror_num);
+ tree = &BTRFS_I(inode)->io_tree;
+
+ /* We always issue full-page reads, but if some block
+ * in a page fails to read, blk_update_request() will
+ * advance bv_offset and adjust bv_len to compensate.
+ * Print a warning for nonzero offsets, and an error
+ * if they don't add up to a full page. */
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
+ "partial page read in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else
+ btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
+ "incomplete page read in btrfs with offset %u and "
+ "length %u",
+ bvec->bv_offset, bvec->bv_len);
+ }
- if (++bvec <= bvec_end)
- prefetchw(&bvec->bv_page->flags);
+ start = page_offset(page);
+ end = start + bvec->bv_offset + bvec->bv_len - 1;
+ len = bvec->bv_len;
- if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
- ret = tree->ops->readpage_end_io_hook(page, start, end,
- NULL);
+ mirror = io_bio->mirror_num;
+ if (likely(uptodate && tree->ops &&
+ tree->ops->readpage_end_io_hook)) {
+ ret = tree->ops->readpage_end_io_hook(io_bio, offset,
+ page, start, end,
+ mirror);
if (ret)
uptodate = 0;
+ else
+ clean_io_failure(start, page);
}
- if (!uptodate && tree->ops &&
- tree->ops->readpage_io_failed_hook) {
- ret = tree->ops->readpage_io_failed_hook(bio, page,
- start, end, NULL);
+
+ if (likely(uptodate))
+ goto readpage_ok;
+
+ if (tree->ops && tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(page, mirror);
+ if (!ret && !err &&
+ test_bit(BIO_UPTODATE, &bio->bi_flags))
+ uptodate = 1;
+ } else {
+ /*
+ * The generic bio_readpage_error handles errors the
+ * following way: If possible, new read requests are
+ * created and submitted and will end up in
+ * end_bio_extent_readpage as well (if we're lucky, not
+ * in the !uptodate case). In that case it returns 0 and
+ * we just go on with the next page in our bio. If it
+ * can't handle the error it will return -EIO and we
+ * remain responsible for that page.
+ */
+ ret = bio_readpage_error(bio, offset, page, start, end,
+ mirror);
if (ret == 0) {
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1767,96 +2535,111 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
continue;
}
}
-
- if (uptodate) {
- set_extent_uptodate(tree, start, end,
- GFP_ATOMIC);
+readpage_ok:
+ if (likely(uptodate)) {
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+
+ /* Zero out the end if this page straddles i_size */
+ offset = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index == end_index && offset)
+ zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
}
- unlock_extent(tree, start, end, GFP_ATOMIC);
-
- if (whole_page) {
- if (uptodate) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
+ unlock_page(page);
+ offset += len;
+
+ if (unlikely(!uptodate)) {
+ if (extent_len) {
+ endio_readpage_release_extent(tree,
+ extent_start,
+ extent_len, 1);
+ extent_start = 0;
+ extent_len = 0;
}
- unlock_page(page);
+ endio_readpage_release_extent(tree, start,
+ end - start + 1, 0);
+ } else if (!extent_len) {
+ extent_start = start;
+ extent_len = end + 1 - start;
+ } else if (extent_start + extent_len == start) {
+ extent_len += end + 1 - start;
} else {
- if (uptodate) {
- check_page_uptodate(tree, page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- check_page_locked(tree, page);
+ endio_readpage_release_extent(tree, extent_start,
+ extent_len, uptodate);
+ extent_start = start;
+ extent_len = end + 1 - start;
}
- } while (bvec <= bvec_end);
+ }
+ if (extent_len)
+ endio_readpage_release_extent(tree, extent_start, extent_len,
+ uptodate);
+ if (io_bio->end_io)
+ io_bio->end_io(io_bio, err);
bio_put(bio);
}
/*
- * IO done from prepare_write is pretty simple, we just unlock
- * the structs in the extent tree when done, and set the uptodate bits
- * as appropriate.
+ * this allocates from the btrfs_bioset. We're returning a bio right now
+ * but you can call btrfs_io_bio for the appropriate container_of magic
*/
-static void end_bio_extent_preparewrite(struct bio *bio, int err)
-{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct extent_io_tree *tree;
- u64 start;
- u64 end;
-
- do {
- struct page *page = bvec->bv_page;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
-
- start = ((u64)page->index << PAGE_CACHE_SHIFT) +
- bvec->bv_offset;
- end = start + bvec->bv_len - 1;
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (uptodate) {
- set_extent_uptodate(tree, start, end, GFP_ATOMIC);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
-
- unlock_extent(tree, start, end, GFP_ATOMIC);
-
- } while (bvec >= bio->bi_io_vec);
-
- bio_put(bio);
-}
-
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags)
{
+ struct btrfs_io_bio *btrfs_bio;
struct bio *bio;
- bio = bio_alloc(gfp_flags, nr_vecs);
+ bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
+ while (!bio && (nr_vecs /= 2)) {
+ bio = bio_alloc_bioset(gfp_flags,
+ nr_vecs, btrfs_bioset);
+ }
}
if (bio) {
- bio->bi_size = 0;
bio->bi_bdev = bdev;
- bio->bi_sector = first_sector;
+ bio->bi_iter.bi_sector = first_sector;
+ btrfs_bio = btrfs_io_bio(bio);
+ btrfs_bio->csum = NULL;
+ btrfs_bio->csum_allocated = NULL;
+ btrfs_bio->end_io = NULL;
}
return bio;
}
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+ return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+}
+
+
+/* this also allocates from the btrfs_bioset */
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+ struct btrfs_io_bio *btrfs_bio;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+ if (bio) {
+ btrfs_bio = btrfs_io_bio(bio);
+ btrfs_bio->csum = NULL;
+ btrfs_bio->csum_allocated = NULL;
+ btrfs_bio->end_io = NULL;
+ }
+ return bio;
+}
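
The "container_of magic" mentioned above works because struct btrfs_io_bio embeds the struct bio as its final member, so any bio allocated from btrfs_bioset can be converted back into its wrapper; roughly (a sketch with the field layout abbreviated, not the full definition from extent_io.h):

/* Sketch: wrapper type and accessor assumed by the allocators above. */
struct btrfs_io_bio {
	unsigned int mirror_num;
	u8 *csum;
	u8 *csum_allocated;
	btrfs_io_bio_end_io_t *end_io;
	struct bio bio;			/* must stay last: the embedded bio */
};

static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
{
	return container_of(bio, struct btrfs_io_bio, bio);
}
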
+
+
+static int __must_check submit_one_bio(int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
int ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1864,7 +2647,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
struct extent_io_tree *tree = bio->bi_private;
u64 start;
- start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+ start = page_offset(page) + bvec->bv_offset;
bio->bi_private = NULL;
@@ -1874,13 +2657,27 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
mirror_num, bio_flags, start);
else
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
+
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
bio_put(bio);
return ret;
}
+static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
+ unsigned long offset, size_t size, struct bio *bio,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ if (tree->ops && tree->ops->merge_bio_hook)
+ ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
+ bio_flags);
+ BUG_ON(ret < 0);
+ return ret;
+
+}
+
static int submit_extent_page(int rw, struct extent_io_tree *tree,
struct page *page, sector_t sector,
size_t size, unsigned long offset,
@@ -1903,18 +2700,17 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
if (bio_ret && *bio_ret) {
bio = *bio_ret;
if (old_compressed)
- contig = bio->bi_sector == sector;
+ contig = bio->bi_iter.bi_sector == sector;
else
- contig = bio->bi_sector + (bio->bi_size >> 9) ==
- sector;
+ contig = bio_end_sector(bio) == sector;
if (prev_bio_flags != bio_flags || !contig ||
- (tree->ops && tree->ops->merge_bio_hook &&
- tree->ops->merge_bio_hook(page, offset, page_size, bio,
- bio_flags)) ||
+ merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
prev_bio_flags);
+ if (ret < 0)
+ return ret;
bio = NULL;
} else {
return 0;
@@ -1941,6 +2737,18 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
return ret;
}
+static void attach_extent_buffer_page(struct extent_buffer *eb,
+ struct page *page)
+{
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, (unsigned long)eb);
+ } else {
+ WARN_ON(page->private != (unsigned long)eb);
+ }
+}
+
void set_page_extent_mapped(struct page *page)
{
if (!PagePrivate(page)) {
@@ -1950,25 +2758,48 @@ void set_page_extent_mapped(struct page *page)
}
}
-static void set_page_extent_head(struct page *page, unsigned long len)
+static struct extent_map *
+__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
+ u64 start, u64 len, get_extent_t *get_extent,
+ struct extent_map **em_cached)
{
- WARN_ON(!PagePrivate(page));
- set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
-}
+ struct extent_map *em;
+
+ if (em_cached && *em_cached) {
+ em = *em_cached;
+ if (extent_map_in_tree(em) && start >= em->start &&
+ start < extent_map_end(em)) {
+ atomic_inc(&em->refs);
+ return em;
+ }
+
+ free_extent_map(em);
+ *em_cached = NULL;
+ }
+ em = get_extent(inode, page, pg_offset, start, len, 0);
+ if (em_cached && !IS_ERR_OR_NULL(em)) {
+ BUG_ON(*em_cached);
+ atomic_inc(&em->refs);
+ *em_cached = em;
+ }
+ return em;
+}
/*
* basic readpage implementation. Locked extent state structs are inserted
* into the tree that are removed when the IO is done (by the end_io
* handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
*/
-static int __extent_read_full_page(struct extent_io_tree *tree,
- struct page *page,
- get_extent_t *get_extent,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags)
+static int __do_readpage(struct extent_io_tree *tree,
+ struct page *page,
+ get_extent_t *get_extent,
+ struct extent_map **em_cached,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags, int rw)
{
struct inode *inode = page->mapping->host;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
u64 cur = start;
@@ -1979,26 +2810,24 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
sector_t sector;
struct extent_map *em;
struct block_device *bdev;
- struct btrfs_ordered_extent *ordered;
int ret;
int nr = 0;
- size_t page_offset = 0;
+ int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+ size_t pg_offset = 0;
size_t iosize;
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
- unsigned long this_bio_flag = 0;
+ unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
set_page_extent_mapped(page);
end = page_end;
- while (1) {
- lock_extent(tree, start, end, GFP_NOFS);
- ordered = btrfs_lookup_ordered_extent(inode, start);
- if (!ordered)
- break;
- unlock_extent(tree, start, end, GFP_NOFS);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
+ if (!PageUptodate(page)) {
+ if (cleancache_get_page(page) == 0) {
+ BUG_ON(blocksize != PAGE_SIZE);
+ unlock_extent(tree, start, end);
+ goto out;
+ }
}
if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
@@ -2007,30 +2836,38 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
if (zero_offset) {
iosize = PAGE_CACHE_SIZE - zero_offset;
- userpage = kmap_atomic(page, KM_USER0);
+ userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
}
}
while (cur <= end) {
+ unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+
if (cur >= last_byte) {
char *userpage;
- iosize = PAGE_CACHE_SIZE - page_offset;
- userpage = kmap_atomic(page, KM_USER0);
- memset(userpage + page_offset, 0, iosize);
+ struct extent_state *cached = NULL;
+
+ iosize = PAGE_CACHE_SIZE - pg_offset;
+ userpage = kmap_atomic(page);
+ memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
- GFP_NOFS);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ &cached, GFP_NOFS);
+ if (!parent_locked)
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
break;
}
- em = get_extent(inode, page, page_offset, cur,
- end - cur + 1, 0);
- if (IS_ERR(em) || !em) {
+ em = __get_extent_map(inode, page, pg_offset, cur,
+ end - cur + 1, get_extent, em_cached);
+ if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
- unlock_extent(tree, cur, end, GFP_NOFS);
+ if (!parent_locked)
+ unlock_extent(tree, cur, end);
break;
}
extent_offset = cur - em->start;
@@ -2038,14 +2875,14 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
BUG_ON(end < cur);
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- this_bio_flag = EXTENT_BIO_COMPRESSED;
+ this_bio_flag |= EXTENT_BIO_COMPRESSED;
extent_set_compress_type(&this_bio_flag,
em->compress_type);
}
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
- iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ iosize = ALIGN(iosize, blocksize);
if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
disk_io_size = em->block_len;
sector = em->block_start >> 9;
@@ -2063,25 +2900,29 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
char *userpage;
- userpage = kmap_atomic(page, KM_USER0);
- memset(userpage + page_offset, 0, iosize);
+ struct extent_state *cached = NULL;
+
+ userpage = kmap_atomic(page);
+ memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage, KM_USER0);
+ kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
- GFP_NOFS);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
- page_offset += iosize;
+ pg_offset += iosize;
continue;
}
/* the get_extent function already copied into the page */
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ if (!parent_locked)
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
- page_offset += iosize;
+ pg_offset += iosize;
continue;
}
/* we have an inline extent but it didn't get marked up
@@ -2089,34 +2930,32 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
*/
if (block_start == EXTENT_MAP_INLINE) {
SetPageError(page);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ if (!parent_locked)
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
- page_offset += iosize;
+ pg_offset += iosize;
continue;
}
- ret = 0;
- if (tree->ops && tree->ops->readpage_io_hook) {
- ret = tree->ops->readpage_io_hook(page, cur,
- cur + iosize - 1);
- }
- if (!ret) {
- unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
- pnr -= page->index;
- ret = submit_extent_page(READ, tree, page,
- sector, disk_io_size, page_offset,
+ pnr -= page->index;
+ ret = submit_extent_page(rw, tree, page,
+ sector, disk_io_size, pg_offset,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag);
+ if (!ret) {
nr++;
*bio_flags = this_bio_flag;
- }
- if (ret)
+ } else {
SetPageError(page);
+ if (!parent_locked)
+ unlock_extent(tree, cur, cur + iosize - 1);
+ }
cur = cur + iosize;
- page_offset += iosize;
+ pg_offset += iosize;
}
+out:
if (!nr) {
if (!PageError(page))
SetPageUptodate(page);
@@ -2125,17 +2964,129 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
return 0;
}
+static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
+ struct page *pages[], int nr_pages,
+ u64 start, u64 end,
+ get_extent_t *get_extent,
+ struct extent_map **em_cached,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags, int rw)
+{
+ struct inode *inode;
+ struct btrfs_ordered_extent *ordered;
+ int index;
+
+ inode = pages[0]->mapping->host;
+ while (1) {
+ lock_extent(tree, start, end);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ end - start + 1);
+ if (!ordered)
+ break;
+ unlock_extent(tree, start, end);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ for (index = 0; index < nr_pages; index++) {
+ __do_readpage(tree, pages[index], get_extent, em_cached, bio,
+ mirror_num, bio_flags, rw);
+ page_cache_release(pages[index]);
+ }
+}
+
+static void __extent_readpages(struct extent_io_tree *tree,
+ struct page *pages[],
+ int nr_pages, get_extent_t *get_extent,
+ struct extent_map **em_cached,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags, int rw)
+{
+ u64 start = 0;
+ u64 end = 0;
+ u64 page_start;
+ int index;
+ int first_index = 0;
+
+ for (index = 0; index < nr_pages; index++) {
+ page_start = page_offset(pages[index]);
+ if (!end) {
+ start = page_start;
+ end = start + PAGE_CACHE_SIZE - 1;
+ first_index = index;
+ } else if (end + 1 == page_start) {
+ end += PAGE_CACHE_SIZE;
+ } else {
+ __do_contiguous_readpages(tree, &pages[first_index],
+ index - first_index, start,
+ end, get_extent, em_cached,
+ bio, mirror_num, bio_flags,
+ rw);
+ start = page_start;
+ end = start + PAGE_CACHE_SIZE - 1;
+ first_index = index;
+ }
+ }
+
+ if (end)
+ __do_contiguous_readpages(tree, &pages[first_index],
+ index - first_index, start,
+ end, get_extent, em_cached, bio,
+ mirror_num, bio_flags, rw);
+}
+
+static int __extent_read_full_page(struct extent_io_tree *tree,
+ struct page *page,
+ get_extent_t *get_extent,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags, int rw)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_ordered_extent *ordered;
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ int ret;
+
+ while (1) {
+ lock_extent(tree, start, end);
+ ordered = btrfs_lookup_ordered_extent(inode, start);
+ if (!ordered)
+ break;
+ unlock_extent(tree, start, end);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
+ bio_flags, rw);
+ return ret;
+}
+
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent)
+ get_extent_t *get_extent, int mirror_num)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
int ret;
- ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
- &bio_flags);
+ ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
+ &bio_flags, READ);
if (bio)
- ret = submit_one_bio(READ, bio, 0, bio_flags);
+ ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
+ return ret;
+}
+
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent, int mirror_num)
+{
+ struct bio *bio = NULL;
+ unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
+ int ret;
+
+ ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
+ &bio_flags, READ);
+ if (bio)
+ ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
return ret;
}
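
Judging from the EXTENT_BIO_PARENT_LOCKED handling in __do_readpage() above, the _nolock variant expects the caller to already hold the extent range locked and to unlock it itself once the read has been consumed; a hedged caller sketch (names and error handling are illustrative):

/* Sketch: read one page while the caller keeps the extent range locked. */
static int example_read_page_locked(struct extent_io_tree *tree,
				    struct page *page,
				    get_extent_t *get_extent, int mirror_num)
{
	u64 start = page_offset(page);
	u64 end = start + PAGE_CACHE_SIZE - 1;
	int ret;

	lock_extent(tree, start, end);
	ret = extent_read_full_page_nolock(tree, page, get_extent, mirror_num);
	/* ... wait for / check the read result here before unlocking ... */
	unlock_extent(tree, start, end);
	return ret;
}
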
@@ -2150,135 +3101,143 @@ static noinline void update_nr_written(struct page *page,
}
/*
- * the writepage semantics are similar to regular writepage. extent
- * records are inserted to lock ranges in the tree, and as dirty areas
- * are found, they are marked writeback. Then the lock bits are removed
- * and the end_io handler clears the writeback ranges
+ * helper for __extent_writepage, doing all of the delayed allocation setup.
+ *
+ * This returns 1 if our fill_delalloc function did all the work required
+ * to write the page (copy into inline extent). In this case the IO has
+ * been started and the page is already unlocked.
+ *
+ * This returns 0 if all went well (page still locked)
+ * This returns < 0 if there were errors (page still locked)
*/
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+static noinline_for_stack int writepage_delalloc(struct inode *inode,
+ struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ u64 delalloc_start,
+ unsigned long *nr_written)
{
- struct inode *inode = page->mapping->host;
- struct extent_page_data *epd = data;
struct extent_io_tree *tree = epd->tree;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
- u64 delalloc_start;
+ u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+ u64 nr_delalloc;
+ u64 delalloc_to_write = 0;
+ u64 delalloc_end = 0;
+ int ret;
+ int page_started = 0;
+
+ if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
+ return 0;
+
+ while (delalloc_end < page_end) {
+ nr_delalloc = find_lock_delalloc_range(inode, tree,
+ page,
+ &delalloc_start,
+ &delalloc_end,
+ 128 * 1024 * 1024);
+ if (nr_delalloc == 0) {
+ delalloc_start = delalloc_end + 1;
+ continue;
+ }
+ ret = tree->ops->fill_delalloc(inode, page,
+ delalloc_start,
+ delalloc_end,
+ &page_started,
+ nr_written);
+ /* File system has been set read-only */
+ if (ret) {
+ SetPageError(page);
+			/* fill_delalloc should return < 0 for error
+ * but just in case, we use > 0 here meaning the
+ * IO is started, so we don't want to return > 0
+ * unless things are going well.
+ */
+ ret = ret < 0 ? ret : -EIO;
+ goto done;
+ }
+ /*
+ * delalloc_end is already one less than the total
+ * length, so we don't subtract one from
+ * PAGE_CACHE_SIZE
+ */
+ delalloc_to_write += (delalloc_end - delalloc_start +
+ PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+ delalloc_start = delalloc_end + 1;
+ }
+ if (wbc->nr_to_write < delalloc_to_write) {
+ int thresh = 8192;
+
+ if (delalloc_to_write < thresh * 2)
+ thresh = delalloc_to_write;
+ wbc->nr_to_write = min_t(u64, delalloc_to_write,
+ thresh);
+ }
+
+ /* did the fill delalloc function already unlock and start
+ * the IO?
+ */
+ if (page_started) {
+ /*
+ * we've unlocked the page, so we can't update
+ * the mapping's writeback index, just update
+ * nr_to_write.
+ */
+ wbc->nr_to_write -= *nr_written;
+ return 1;
+ }
+
+ ret = 0;
+
+done:
+ return ret;
+}
+
+/*
+ * helper for __extent_writepage. This calls the writepage start hooks,
+ * and does the loop to map the page into extents and bios.
+ *
+ * We return 1 if the IO is started and the page is unlocked,
+ * 0 if all went well (page still locked)
+ * < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+ struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ loff_t i_size,
+ unsigned long nr_written,
+ int write_flags, int *nr_ret)
+{
+ struct extent_io_tree *tree = epd->tree;
+ u64 start = page_offset(page);
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
u64 cur = start;
u64 extent_offset;
- u64 last_byte = i_size_read(inode);
u64 block_start;
u64 iosize;
sector_t sector;
struct extent_state *cached_state = NULL;
struct extent_map *em;
struct block_device *bdev;
- int ret;
- int nr = 0;
size_t pg_offset = 0;
size_t blocksize;
- loff_t i_size = i_size_read(inode);
- unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
- u64 nr_delalloc;
- u64 delalloc_end;
- int page_started;
- int compressed;
- int write_flags;
- unsigned long nr_written = 0;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
- write_flags = WRITE_SYNC_PLUG;
- else
- write_flags = WRITE;
-
- WARN_ON(!PageLocked(page));
- pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
- if (page->index > end_index ||
- (page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
- unlock_page(page);
- return 0;
- }
-
- if (page->index == end_index) {
- char *userpage;
-
- userpage = kmap_atomic(page, KM_USER0);
- memset(userpage + pg_offset, 0,
- PAGE_CACHE_SIZE - pg_offset);
- kunmap_atomic(userpage, KM_USER0);
- flush_dcache_page(page);
- }
- pg_offset = 0;
-
- set_page_extent_mapped(page);
-
- delalloc_start = start;
- delalloc_end = 0;
- page_started = 0;
- if (!epd->extent_locked) {
- u64 delalloc_to_write = 0;
- /*
- * make sure the wbc mapping index is at least updated
- * to this page.
- */
- update_nr_written(page, wbc, 0);
-
- while (delalloc_end < page_end) {
- nr_delalloc = find_lock_delalloc_range(inode, tree,
- page,
- &delalloc_start,
- &delalloc_end,
- 128 * 1024 * 1024);
- if (nr_delalloc == 0) {
- delalloc_start = delalloc_end + 1;
- continue;
- }
- tree->ops->fill_delalloc(inode, page, delalloc_start,
- delalloc_end, &page_started,
- &nr_written);
- /*
- * delalloc_end is already one less than the total
- * length, so we don't subtract one from
- * PAGE_CACHE_SIZE
- */
- delalloc_to_write += (delalloc_end - delalloc_start +
- PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
- delalloc_start = delalloc_end + 1;
- }
- if (wbc->nr_to_write < delalloc_to_write) {
- int thresh = 8192;
-
- if (delalloc_to_write < thresh * 2)
- thresh = delalloc_to_write;
- wbc->nr_to_write = min_t(u64, delalloc_to_write,
- thresh);
- }
+ int ret = 0;
+ int nr = 0;
+ bool compressed;
- /* did the fill delalloc function already unlock and start
- * the IO?
- */
- if (page_started) {
- ret = 0;
- /*
- * we've unlocked the page, so we can't update
- * the mapping's writeback index, just update
- * nr_to_write.
- */
- wbc->nr_to_write -= nr_written;
- goto done_unlocked;
- }
- }
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
- if (ret == -EAGAIN) {
- redirty_page_for_writepage(wbc, page);
+ if (ret) {
+ /* Fixup worker will requeue */
+ if (ret == -EBUSY)
+ wbc->pages_skipped++;
+ else
+ redirty_page_for_writepage(wbc, page);
+
update_nr_written(page, wbc, nr_written);
unlock_page(page);
- ret = 0;
+ ret = 1;
goto done_unlocked;
}
}
@@ -2290,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
update_nr_written(page, wbc, nr_written + 1);
end = page_end;
- if (last_byte <= start) {
+ if (i_size <= start) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
page_end, NULL, 1);
@@ -2300,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
- if (cur >= last_byte) {
+ u64 em_end;
+ if (cur >= i_size) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
page_end, NULL, 1);
@@ -2308,16 +3268,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
em = epd->get_extent(inode, page, pg_offset, cur,
end - cur + 1, 1);
- if (IS_ERR(em) || !em) {
+ if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
+ ret = PTR_ERR_OR_ZERO(em);
break;
}
extent_offset = cur - em->start;
- BUG_ON(extent_map_end(em) <= cur);
+ em_end = extent_map_end(em);
+ BUG_ON(em_end <= cur);
BUG_ON(end < cur);
- iosize = min(extent_map_end(em) - cur, end - cur + 1);
- iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ iosize = min(em_end - cur, end - cur + 1);
+ iosize = ALIGN(iosize, blocksize);
sector = (em->block_start + extent_offset) >> 9;
bdev = em->bdev;
block_start = em->block_start;
@@ -2352,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset += iosize;
continue;
}
- /* leave this out until we have a page_mkwrite call */
- if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
- EXTENT_DIRTY, 0, NULL)) {
- cur = cur + iosize;
- pg_offset += iosize;
- continue;
- }
if (tree->ops && tree->ops->writepage_io_hook) {
ret = tree->ops->writepage_io_hook(page, cur,
@@ -2369,14 +3324,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (ret) {
SetPageError(page);
} else {
- unsigned long max_nr = end_index + 1;
+ unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
- printk(KERN_ERR "btrfs warning page %lu not "
- "writeback, cur %llu end %llu\n",
- page->index, (unsigned long long)cur,
- (unsigned long long)end);
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "page %lu not writeback, cur %llu end %llu",
+ page->index, cur, end);
}
ret = submit_extent_page(write_flags, tree, page,
@@ -2392,20 +3346,391 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
nr++;
}
done:
+ *nr_ret = nr;
+
+done_unlocked:
+
+ /* drop our reference on any cached states */
+ free_extent_state(cached_state);
+ return ret;
+}
+
+/*
+ * The writepage semantics are similar to regular writepage: extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found they are marked writeback. Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges.
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct inode *inode = page->mapping->host;
+ struct extent_page_data *epd = data;
+ u64 start = page_offset(page);
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ int ret;
+ int nr = 0;
+ size_t pg_offset = 0;
+ loff_t i_size = i_size_read(inode);
+ unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+ int write_flags;
+ unsigned long nr_written = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ write_flags = WRITE_SYNC;
+ else
+ write_flags = WRITE;
+
+ trace___extent_writepage(page, inode, wbc);
+
+ WARN_ON(!PageLocked(page));
+
+ ClearPageError(page);
+
+ pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+ if (page->index > end_index ||
+ (page->index == end_index && !pg_offset)) {
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (page->index == end_index) {
+ char *userpage;
+
+ userpage = kmap_atomic(page);
+ memset(userpage + pg_offset, 0,
+ PAGE_CACHE_SIZE - pg_offset);
+ kunmap_atomic(userpage);
+ flush_dcache_page(page);
+ }
+
+ pg_offset = 0;
+
+ set_page_extent_mapped(page);
+
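+	/*
+	 * writepage_delalloc() and __extent_writepage_io() below use a return
+	 * value of 1 to mean the page has already been unlocked (for example
+	 * when the delalloc fill code started the IO itself), so in that case
+	 * skip straight to done_unlocked.
+	 */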
+ ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
+ if (ret == 1)
+ goto done_unlocked;
+ if (ret)
+ goto done;
+
+ ret = __extent_writepage_io(inode, page, wbc, epd,
+ i_size, nr_written, write_flags, &nr);
+ if (ret == 1)
+ goto done_unlocked;
+
+done:
if (nr == 0) {
/* make sure the mapping tag for page dirty gets cleared */
set_page_writeback(page);
end_page_writeback(page);
}
+ if (PageError(page)) {
+ ret = ret < 0 ? ret : -EIO;
+ end_extent_writepage(page, ret, start, page_end);
+ }
unlock_page(page);
+ return ret;
done_unlocked:
+ return 0;
+}
- /* drop our reference on any cached states */
- free_extent_state(cached_state);
+static int eb_wait(void *word)
+{
+ io_schedule();
return 0;
}
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static noinline_for_stack int
+lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct extent_page_data *epd)
+{
+ unsigned long i, num_pages;
+ int flush = 0;
+ int ret = 0;
+
+ if (!btrfs_try_tree_write_lock(eb)) {
+ flush = 1;
+ flush_write_bio(epd);
+ btrfs_tree_lock(eb);
+ }
+
+ if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+ btrfs_tree_unlock(eb);
+ if (!epd->sync_io)
+ return 0;
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
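+		/*
+		 * For data integrity writeback we cannot skip an eb that is
+		 * already under writeback, so flush our pending bio and wait
+		 * for the in-flight IO to finish, rechecking under the tree
+		 * lock because the writeback bit may be set again.
+		 */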
+ while (1) {
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_lock(eb);
+ if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+ break;
+ btrfs_tree_unlock(eb);
+ }
+ }
+
+	/*
+	 * We need to do this to prevent races with anyone checking whether the
+	 * eb is under IO, since we can end up having no IO bits set for a
+	 * short period of time.
+	 */
+ spin_lock(&eb->refs_lock);
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ spin_unlock(&eb->refs_lock);
+ btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+ __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+ -eb->len,
+ fs_info->dirty_metadata_batch);
+ ret = 1;
+ } else {
+ spin_unlock(&eb->refs_lock);
+ }
+
+ btrfs_tree_unlock(eb);
+
+ if (!ret)
+ return ret;
+
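+	/*
+	 * The eb is now marked for writeback; lock each of its pages so
+	 * write_one_eb() can clear their dirty state and submit them. If a
+	 * trylock fails, flush the bio built so far before blocking on the
+	 * page lock.
+	 */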
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+
+ if (!trylock_page(p)) {
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
+ lock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+ struct bio_vec *bvec;
+ struct extent_buffer *eb;
+ int i, done;
+
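+	/*
+	 * Each bvec completion drops eb->io_pages; only the final completion
+	 * clears EXTENT_BUFFER_WRITEBACK and wakes anyone waiting on the eb.
+	 */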
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ done = atomic_dec_and_test(&eb->io_pages);
+
+ if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+
+ end_page_writeback(page);
+
+ if (!done)
+ continue;
+
+ end_extent_buffer_writeback(eb);
+ }
+
+ bio_put(bio);
+}
+
+static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+ struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+ u64 offset = eb->start;
+ unsigned long i, num_pages;
+ unsigned long bio_flags = 0;
+ int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
+ int ret = 0;
+
+ clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ atomic_set(&eb->io_pages, num_pages);
+ if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+ bio_flags = EXTENT_BIO_TREE_LOG;
+
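+	/*
+	 * io_pages was primed to the full page count above; if a submission
+	 * fails part way through, the pages that were never submitted are
+	 * subtracted below so the writeback bit can still be cleared.
+	 */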
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+
+ clear_page_dirty_for_io(p);
+ set_page_writeback(p);
+ ret = submit_extent_page(rw, tree, p, offset >> 9,
+ PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+ -1, end_bio_extent_buffer_writepage,
+ 0, epd->bio_flags, bio_flags);
+ epd->bio_flags = bio_flags;
+ if (ret) {
+ set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ SetPageError(p);
+ if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ ret = -EIO;
+ break;
+ }
+ offset += PAGE_CACHE_SIZE;
+ update_nr_written(p, wbc, 1);
+ unlock_page(p);
+ }
+
+ if (unlikely(ret)) {
+ for (; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ unlock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
+ };
+ int ret = 0;
+ int done = 0;
+ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int tag;
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ scanned = 1;
+ }
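+	/*
+	 * For WB_SYNC_ALL the dirty pages are tagged TOWRITE up front (see
+	 * tag_pages_for_writeback() below) so that pages dirtied while we are
+	 * writing are not picked up again and the walk cannot livelock.
+	 */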
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
+ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (!PagePrivate(page))
+ continue;
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ break;
+ }
+
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ continue;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+
+			/*
+			 * This shouldn't happen and would normally be a
+			 * BUG_ON, but there is no sense in crashing the
+			 * user's box for something we can survive anyway.
+			 */
+ if (WARN_ON(!eb)) {
+ spin_unlock(&mapping->private_lock);
+ continue;
+ }
+
+ if (eb == prev_eb) {
+ spin_unlock(&mapping->private_lock);
+ continue;
+ }
+
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
+ continue;
+
+ prev_eb = eb;
+ ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+ if (!ret) {
+ free_extent_buffer(eb);
+ continue;
+ }
+
+ ret = write_one_eb(eb, fs_info, wbc, &epd);
+ if (ret) {
+ done = 1;
+ free_extent_buffer(eb);
+ break;
+ }
+ free_extent_buffer(eb);
+
+			/*
+			 * The filesystem may choose to bump up nr_to_write.
+			 * We have to make sure to honor the new nr_to_write
+			 * at all times.
+			 */
+ nr_to_write_done = wbc->nr_to_write <= 0;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ flush_write_bio(&epd);
+ return ret;
+}
+
/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
@@ -2427,14 +3752,29 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
writepage_t writepage, void *data,
void (*flush_fn)(void *))
{
+ struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
+ int err = 0;
int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
+ int tag;
+
+	/*
+	 * We have to hold onto the inode so that ordered extents can do their
+	 * work when the IO finishes. The alternative is failing to add an
+	 * ordered extent if the igrab() fails at that later point, which is a
+	 * huge pain to deal with, so instead just hold onto the inode
+	 * throughout the writepages operation. If the igrab() fails here the
+	 * inode is being freed anyway, and we'd rather not waste time writing
+	 * out data that is about to be truncated.
+	 */
+ if (!igrab(inode))
+ return 0;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
@@ -2445,11 +3785,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
end = wbc->range_end >> PAGE_CACHE_SHIFT;
scanned = 1;
}
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
while (!done && !nr_to_write_done && (index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY, min(end - index,
- (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
unsigned i;
scanned = 1;
@@ -2463,10 +3808,10 @@ retry:
* swizzled back from swapper_space to tmpfs file
* mapping
*/
- if (tree->ops && tree->ops->write_cache_pages_lock_hook)
- tree->ops->write_cache_pages_lock_hook(page);
- else
+ if (!trylock_page(page)) {
+ flush_fn(data);
lock_page(page);
+ }
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
@@ -2497,8 +3842,8 @@ retry:
unlock_page(page);
ret = 0;
}
- if (ret)
- done = 1;
+ if (!err && ret < 0)
+ err = ret;
/*
* the filesystem may choose to bump up nr_to_write.
@@ -2510,7 +3855,7 @@ retry:
pagevec_release(&pvec);
cond_resched();
}
- if (!scanned && !done) {
+ if (!scanned && !done && !err) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
@@ -2519,16 +3864,21 @@ retry:
index = 0;
goto retry;
}
- return ret;
+ btrfs_add_delayed_iput(inode);
+ return err;
}
static void flush_epd_write_bio(struct extent_page_data *epd)
{
if (epd->bio) {
+ int rw = WRITE;
+ int ret;
+
if (epd->sync_io)
- submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
- else
- submit_one_bio(WRITE, epd->bio, 0, 0);
+ rw = WRITE_SYNC;
+
+ ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
+ BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
}
@@ -2544,26 +3894,17 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
struct writeback_control *wbc)
{
int ret;
- struct address_space *mapping = page->mapping;
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
- };
- struct writeback_control wbc_writepages = {
- .sync_mode = wbc->sync_mode,
- .older_than_this = NULL,
- .nr_to_write = 64,
- .range_start = page_offset(page) + PAGE_CACHE_SIZE,
- .range_end = (loff_t)-1,
+ .bio_flags = 0,
};
ret = __extent_writepage(page, wbc, &epd);
- extent_write_cache_pages(tree, mapping, &wbc_writepages,
- __extent_writepage, &epd, flush_write_bio);
flush_epd_write_bio(&epd);
return ret;
}
@@ -2584,10 +3925,10 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
.get_extent = get_extent,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
struct writeback_control wbc_writepages = {
.sync_mode = mode,
- .older_than_this = NULL,
.nr_to_write = nr_pages * 2,
.range_start = start,
.range_end = end + 1,
@@ -2624,6 +3965,7 @@ int extent_writepages(struct extent_io_tree *tree,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -2641,22 +3983,39 @@ int extent_readpages(struct extent_io_tree *tree,
struct bio *bio = NULL;
unsigned page_idx;
unsigned long bio_flags = 0;
+ struct page *pagepool[16];
+ struct page *page;
+ struct extent_map *em_cached = NULL;
+ int nr = 0;
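+	/*
+	 * Readahead pages are collected into pagepool and handed to
+	 * __extent_readpages() in batches, with em_cached letting the batch
+	 * reuse a previously looked-up extent mapping.
+	 */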
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_entry(pages->prev, struct page, lru);
+ page = list_entry(pages->prev, struct page, lru);
prefetchw(&page->flags);
list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping,
- page->index, GFP_KERNEL)) {
- __extent_read_full_page(tree, page, get_extent,
- &bio, 0, &bio_flags);
+ if (add_to_page_cache_lru(page, mapping,
+ page->index, GFP_NOFS)) {
+ page_cache_release(page);
+ continue;
}
- page_cache_release(page);
+
+ pagepool[nr++] = page;
+ if (nr < ARRAY_SIZE(pagepool))
+ continue;
+ __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+ &bio, 0, &bio_flags, READ);
+ nr = 0;
}
+ if (nr)
+ __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+ &bio, 0, &bio_flags, READ);
+
+ if (em_cached)
+ free_extent_map(em_cached);
+
BUG_ON(!list_empty(pages));
if (bio)
- submit_one_bio(READ, bio, 0, bio_flags);
+ return submit_one_bio(READ, bio, 0, bio_flags);
return 0;
}
@@ -2669,15 +4028,15 @@ int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset)
{
struct extent_state *cached_state = NULL;
- u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
- start += (offset + blocksize - 1) & ~(blocksize - 1);
+ start += ALIGN(offset, blocksize);
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
+ lock_extent_bits(tree, start, end, 0, &cached_state);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -2687,134 +4046,15 @@ int extent_invalidatepage(struct extent_io_tree *tree,
}
/*
- * simple commit_write call, set_range_dirty is used to mark both
- * the pages and the extent records as dirty
- */
-int extent_commit_write(struct extent_io_tree *tree,
- struct inode *inode, struct page *page,
- unsigned from, unsigned to)
-{
- loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
- set_page_extent_mapped(page);
- set_page_dirty(page);
-
- if (pos > inode->i_size) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- return 0;
-}
-
-int extent_prepare_write(struct extent_io_tree *tree,
- struct inode *inode, struct page *page,
- unsigned from, unsigned to, get_extent_t *get_extent)
-{
- u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
- u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
- u64 block_start;
- u64 orig_block_start;
- u64 block_end;
- u64 cur_end;
- struct extent_map *em;
- unsigned blocksize = 1 << inode->i_blkbits;
- size_t page_offset = 0;
- size_t block_off_start;
- size_t block_off_end;
- int err = 0;
- int iocount = 0;
- int ret = 0;
- int isnew;
-
- set_page_extent_mapped(page);
-
- block_start = (page_start + from) & ~((u64)blocksize - 1);
- block_end = (page_start + to - 1) | (blocksize - 1);
- orig_block_start = block_start;
-
- lock_extent(tree, page_start, page_end, GFP_NOFS);
- while (block_start <= block_end) {
- em = get_extent(inode, page, page_offset, block_start,
- block_end - block_start + 1, 1);
- if (IS_ERR(em) || !em)
- goto err;
-
- cur_end = min(block_end, extent_map_end(em) - 1);
- block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
- block_off_end = block_off_start + blocksize;
- isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
-
- if (!PageUptodate(page) && isnew &&
- (block_off_end > to || block_off_start < from)) {
- void *kaddr;
-
- kaddr = kmap_atomic(page, KM_USER0);
- if (block_off_end > to)
- memset(kaddr + to, 0, block_off_end - to);
- if (block_off_start < from)
- memset(kaddr + block_off_start, 0,
- from - block_off_start);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- }
- if ((em->block_start != EXTENT_MAP_HOLE &&
- em->block_start != EXTENT_MAP_INLINE) &&
- !isnew && !PageUptodate(page) &&
- (block_off_end > to || block_off_start < from) &&
- !test_range_bit(tree, block_start, cur_end,
- EXTENT_UPTODATE, 1, NULL)) {
- u64 sector;
- u64 extent_offset = block_start - em->start;
- size_t iosize;
- sector = (em->block_start + extent_offset) >> 9;
- iosize = (cur_end - block_start + blocksize) &
- ~((u64)blocksize - 1);
- /*
- * we've already got the extent locked, but we
- * need to split the state such that our end_bio
- * handler can clear the lock.
- */
- set_extent_bit(tree, block_start,
- block_start + iosize - 1,
- EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
- ret = submit_extent_page(READ, tree, page,
- sector, iosize, page_offset, em->bdev,
- NULL, 1,
- end_bio_extent_preparewrite, 0,
- 0, 0);
- if (ret && !err)
- err = ret;
- iocount++;
- block_start = block_start + iosize;
- } else {
- set_extent_uptodate(tree, block_start, cur_end,
- GFP_NOFS);
- unlock_extent(tree, block_start, cur_end, GFP_NOFS);
- block_start = cur_end + 1;
- }
- page_offset = block_start & (PAGE_CACHE_SIZE - 1);
- free_extent_map(em);
- }
- if (iocount) {
- wait_extent_bit(tree, orig_block_start,
- block_end, EXTENT_LOCKED);
- }
- check_page_uptodate(tree, page);
-err:
- /* FIXME, zero out newly allocated blocks on error */
- return err;
-}
-
-/*
* a helper for releasepage, this tests for areas of the page that
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
-int try_release_extent_state(struct extent_map_tree *map,
- struct extent_io_tree *tree, struct page *page,
- gfp_t mask)
+static int try_release_extent_state(struct extent_map_tree *map,
+ struct extent_io_tree *tree,
+ struct page *page, gfp_t mask)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
int ret = 1;
@@ -2853,7 +4093,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
gfp_t mask)
{
struct extent_map *em;
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
if ((mask & __GFP_WAIT) &&
@@ -2863,7 +4103,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
- if (!em || IS_ERR(em)) {
+ if (!em) {
write_unlock(&map->lock);
break;
}
@@ -2891,33 +4131,6 @@ int try_release_extent_mapping(struct extent_map_tree *map,
return try_release_extent_state(map, tree, page, mask);
}
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
- get_extent_t *get_extent)
-{
- struct inode *inode = mapping->host;
- struct extent_state *cached_state = NULL;
- u64 start = iblock << inode->i_blkbits;
- sector_t sector = 0;
- size_t blksize = (1 << inode->i_blkbits);
- struct extent_map *em;
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
- 0, &cached_state, GFP_NOFS);
- em = get_extent(inode, NULL, 0, start, blksize, 0);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
- start + blksize - 1, &cached_state, GFP_NOFS);
- if (!em || IS_ERR(em))
- return 0;
-
- if (em->block_start > EXTENT_MAP_LAST_BYTE)
- goto out;
-
- sector = (em->block_start + start - em->start) >> inode->i_blkbits;
-out:
- free_extent_map(em);
- return sector;
-}
-
/*
* helper function for fiemap, which doesn't want to see any holes.
* This maps until we find something past 'last'
@@ -2934,13 +4147,13 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (offset >= last)
return NULL;
- while(1) {
+ while (1) {
len = last - offset;
if (len == 0)
break;
- len = (len + sectorsize - 1) & ~(sectorsize - 1);
+ len = ALIGN(len, sectorsize);
em = get_extent(inode, NULL, 0, offset, len, 0);
- if (!em || IS_ERR(em))
+ if (IS_ERR_OR_NULL(em))
return em;
/* if this isn't a hole return it */
@@ -2958,6 +4171,19 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
return NULL;
}
+static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
+{
+ unsigned long cnt = *((unsigned long *)ctx);
+
+ cnt++;
+ *((unsigned long *)ctx) = cnt;
+
+ /* Now we're sure that the extent is shared. */
+ if (cnt > 1)
+ return 1;
+ return 0;
+}
+
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent)
{
@@ -2974,12 +4200,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_file_extent_item *item;
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
u64 em_end = 0;
- unsigned long emflags;
if (len == 0)
return -EINVAL;
@@ -2989,25 +4213,26 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
+ start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
+ len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
+
/*
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
- path, inode->i_ino, -1, 0);
+ path, btrfs_ino(inode), -1, 0);
if (ret < 0) {
btrfs_free_path(path);
return ret;
}
WARN_ON(!ret);
path->slots[0]--;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
found_type = btrfs_key_type(&found_key);
/* No extents, but there might be delalloc bits */
- if (found_key.objectid != inode->i_ino ||
+ if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
@@ -3021,7 +4246,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last = found_key.offset;
last_for_get_extent = last + 1;
}
- btrfs_free_path(path);
+ btrfs_release_path(path);
/*
* we might have some extents allocated but more delalloc past those
@@ -3033,10 +4258,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
- &cached_state, GFP_NOFS);
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
+ &cached_state);
- em = get_extent_skip_holes(inode, off, last_for_get_extent,
+ em = get_extent_skip_holes(inode, start, last_for_get_extent,
get_extent);
if (!em)
goto out;
@@ -3046,7 +4271,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
while (!end) {
- u64 offset_in_extent;
+ u64 offset_in_extent = 0;
/* break if the extent we found is outside the range */
if (em->start >= max || extent_map_end(em) < off)
@@ -3062,12 +4287,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/*
* record the offset from the start of the extent
- * for adjusting the disk offset below
+		 * for adjusting the disk offset below. Only do this if the
+		 * extent isn't compressed, since our in-ram offset may be past
+		 * what we have actually allocated on disk.
*/
- offset_in_extent = em_start - em->start;
+ if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ offset_in_extent = em_start - em->start;
em_end = extent_map_end(em);
em_len = em_end - em_start;
- emflags = em->flags;
disko = 0;
flags = 0;
@@ -3088,7 +4315,24 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
flags |= (FIEMAP_EXTENT_DELALLOC |
FIEMAP_EXTENT_UNKNOWN);
} else {
+ unsigned long ref_cnt = 0;
+
disko = em->block_start + offset_in_extent;
+
+			/*
+			 * As btrfs supports shared space, this information
+			 * can be exported to userspace tools via the
+			 * FIEMAP_EXTENT_SHARED flag.
+			 */
+ ret = iterate_inodes_from_logical(
+ em->block_start,
+ BTRFS_I(inode)->root->fs_info,
+ path, count_ext_ref, &ref_cnt);
+ if (ret < 0 && ret != -ENOENT)
+ goto out_free;
+
+ if (ref_cnt > 1)
+ flags |= FIEMAP_EXTENT_SHARED;
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
@@ -3120,79 +4364,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
out_free:
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+ btrfs_free_path(path);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state, GFP_NOFS);
return ret;
}
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
- unsigned long i)
-{
- struct page *p;
- struct address_space *mapping;
-
- if (i == 0)
- return eb->first_page;
- i += eb->start >> PAGE_CACHE_SHIFT;
- mapping = eb->first_page->mapping;
- if (!mapping)
- return NULL;
-
- /*
- * extent_buffer_page is only called after pinning the page
- * by increasing the reference count. So we know the page must
- * be in the radix tree.
- */
- rcu_read_lock();
- p = radix_tree_lookup(&mapping->page_tree, i);
- rcu_read_unlock();
-
- return p;
-}
-
-static inline unsigned long num_extent_pages(u64 start, u64 len)
-{
- return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
- (start >> PAGE_CACHE_SHIFT);
-}
-
-static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start,
- unsigned long len,
- gfp_t mask)
+static void __free_extent_buffer(struct extent_buffer *eb)
{
- struct extent_buffer *eb = NULL;
-#if LEAK_DEBUG
- unsigned long flags;
-#endif
-
- eb = kmem_cache_zalloc(extent_buffer_cache, mask);
- if (eb == NULL)
- return NULL;
- eb->start = start;
- eb->len = len;
- spin_lock_init(&eb->lock);
- init_waitqueue_head(&eb->lock_wq);
-
-#if LEAK_DEBUG
- spin_lock_irqsave(&leak_lock, flags);
- list_add(&eb->leak_list, &buffers);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
- atomic_set(&eb->refs, 1);
-
- return eb;
+ btrfs_leak_debug_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
}
-static void __free_extent_buffer(struct extent_buffer *eb)
+int extent_buffer_under_io(struct extent_buffer *eb)
{
-#if LEAK_DEBUG
- unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
- list_del(&eb->leak_list);
- spin_unlock_irqrestore(&leak_lock, flags);
-#endif
- kmem_cache_free(extent_buffer_cache, eb);
+ return (atomic_read(&eb->io_pages) ||
+ test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+ test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
/*
@@ -3202,20 +4390,50 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
unsigned long start_idx)
{
unsigned long index;
+ unsigned long num_pages;
struct page *page;
+ int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
- if (!eb->first_page)
- return;
+ BUG_ON(extent_buffer_under_io(eb));
- index = num_extent_pages(eb->start, eb->len);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ index = start_idx + num_pages;
if (start_idx >= index)
return;
do {
index--;
page = extent_buffer_page(eb, index);
- if (page)
+ if (page && mapped) {
+ spin_lock(&page->mapping->private_lock);
+ /*
+ * We do this since we'll remove the pages after we've
+ * removed the eb from the radix tree, so we could race
+			 * and have this page now attached to a new eb. So
+ * only clear page_private if it's still connected to
+ * this eb.
+ */
+ if (PagePrivate(page) &&
+ page->private == (unsigned long)eb) {
+ BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(PageDirty(page));
+ BUG_ON(PageWriteback(page));
+ /*
+				 * We need to make sure we haven't been attached
+ * to a new eb.
+ */
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ /* One for the page private */
+ page_cache_release(page);
+ }
+ spin_unlock(&page->mapping->private_lock);
+
+ }
+ if (page) {
+			/* One for when we allocated the page */
page_cache_release(page);
+ }
} while (index != start_idx);
}
@@ -3228,10 +4446,218 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
__free_extent_buffer(eb);
}
-struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- struct page *page0,
- gfp_t mask)
+static struct extent_buffer *
+__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
+ unsigned long len, gfp_t mask)
+{
+ struct extent_buffer *eb = NULL;
+
+ eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ if (eb == NULL)
+ return NULL;
+ eb->start = start;
+ eb->len = len;
+ eb->fs_info = fs_info;
+ eb->bflags = 0;
+ rwlock_init(&eb->lock);
+ atomic_set(&eb->write_locks, 0);
+ atomic_set(&eb->read_locks, 0);
+ atomic_set(&eb->blocking_readers, 0);
+ atomic_set(&eb->blocking_writers, 0);
+ atomic_set(&eb->spinning_readers, 0);
+ atomic_set(&eb->spinning_writers, 0);
+ eb->lock_nested = 0;
+ init_waitqueue_head(&eb->write_lock_wq);
+ init_waitqueue_head(&eb->read_lock_wq);
+
+ btrfs_leak_debug_add(&eb->leak_list, &buffers);
+
+ spin_lock_init(&eb->refs_lock);
+ atomic_set(&eb->refs, 1);
+ atomic_set(&eb->io_pages, 0);
+
+	/*
+	 * Sanity checks: currently the maximum is 64k, covered by 16 4k pages.
+	 */
+ BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
+ > MAX_INLINE_EXTENT_BUFFER_SIZE);
+ BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
+
+ return eb;
+}
+
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+{
+ unsigned long i;
+ struct page *p;
+ struct extent_buffer *new;
+ unsigned long num_pages = num_extent_pages(src->start, src->len);
+
+ new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
+ if (new == NULL)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ p = alloc_page(GFP_NOFS);
+ if (!p) {
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
+ attach_extent_buffer_page(new, p);
+ WARN_ON(PageDirty(p));
+ SetPageUptodate(p);
+ new->pages[i] = p;
+ }
+
+ copy_extent_buffer(new, src, 0, 0, src->len);
+ set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+ set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
+
+ return new;
+}
+
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+{
+ struct extent_buffer *eb;
+ unsigned long num_pages = num_extent_pages(0, len);
+ unsigned long i;
+
+ eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
+ if (!eb)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ eb->pages[i] = alloc_page(GFP_NOFS);
+ if (!eb->pages[i])
+ goto err;
+ }
+ set_extent_buffer_uptodate(eb);
+ btrfs_set_header_nritems(eb, 0);
+ set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+ return eb;
+err:
+ for (; i > 0; i--)
+ __free_page(eb->pages[i - 1]);
+ __free_extent_buffer(eb);
+ return NULL;
+}
+
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+ int refs;
+ /* the ref bit is tricky. We have to make sure it is set
+ * if we have the buffer dirty. Otherwise the
+ * code to free a buffer can end up dropping a dirty
+ * page
+ *
+ * Once the ref bit is set, it won't go away while the
+ * buffer is dirty or in writeback, and it also won't
+ * go away while we have the reference count on the
+ * eb bumped.
+ *
+ * We can't just set the ref bit without bumping the
+ * ref on the eb because free_extent_buffer might
+ * see the ref bit and try to clear it. If this happens
+ * free_extent_buffer might end up dropping our original
+ * ref by mistake and freeing the page before we are able
+ * to add one more ref.
+ *
+ * So bump the ref count first, then set the bit. If someone
+ * beat us to it, drop the ref we added.
+ */
+ refs = atomic_read(&eb->refs);
+ if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ return;
+
+ spin_lock(&eb->refs_lock);
+ if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_inc(&eb->refs);
+ spin_unlock(&eb->refs_lock);
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+ struct page *accessed)
+{
+ unsigned long num_pages, i;
+
+ check_buffer_tree_ref(eb);
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = extent_buffer_page(eb, i);
+ if (p != accessed)
+ mark_page_accessed(p);
+ }
+}
+
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ mark_extent_buffer_accessed(eb, NULL);
+ return eb;
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len)
+{
+ struct extent_buffer *eb, *exists = NULL;
+ int ret;
+
+ eb = find_extent_buffer(fs_info, start);
+ if (eb)
+ return eb;
+ eb = alloc_dummy_extent_buffer(start, len);
+ if (!eb)
+ return NULL;
+ eb->fs_info = fs_info;
+again:
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto free_eb;
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ else
+ goto again;
+ }
+ check_buffer_tree_ref(eb);
+ set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+ /*
+	 * We will free dummy extent buffers if they come into
+ * free_extent_buffer with a ref count of 2, but if we are using this we
+ * want the buffers to stay in memory until we're done with them, so
+ * bump the ref count again.
+ */
+ atomic_inc(&eb->refs);
+ return eb;
+free_eb:
+ btrfs_release_extent_buffer(eb);
+ return exists;
+}
+#endif
+
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len)
{
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
@@ -3239,49 +4665,53 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
struct extent_buffer *eb;
struct extent_buffer *exists = NULL;
struct page *p;
- struct address_space *mapping = tree->mapping;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
int uptodate = 1;
int ret;
- rcu_read_lock();
- eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- mark_page_accessed(eb->first_page);
+ eb = find_extent_buffer(fs_info, start);
+ if (eb)
return eb;
- }
- rcu_read_unlock();
- eb = __alloc_extent_buffer(tree, start, len, mask);
+ eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
if (!eb)
return NULL;
- if (page0) {
- eb->first_page = page0;
- i = 1;
- index++;
- page_cache_get(page0);
- mark_page_accessed(page0);
- set_page_extent_mapped(page0);
- set_page_extent_head(page0, len);
- uptodate = PageUptodate(page0);
- } else {
- i = 0;
- }
- for (; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
- if (!p) {
- WARN_ON(1);
+ for (i = 0; i < num_pages; i++, index++) {
+ p = find_or_create_page(mapping, index, GFP_NOFS);
+ if (!p)
goto free_eb;
+
+ spin_lock(&mapping->private_lock);
+ if (PagePrivate(p)) {
+			/*
+			 * We could have already allocated an eb for this page
+			 * and attached one, so let's see if we can get a ref on
+			 * the existing eb. If we can, we know it's good and we
+			 * can just return that one; otherwise we know we can
+			 * just overwrite page->private.
+			 */
+ */
+ exists = (struct extent_buffer *)p->private;
+ if (atomic_inc_not_zero(&exists->refs)) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
+ page_cache_release(p);
+ mark_extent_buffer_accessed(exists, p);
+ goto free_eb;
+ }
+
+			/*
+			 * Do this so the attach below doesn't complain, and
+			 * drop the ref the old owner had on the page.
+			 */
+ ClearPagePrivate(p);
+ WARN_ON(PageDirty(p));
+ page_cache_release(p);
}
- set_page_extent_mapped(p);
- mark_page_accessed(p);
- if (i == 0) {
- eb->first_page = p;
- set_page_extent_head(p, len);
- } else {
- set_page_private(p, EXTENT_PAGE_PRIVATE);
- }
+ attach_extent_buffer_page(eb, p);
+ spin_unlock(&mapping->private_lock);
+ WARN_ON(PageDirty(p));
+ eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -3289,31 +4719,29 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* see below about how we avoid a nasty race with release page
* and why we unlock later
*/
- if (i != 0)
- unlock_page(p);
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
+again:
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto free_eb;
- spin_lock(&tree->buffer_lock);
- ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
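+	/*
+	 * -EEXIST means another thread inserted an eb for this start first;
+	 * grab a ref on that one if it is still live, otherwise it has
+	 * already been released and we retry our own insert.
+	 */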
if (ret == -EEXIST) {
- exists = radix_tree_lookup(&tree->buffer,
- start >> PAGE_CACHE_SHIFT);
- /* add one reference for the caller */
- atomic_inc(&exists->refs);
- spin_unlock(&tree->buffer_lock);
- radix_tree_preload_end();
- goto free_eb;
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ else
+ goto again;
}
/* add one reference for the tree */
- atomic_inc(&eb->refs);
- spin_unlock(&tree->buffer_lock);
- radix_tree_preload_end();
+ check_buffer_tree_ref(eb);
+ set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
/*
* there is a race where release page may have
@@ -3324,53 +4752,111 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* after the extent buffer is in the radix tree so
* it doesn't get lost
*/
- set_page_extent_mapped(eb->first_page);
- set_page_extent_head(eb->first_page, eb->len);
- if (!page0)
- unlock_page(eb->first_page);
+ SetPageChecked(eb->pages[0]);
+ for (i = 1; i < num_pages; i++) {
+ p = extent_buffer_page(eb, i);
+ ClearPageChecked(p);
+ unlock_page(p);
+ }
+ unlock_page(eb->pages[0]);
return eb;
free_eb:
- if (eb->first_page && !page0)
- unlock_page(eb->first_page);
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i])
+ unlock_page(eb->pages[i]);
+ }
- if (!atomic_dec_and_test(&eb->refs))
- return exists;
+ WARN_ON(!atomic_dec_and_test(&eb->refs));
btrfs_release_extent_buffer(eb);
return exists;
}
-struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- gfp_t mask)
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
- struct extent_buffer *eb;
+ struct extent_buffer *eb =
+ container_of(head, struct extent_buffer, rcu_head);
- rcu_read_lock();
- eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- mark_page_accessed(eb->first_page);
- return eb;
+ __free_extent_buffer(eb);
+}
+
+/* Expects to have eb->refs_lock already held */
+static int release_extent_buffer(struct extent_buffer *eb)
+{
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ if (atomic_dec_and_test(&eb->refs)) {
+ if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ spin_unlock(&eb->refs_lock);
+
+ spin_lock(&fs_info->buffer_lock);
+ radix_tree_delete(&fs_info->buffer_radix,
+ eb->start >> PAGE_CACHE_SHIFT);
+ spin_unlock(&fs_info->buffer_lock);
+ } else {
+ spin_unlock(&eb->refs_lock);
+ }
+
+ /* Should be safe to release our pages at this point */
+ btrfs_release_extent_buffer_page(eb, 0);
+ call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+ return 1;
}
- rcu_read_unlock();
+ spin_unlock(&eb->refs_lock);
- return NULL;
+ return 0;
}
void free_extent_buffer(struct extent_buffer *eb)
{
+ int refs;
+ int old;
if (!eb)
return;
- if (!atomic_dec_and_test(&eb->refs))
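+	/*
+	 * Lockless fast path: with more than three refs the refs == 2 special
+	 * cases below cannot trigger, so just drop a single ref via cmpxchg;
+	 * lower counts fall through and are handled under refs_lock.
+	 */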
+ while (1) {
+ refs = atomic_read(&eb->refs);
+ if (refs <= 3)
+ break;
+ old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
+ if (old == refs)
+ return;
+ }
+
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+ !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ /*
+ * I know this is terrible, but it's temporary until we stop tracking
+ * the uptodate bits and such for the extent buffers.
+ */
+ release_extent_buffer(eb);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+ if (!eb)
return;
- WARN_ON(1);
+ spin_lock(&eb->refs_lock);
+ set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+ if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+ release_extent_buffer(eb);
}
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
unsigned long i;
unsigned long num_pages;
@@ -3386,10 +4872,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
lock_page(page);
WARN_ON(!PagePrivate(page));
- set_page_extent_mapped(page);
- if (i == 0)
- set_page_extent_head(page, eb->len);
-
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
if (!PageDirty(page)) {
@@ -3398,45 +4880,39 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&page->mapping->tree_lock);
+ ClearPageError(page);
unlock_page(page);
}
- return 0;
-}
-
-int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
- struct extent_buffer *eb)
-{
- return wait_on_extent_writeback(tree, eb->start,
- eb->start + eb->len - 1);
+ WARN_ON(atomic_read(&eb->refs) == 0);
}
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+int set_extent_buffer_dirty(struct extent_buffer *eb)
{
unsigned long i;
unsigned long num_pages;
int was_dirty = 0;
+ check_buffer_tree_ref(eb);
+
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
num_pages = num_extent_pages(eb->start, eb->len);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
for (i = 0; i < num_pages; i++)
- __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+ set_page_dirty(extent_buffer_page(eb, i));
return was_dirty;
}
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state **cached_state)
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
unsigned long num_pages;
- num_pages = num_extent_pages(eb->start, eb->len);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
- clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- cached_state, GFP_NOFS);
+ num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if (page)
@@ -3445,88 +4921,28 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
return 0;
}
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb)
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
unsigned long num_pages;
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
-
- set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- GFP_NOFS);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
- ((i == num_pages - 1) &&
- ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
- check_page_uptodate(tree, page);
- continue;
- }
SetPageUptodate(page);
}
return 0;
}
-int extent_range_uptodate(struct extent_io_tree *tree,
- u64 start, u64 end)
+int extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct page *page;
- int ret;
- int pg_uptodate = 1;
- int uptodate;
- unsigned long index;
-
- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
- if (ret)
- return 1;
- while (start <= end) {
- index = start >> PAGE_CACHE_SHIFT;
- page = find_get_page(tree->mapping, index);
- uptodate = PageUptodate(page);
- page_cache_release(page);
- if (!uptodate) {
- pg_uptodate = 0;
- break;
- }
- start += PAGE_CACHE_SIZE;
- }
- return pg_uptodate;
-}
-
-int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state *cached_state)
-{
- int ret = 0;
- unsigned long num_pages;
- unsigned long i;
- struct page *page;
- int pg_uptodate = 1;
-
- if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- return 1;
-
- ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, cached_state);
- if (ret)
- return ret;
-
- num_pages = num_extent_pages(eb->start, eb->len);
- for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
- if (!PageUptodate(page)) {
- pg_uptodate = 0;
- break;
- }
- }
- return pg_uptodate;
+ return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- u64 start, int wait,
+ struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num)
{
unsigned long i;
@@ -3536,19 +4952,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
int ret = 0;
int locked_pages = 0;
int all_uptodate = 1;
- int inc_all_pages = 0;
unsigned long num_pages;
+ unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
- if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1, NULL)) {
- return 0;
- }
-
if (start) {
WARN_ON(start < eb->start);
start_i = (start >> PAGE_CACHE_SHIFT) -
@@ -3560,15 +4971,17 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if (!wait) {
+ if (wait == WAIT_NONE) {
if (!trylock_page(page))
goto unlock_exit;
} else {
lock_page(page);
}
locked_pages++;
- if (!PageUptodate(page))
+ if (!PageUptodate(page)) {
+ num_reads++;
all_uptodate = 0;
+ }
}
if (all_uptodate) {
if (start_i == 0)
@@ -3576,24 +4989,17 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
goto unlock_exit;
}
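+	/*
+	 * Only pages that are not already uptodate get read; io_pages is
+	 * primed with that count so the read completion code can tell when
+	 * the whole eb has finished.
+	 */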
+ clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ eb->read_mirror = 0;
+ atomic_set(&eb->io_pages, num_reads);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
-
- WARN_ON(!PagePrivate(page));
-
- set_page_extent_mapped(page);
- if (i == 0)
- set_page_extent_head(page, eb->len);
-
- if (inc_all_pages)
- page_cache_get(page);
if (!PageUptodate(page)) {
- if (start_i == 0)
- inc_all_pages = 1;
ClearPageError(page);
err = __extent_read_full_page(tree, page,
get_extent, &bio,
- mirror_num, &bio_flags);
+ mirror_num, &bio_flags,
+ READ | REQ_META);
if (err)
ret = err;
} else {
@@ -3601,10 +5007,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
}
}
- if (bio)
- submit_one_bio(READ, bio, mirror_num, bio_flags);
+ if (bio) {
+ err = submit_one_bio(READ | REQ_META, bio, mirror_num,
+ bio_flags);
+ if (err)
+ return err;
+ }
- if (ret || !wait)
+ if (ret || wait != WAIT_COMPLETE)
return ret;
for (i = start_i; i < num_pages; i++) {
@@ -3614,8 +5024,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ret = -EIO;
}
- if (!ret)
- set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
return ret;
unlock_exit:
@@ -3644,15 +5052,14 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
page = extent_buffer_page(eb, i);
cur = min(len, (PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER1);
+ kaddr = page_address(page);
memcpy(dst, kaddr + offset, cur);
- kunmap_atomic(kaddr, KM_USER1);
dst += cur;
len -= cur;
@@ -3661,10 +5068,47 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
}
}
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char __user *dst = (char __user *)dstv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ int ret = 0;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+ kaddr = page_address(page);
+ if (copy_to_user(dst, kaddr + offset, cur)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+
+ return ret;
+}
+
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
- unsigned long min_len, char **token, char **map,
+ unsigned long min_len, char **map,
unsigned long *map_start,
- unsigned long *map_len, int km)
+ unsigned long *map_len)
{
size_t offset = start & (PAGE_CACHE_SIZE - 1);
char *kaddr;
@@ -3686,49 +5130,19 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
}
if (start + min_len > eb->len) {
- printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
- "wanted %lu %lu\n", (unsigned long long)eb->start,
- eb->len, start, min_len);
- WARN_ON(1);
+ WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+ "wanted %lu %lu\n",
+ eb->start, eb->len, start, min_len);
+ return -EINVAL;
}
p = extent_buffer_page(eb, i);
- kaddr = kmap_atomic(p, km);
- *token = kaddr;
+ kaddr = page_address(p);
*map = kaddr + offset;
*map_len = PAGE_CACHE_SIZE - offset;
return 0;
}
-int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
- unsigned long min_len,
- char **token, char **map,
- unsigned long *map_start,
- unsigned long *map_len, int km)
-{
- int err;
- int save = 0;
- if (eb->map_token) {
- unmap_extent_buffer(eb, eb->map_token, km);
- eb->map_token = NULL;
- save = 1;
- }
- err = map_private_extent_buffer(eb, start, min_len, token, map,
- map_start, map_len, km);
- if (!err && save) {
- eb->map_token = *token;
- eb->kaddr = *map;
- eb->map_start = *map_start;
- eb->map_len = *map_len;
- }
- return err;
-}
-
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
-{
- kunmap_atomic(token, km);
-}
-
int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
unsigned long start,
unsigned long len)
@@ -3745,16 +5159,15 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
page = extent_buffer_page(eb, i);
cur = min(len, (PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = page_address(page);
ret = memcmp(ptr, kaddr + offset, cur);
- kunmap_atomic(kaddr, KM_USER0);
if (ret)
break;
@@ -3780,16 +5193,15 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
page = extent_buffer_page(eb, i);
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
- kaddr = kmap_atomic(page, KM_USER1);
+ kaddr = page_address(page);
memcpy(kaddr + offset, src, cur);
- kunmap_atomic(kaddr, KM_USER1);
src += cur;
len -= cur;
@@ -3811,16 +5223,15 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
page = extent_buffer_page(eb, i);
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = page_address(page);
memset(kaddr + offset, c, cur);
- kunmap_atomic(kaddr, KM_USER0);
len -= cur;
offset = 0;
@@ -3843,7 +5254,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
WARN_ON(src->len != dst_len);
offset = (start_offset + dst_offset) &
- ((unsigned long)PAGE_CACHE_SIZE - 1);
+ (PAGE_CACHE_SIZE - 1);
while (len > 0) {
page = extent_buffer_page(dst, i);
@@ -3851,9 +5262,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = page_address(page);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
- kunmap_atomic(kaddr, KM_USER0);
src_offset += cur;
len -= cur;
@@ -3862,42 +5272,32 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
}
}
-static void move_pages(struct page *dst_page, struct page *src_page,
- unsigned long dst_off, unsigned long src_off,
- unsigned long len)
+static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
- char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
- if (dst_page == src_page) {
- memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
- } else {
- char *src_kaddr = kmap_atomic(src_page, KM_USER1);
- char *p = dst_kaddr + dst_off + len;
- char *s = src_kaddr + src_off + len;
-
- while (len--)
- *--p = *--s;
-
- kunmap_atomic(src_kaddr, KM_USER1);
- }
- kunmap_atomic(dst_kaddr, KM_USER0);
+ unsigned long distance = (src > dst) ? src - dst : dst - src;
+ return distance < len;
}
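/*
 * Within a single page the source and destination ranges may overlap;
 * memcpy() on overlapping ranges is undefined, so copy_pages() falls back
 * to memmove() when areas_overlap() says they do.
 */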
static void copy_pages(struct page *dst_page, struct page *src_page,
unsigned long dst_off, unsigned long src_off,
unsigned long len)
{
- char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+ char *dst_kaddr = page_address(dst_page);
char *src_kaddr;
+ int must_memmove = 0;
- if (dst_page != src_page)
- src_kaddr = kmap_atomic(src_page, KM_USER1);
- else
+ if (dst_page != src_page) {
+ src_kaddr = page_address(src_page);
+ } else {
src_kaddr = dst_kaddr;
+ if (areas_overlap(src_off, dst_off, len))
+ must_memmove = 1;
+ }
- memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
- kunmap_atomic(dst_kaddr, KM_USER0);
- if (dst_page != src_page)
- kunmap_atomic(src_kaddr, KM_USER1);
+ if (must_memmove)
+ memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ else
+ memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
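
The areas_overlap()/copy_pages() pair above switches to memmove() only when the source and destination ranges inside one page actually intersect, and keeps the cheaper memcpy() otherwise. A minimal user-space sketch of the same decision, assuming a tiny 16-byte "page" purely for illustration:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
        unsigned long distance = (src > dst) ? src - dst : dst - src;
        return distance < len;
}

int main(void)
{
        char page[16] = "abcdefghijklmno";

        /* dst 2, src 0, len 4: the ranges overlap, so memcpy would corrupt data */
        if (areas_overlap(0, 2, 4))
                memmove(page + 2, page + 0, 4);
        else
                memcpy(page + 2, page + 0, 4);

        printf("%s\n", page);   /* prints "ababcdghijklmno" */
        return 0;
}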
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
@@ -3911,21 +5311,21 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+ printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
"len %lu dst len %lu\n", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+ printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
"len %lu dst len %lu\n", dst_offset, len, dst->len);
BUG_ON(1);
}
while (len > 0) {
dst_off_in_page = (start_offset + dst_offset) &
- ((unsigned long)PAGE_CACHE_SIZE - 1);
+ (PAGE_CACHE_SIZE - 1);
src_off_in_page = (start_offset + src_offset) &
- ((unsigned long)PAGE_CACHE_SIZE - 1);
+ (PAGE_CACHE_SIZE - 1);
dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
@@ -3958,12 +5358,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+ printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
"len %lu len %lu\n", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+ printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
"len %lu len %lu\n", dst_offset, len, dst->len);
BUG_ON(1);
}
@@ -3976,13 +5376,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
dst_off_in_page = (start_offset + dst_end) &
- ((unsigned long)PAGE_CACHE_SIZE - 1);
+ (PAGE_CACHE_SIZE - 1);
src_off_in_page = (start_offset + src_end) &
- ((unsigned long)PAGE_CACHE_SIZE - 1);
+ (PAGE_CACHE_SIZE - 1);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
- move_pages(extent_buffer_page(dst, dst_i),
+ copy_pages(extent_buffer_page(dst, dst_i),
extent_buffer_page(dst, src_i),
dst_off_in_page - cur + 1,
src_off_in_page - cur + 1, cur);
@@ -3993,47 +5393,44 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
}
-static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+int try_release_extent_buffer(struct page *page)
{
- struct extent_buffer *eb =
- container_of(head, struct extent_buffer, rcu_head);
-
- btrfs_release_extent_buffer(eb);
-}
-
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
-{
- u64 start = page_offset(page);
struct extent_buffer *eb;
- int ret = 1;
- spin_lock(&tree->buffer_lock);
- eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- if (!eb) {
- spin_unlock(&tree->buffer_lock);
- return ret;
+ /*
+	 * We need to make sure nobody is attaching this page to an eb right
+ * now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ return 1;
}
- if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
- ret = 0;
- goto out;
- }
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
/*
- * set @eb->refs to 0 if it is already 1, and then release the @eb.
- * Or go back.
+ * This is a little awful but should be ok, we need to make sure that
+ * the eb doesn't disappear out from under us while we're looking at
+ * this page.
*/
- if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
- ret = 0;
- goto out;
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return 0;
}
+ spin_unlock(&page->mapping->private_lock);
- radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-out:
- spin_unlock(&tree->buffer_lock);
+ /*
+ * If tree ref isn't set then we know the ref on this eb is a real ref,
+ * so just return, this page will likely be freed soon anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ return 0;
+ }
- /* at this point we can safely release the extent buffer */
- if (atomic_read(&eb->refs) == 0)
- call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
- return ret;
+ return release_extent_buffer(eb);
}
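
For orientation, the rewritten release path above only tears the buffer down when the tree holds the last reference and no I/O is in flight on it. A rough user-space analogue of that final check (the names and struct are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct eb_like {
        atomic_int refs;
        bool under_io;
};

static bool try_release(struct eb_like *eb)
{
        /* mirror of the test in try_release_extent_buffer(): give up if anyone
         * else still holds a reference or I/O is still in flight */
        if (atomic_load(&eb->refs) != 1 || eb->under_io)
                return false;
        atomic_fetch_sub(&eb->refs, 1);
        return true;
}

int main(void)
{
        struct eb_like eb = { .under_io = false };

        atomic_init(&eb.refs, 1);
        printf("released: %d\n", try_release(&eb));     /* released: 1 */
        return 0;
}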
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9318dfefd59..ccc264e7bde 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,9 @@
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_FIRST_DELALLOC (1 << 12)
+#define EXTENT_NEED_WAIT (1 << 13)
+#define EXTENT_DAMAGED (1 << 14)
+#define EXTENT_NORESERVE (1 << 15)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -25,22 +28,29 @@
* type for this bio
*/
#define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
+#define EXTENT_BIO_PARENT_LOCKED 4
#define EXTENT_BIO_FLAG_SHIFT 16
/* these are bit numbers for test/set bit */
#define EXTENT_BUFFER_UPTODATE 0
#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
+#define EXTENT_BUFFER_CORRUPT 3
+#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
+#define EXTENT_BUFFER_TREE_REF 5
+#define EXTENT_BUFFER_STALE 6
+#define EXTENT_BUFFER_WRITEBACK 7
+#define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_DUMMY 9
+#define EXTENT_BUFFER_IN_TREE 10
/* these are flags for extent_clear_unlock_delalloc */
-#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
-#define EXTENT_CLEAR_UNLOCK 0x2
-#define EXTENT_CLEAR_DELALLOC 0x4
-#define EXTENT_CLEAR_DIRTY 0x8
-#define EXTENT_SET_WRITEBACK 0x10
-#define EXTENT_END_WRITEBACK 0x20
-#define EXTENT_SET_PRIVATE2 0x40
-#define EXTENT_CLEAR_ACCOUNTING 0x80
+#define PAGE_UNLOCK (1 << 0)
+#define PAGE_CLEAR_DIRTY (1 << 1)
+#define PAGE_SET_WRITEBACK (1 << 2)
+#define PAGE_END_WRITEBACK (1 << 3)
+#define PAGE_SET_PRIVATE2 (1 << 4)
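
The old EXTENT_CLEAR_* constants are replaced here by a smaller page_ops bitmask. A trivial standalone demo of how a caller would compose and test such a mask (local copies of the defines, nothing kernel-specific):

#include <stdio.h>

#define PAGE_UNLOCK             (1 << 0)
#define PAGE_CLEAR_DIRTY        (1 << 1)
#define PAGE_SET_WRITEBACK      (1 << 2)
#define PAGE_END_WRITEBACK      (1 << 3)
#define PAGE_SET_PRIVATE2       (1 << 4)

int main(void)
{
        unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK;

        if (page_ops & PAGE_CLEAR_DIRTY)
                printf("would clear the dirty bit on each page\n");
        if (!(page_ops & PAGE_END_WRITEBACK))
                printf("writeback is being started here, not ended\n");
        return 0;
}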
/*
* page->private values. Every page that is controlled by the extent
@@ -50,6 +60,8 @@
#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
struct extent_state;
+struct btrfs_root;
+struct btrfs_io_bio;
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
@@ -61,39 +73,32 @@ struct extent_io_ops {
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
extent_submit_bio_hook_t *submit_bio_hook;
- int (*merge_bio_hook)(struct page *page, unsigned long offset,
+ int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags);
- int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
- int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end,
- struct extent_state *state);
- int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end,
- struct extent_state *state);
- int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
- struct extent_state *state);
+ int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
+ int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror);
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
- int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
- int *bits);
- int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
- int *bits);
- int (*merge_extent_hook)(struct inode *inode,
- struct extent_state *new,
- struct extent_state *other);
- int (*split_extent_hook)(struct inode *inode,
- struct extent_state *orig, u64 split);
- int (*write_cache_pages_lock_hook)(struct page *page);
+ void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+ unsigned long *bits);
+ void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+ unsigned long *bits);
+ void (*merge_extent_hook)(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other);
+ void (*split_extent_hook)(struct inode *inode,
+ struct extent_state *orig, u64 split);
};
struct extent_io_tree {
struct rb_root state;
- struct radix_tree_root buffer;
struct address_space *mapping;
u64 dirty_bytes;
+ int track_uptodate;
spinlock_t lock;
- spinlock_t buffer_lock;
struct extent_io_ops *ops;
};
@@ -107,36 +112,56 @@ struct extent_state {
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
- u64 split_start;
- u64 split_end;
/* for use by the FS */
u64 private;
+#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
+#endif
};
+#define INLINE_EXTENT_BUFFER_PAGES 16
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
- char *map_token;
- char *kaddr;
unsigned long map_start;
unsigned long map_len;
- struct page *first_page;
unsigned long bflags;
+ struct btrfs_fs_info *fs_info;
+ spinlock_t refs_lock;
atomic_t refs;
- struct list_head leak_list;
+ atomic_t io_pages;
+ int read_mirror;
struct rcu_head rcu_head;
+ pid_t lock_owner;
- /* the spinlock is used to protect most operations */
- spinlock_t lock;
+ /* count of read lock holders on the extent buffer */
+ atomic_t write_locks;
+ atomic_t read_locks;
+ atomic_t blocking_writers;
+ atomic_t blocking_readers;
+ atomic_t spinning_readers;
+ atomic_t spinning_writers;
+ int lock_nested;
+
+ /* protects write locks */
+ rwlock_t lock;
- /*
- * when we keep the lock held while blocking, waiters go onto
- * the wq
+ /* readers use lock_wq while they wait for the write
+ * lock holders to unlock
*/
- wait_queue_head_t lock_wq;
+ wait_queue_head_t write_lock_wq;
+
+ /* writers use read_lock_wq while they wait for readers
+ * to unlock
+ */
+ wait_queue_head_t read_lock_wq;
+ struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
};
static inline void extent_set_compress_type(unsigned long *bio_flags,
@@ -152,40 +177,29 @@ static inline int extent_compress_type(unsigned long bio_flags)
struct extent_map_tree;
-static inline struct extent_state *extent_state_next(struct extent_state *state)
-{
- struct rb_node *node;
- node = rb_next(&state->rb_node);
- if (!node)
- return NULL;
- return rb_entry(node, struct extent_state, rb_node);
-}
-
typedef struct extent_map *(get_extent_t)(struct inode *inode,
struct page *page,
- size_t page_offset,
+ size_t pg_offset,
u64 start, u64 len,
int create);
void extent_io_tree_init(struct extent_io_tree *tree,
- struct address_space *mapping, gfp_t mask);
+ struct address_space *mapping);
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
-int try_release_extent_state(struct extent_map_tree *map,
- struct extent_io_tree *tree, struct page *page,
- gfp_t mask);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int try_release_extent_buffer(struct page *page);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, struct extent_state **cached, gfp_t mask);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+ unsigned long bits, struct extent_state **cached);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask);
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent);
+ get_extent_t *get_extent, int mirror_num);
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
void extent_io_exit(void);
@@ -195,37 +209,38 @@ u64 count_range_bits(struct extent_io_tree *tree,
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled, struct extent_state *cached_state);
+ unsigned long bits, int filled,
+ struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, gfp_t mask);
+ unsigned long bits, gfp_t mask);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete, struct extent_state **cached,
- gfp_t mask);
+ unsigned long bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, gfp_t mask);
+ unsigned long bits, gfp_t mask);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
+ unsigned long bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+ struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask);
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned long bits, unsigned long clear_bits,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits);
-struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, int bits);
+ u64 *start_ret, u64 *end_ret, unsigned long bits,
+ struct extent_state **cached_state);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
@@ -238,36 +253,44 @@ int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
get_extent_t *get_extent,
struct writeback_control *wbc);
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc);
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages,
get_extent_t get_extent);
-int extent_prepare_write(struct extent_io_tree *tree,
- struct inode *inode, struct page *page,
- unsigned from, unsigned to, get_extent_t *get_extent);
-int extent_commit_write(struct extent_io_tree *tree,
- struct inode *inode, struct page *page,
- unsigned from, unsigned to);
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
- get_extent_t *get_extent);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent);
-int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
-int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
-struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- struct page *page0,
- gfp_t mask);
-struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- gfp_t mask);
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
+#define WAIT_NONE 0
+#define WAIT_COMPLETE 1
+#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num);
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+ return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+ (start >> PAGE_CACHE_SHIFT);
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+ unsigned long i)
+{
+ return eb->pages[i];
+}
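
The two inline helpers above replace the old first_page walk with a flat pages[] array plus simple page arithmetic. A quick standalone check of the num_extent_pages() formula, assuming the common 4 KiB page size (PAGE_CACHE_SIZE/SHIFT stand in as local constants):

#include <stdio.h>

#define PAGE_SHIFT_DEMO 12
#define PAGE_SIZE_DEMO  (1UL << PAGE_SHIFT_DEMO)

static unsigned long num_extent_pages(unsigned long long start, unsigned long long len)
{
        return ((start + len + PAGE_SIZE_DEMO - 1) >> PAGE_SHIFT_DEMO) -
               (start >> PAGE_SHIFT_DEMO);
}

int main(void)
{
        /* a 16 KiB node aligned to a page boundary spans exactly 4 pages... */
        printf("%lu\n", num_extent_pages(0x4000, 16384));      /* 4 */
        /* ...while the same length starting mid-page straddles 5 pages */
        printf("%lu\n", num_extent_pages(0x4800, 16384));      /* 5 */
        return 0;
}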
static inline void extent_buffer_get(struct extent_buffer *eb)
{
@@ -280,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
+ unsigned long start,
+ unsigned long len);
void write_extent_buffer(struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
@@ -291,41 +317,42 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
-int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int test_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state **cached_state);
-int extent_buffer_uptodate(struct extent_io_tree *tree,
- struct extent_buffer *eb,
- struct extent_state *cached_state);
-int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
- unsigned long min_len, char **token, char **map,
- unsigned long *map_start,
- unsigned long *map_len, int km);
+void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
- unsigned long min_len, char **token, char **map,
+ unsigned long min_len, char **map,
unsigned long *map_start,
- unsigned long *map_len, int km);
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
-int release_extent_buffer_tail_pages(struct extent_buffer *eb);
-int extent_range_uptodate(struct extent_io_tree *tree,
- u64 start, u64 end);
-int extent_clear_unlock_delalloc(struct inode *inode,
- struct extent_io_tree *tree,
- u64 start, u64 end, struct page *locked_page,
- unsigned long op);
+ unsigned long *map_len);
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+ struct page *locked_page,
+ unsigned long bits_to_clear,
+ unsigned long page_ops);
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags);
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
+
+struct btrfs_fs_info;
+
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
+ u64 length, u64 logical, struct page *page,
+ int mirror_num);
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+noinline u64 find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes);
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
+#endif
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2b6c12e983b..225302b39af 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,6 +1,5 @@
#include <linux/err.h>
#include <linux/slab.h>
-#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include "ctree.h"
@@ -11,7 +10,7 @@ static struct kmem_cache *extent_map_cache;
int __init extent_map_init(void)
{
- extent_map_cache = kmem_cache_create("extent_map",
+ extent_map_cache = kmem_cache_create("btrfs_extent_map",
sizeof(struct extent_map), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_map_cache)
@@ -28,35 +27,36 @@ void extent_map_exit(void)
/**
* extent_map_tree_init - initialize extent map tree
* @tree: tree to initialize
- * @mask: flags for memory allocations during tree operations
*
* Initialize the extent tree @tree. Should be called for each new inode
* or other user of the extent_map interface.
*/
-void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
+void extent_map_tree_init(struct extent_map_tree *tree)
{
tree->map = RB_ROOT;
+ INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
/**
* alloc_extent_map - allocate new extent map structure
- * @mask: memory allocation flags
*
* Allocate a new extent_map structure. The new structure is
* returned with a reference count of one and needs to be
* freed using free_extent_map()
*/
-struct extent_map *alloc_extent_map(gfp_t mask)
+struct extent_map *alloc_extent_map(void)
{
struct extent_map *em;
- em = kmem_cache_alloc(extent_map_cache, mask);
+ em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
- em->in_tree = 0;
+ RB_CLEAR_NODE(&em->rb_node);
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
+ em->generation = 0;
atomic_set(&em->refs, 1);
+ INIT_LIST_HEAD(&em->list);
return em;
}
@@ -73,37 +73,64 @@ void free_extent_map(struct extent_map *em)
return;
WARN_ON(atomic_read(&em->refs) == 0);
if (atomic_dec_and_test(&em->refs)) {
- WARN_ON(em->in_tree);
+ WARN_ON(extent_map_in_tree(em));
+ WARN_ON(!list_empty(&em->list));
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ kfree(em->bdev);
kmem_cache_free(extent_map_cache, em);
}
}
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
- struct rb_node *node)
+/* simple helper to do math around the end of an extent, handling wrap */
+static u64 range_end(u64 start, u64 len)
+{
+ if (start + len < start)
+ return (u64)-1;
+ return start + len;
+}
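
range_end(), hoisted to the top of the file above, guards the end-of-extent arithmetic against u64 wrap-around. A self-contained demonstration (u64 approximated with unsigned long long):

#include <stdio.h>

typedef unsigned long long u64;

static u64 range_end(u64 start, u64 len)
{
        if (start + len < start)        /* overflow: clamp to the maximum offset */
                return (u64)-1;
        return start + len;
}

int main(void)
{
        printf("%llu\n", range_end(100, 50));                   /* 150 */
        printf("%d\n", range_end((u64)-10, 50) == (u64)-1);     /* 1: clamped */
        return 0;
}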
+
+static int tree_insert(struct rb_root *root, struct extent_map *em)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
- struct extent_map *entry;
+ struct extent_map *entry = NULL;
+ struct rb_node *orig_parent = NULL;
+ u64 end = range_end(em->start, em->len);
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- WARN_ON(!entry->in_tree);
-
- if (offset < entry->start)
+ if (em->start < entry->start)
p = &(*p)->rb_left;
- else if (offset >= extent_map_end(entry))
+ else if (em->start >= extent_map_end(entry))
p = &(*p)->rb_right;
else
- return parent;
+ return -EEXIST;
}
- entry = rb_entry(node, struct extent_map, rb_node);
- entry->in_tree = 1;
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
+ orig_parent = parent;
+ while (parent && em->start >= extent_map_end(entry)) {
+ parent = rb_next(parent);
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ }
+ if (parent)
+ if (end > entry->start && em->start < extent_map_end(entry))
+ return -EEXIST;
+
+ parent = orig_parent;
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ while (parent && em->start < entry->start) {
+ parent = rb_prev(parent);
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ }
+ if (parent)
+ if (end > entry->start && em->start < extent_map_end(entry))
+ return -EEXIST;
+
+ rb_link_node(&em->rb_node, orig_parent, p);
+ rb_insert_color(&em->rb_node, root);
+ return 0;
}
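
tree_insert() now refuses to insert any mapping whose byte range intersects a neighbour, returning -EEXIST instead of handing back the colliding node. The half-open overlap test it applies to both neighbours, pulled out as a standalone predicate (illustrative wrapper, not the kernel function):

#include <stdbool.h>
#include <stdio.h>

static bool ranges_overlap(unsigned long long a_start, unsigned long long a_end,
                           unsigned long long b_start, unsigned long long b_end)
{
        /* [a_start, a_end) and [b_start, b_end) share at least one byte */
        return a_end > b_start && a_start < b_end;
}

int main(void)
{
        printf("%d\n", ranges_overlap(0, 4096, 4096, 8192));   /* 0: merely adjacent */
        printf("%d\n", ranges_overlap(0, 8192, 4096, 8192));   /* 1: overlapping */
        return 0;
}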
/*
@@ -125,8 +152,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
prev = n;
prev_entry = entry;
- WARN_ON(!entry->in_tree);
-
if (offset < entry->start)
n = n->rb_left;
else if (offset >= extent_map_end(entry))
@@ -169,6 +194,18 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
return 0;
+ if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+ test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+ return 0;
+
+ /*
+ * We don't want to merge stuff that hasn't been written to the log yet
+ * since it may not reflect exactly what is on disk, and that would be
+ * bad.
+ */
+ if (!list_empty(&prev->list) || !list_empty(&next->list))
+ return 0;
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -185,22 +222,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
return 0;
}
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
{
- int ret = 0;
struct extent_map *merge = NULL;
struct rb_node *rb;
- struct extent_map *em;
-
- write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, start, len);
-
- WARN_ON(!em || em->start != start);
-
- if (!em)
- goto out;
-
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
@@ -208,11 +233,16 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(merge, em)) {
em->start = merge->start;
+ em->orig_start = merge->orig_start;
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
- merge->in_tree = 0;
+ em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+ em->mod_start = merge->mod_start;
+ em->generation = max(em->generation, merge->generation);
+
rb_erase(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
}
@@ -222,11 +252,59 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(em, merge)) {
em->len += merge->len;
- em->block_len += merge->len;
+ em->block_len += merge->block_len;
rb_erase(&merge->rb_node, &tree->map);
- merge->in_tree = 0;
+ RB_CLEAR_NODE(&merge->rb_node);
+ em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+ em->generation = max(em->generation, merge->generation);
free_extent_map(merge);
}
+}
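
try_merge_map() collapses an extent with a logically and physically contiguous neighbour and drops the neighbour's tree node. A toy version of the backward merge on plain start/len pairs, illustrative only (the real code also checks flags, block addresses and the logging list):

#include <stdio.h>

struct em_demo {
        unsigned long long start, len;
};

static int try_merge(struct em_demo *prev, struct em_demo *cur)
{
        if (prev->start + prev->len != cur->start)
                return 0;               /* not contiguous: nothing to do */
        cur->len += prev->len;          /* grow the survivor... */
        cur->start = prev->start;       /* ...and move its start back */
        return 1;
}

int main(void)
{
        struct em_demo prev = { 0, 4096 }, cur = { 4096, 4096 };

        if (try_merge(&prev, &cur))
                printf("merged: start=%llu len=%llu\n", cur.start, cur.len);
        return 0;
}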
+
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree: tree to unpin the extent in
+ * @start: logical offset in the file
+ * @len: length of the extent
+ * @gen: generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly. Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+ u64 gen)
+{
+ int ret = 0;
+ struct extent_map *em;
+ bool prealloc = false;
+
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, start, len);
+
+ WARN_ON(!em || em->start != start);
+
+ if (!em)
+ goto out;
+
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_move(&em->list, &tree->modified_extents);
+ em->generation = gen;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+ prealloc = true;
+ clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+ }
+
+ try_merge_map(tree, em);
+
+ if (prealloc) {
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+ }
free_extent_map(em);
out:
@@ -235,6 +313,27 @@ out:
}
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ if (extent_map_in_tree(em))
+ try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em,
+ int modified)
+{
+ atomic_inc(&em->refs);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (modified)
+ list_move(&em->list, &tree->modified_extents);
+ else
+ try_merge_map(tree, em);
+}
+
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
@@ -243,62 +342,49 @@ out:
* Insert @em into @tree or perform a simple forward/backward merge with
* existing mappings. The extent_map struct passed in will be inserted
* into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was successfull.
+ * reference dropped if the merge attempt was successful.
*/
int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em)
+ struct extent_map *em, int modified)
{
int ret = 0;
- struct extent_map *merge = NULL;
- struct rb_node *rb;
- struct extent_map *exist;
- exist = lookup_extent_mapping(tree, em->start, em->len);
- if (exist) {
- free_extent_map(exist);
- ret = -EEXIST;
- goto out;
- }
- rb = tree_insert(&tree->map, em->start, &em->rb_node);
- if (rb) {
- ret = -EEXIST;
+ ret = tree_insert(&tree->map, em);
+ if (ret)
goto out;
- }
- atomic_inc(&em->refs);
- if (em->start != 0) {
- rb = rb_prev(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(merge, em)) {
- em->start = merge->start;
- em->len += merge->len;
- em->block_len += merge->block_len;
- em->block_start = merge->block_start;
- merge->in_tree = 0;
- rb_erase(&merge->rb_node, &tree->map);
- free_extent_map(merge);
- }
- }
- rb = rb_next(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(em, merge)) {
- em->len += merge->len;
- em->block_len += merge->len;
- rb_erase(&merge->rb_node, &tree->map);
- merge->in_tree = 0;
- free_extent_map(merge);
- }
+
+ setup_extent_mapping(tree, em, modified);
out:
return ret;
}
-/* simple helper to do math around the end of an extent, handling wrap */
-static u64 range_end(u64 start, u64 len)
+static struct extent_map *
+__lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len, int strict)
{
- if (start + len < start)
- return (u64)-1;
- return start + len;
+ struct extent_map *em;
+ struct rb_node *rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *next = NULL;
+ u64 end = range_end(start, len);
+
+ rb_node = __tree_search(&tree->map, start, &prev, &next);
+ if (!rb_node) {
+ if (prev)
+ rb_node = prev;
+ else if (next)
+ rb_node = next;
+ else
+ return NULL;
+ }
+
+ em = rb_entry(rb_node, struct extent_map, rb_node);
+
+ if (strict && !(end > em->start && start < extent_map_end(em)))
+ return NULL;
+
+ atomic_inc(&em->refs);
+ return em;
}
/**
@@ -315,42 +401,7 @@ static u64 range_end(u64 start, u64 len)
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
- struct extent_map *em;
- struct rb_node *rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *next = NULL;
- u64 end = range_end(start, len);
-
- rb_node = __tree_search(&tree->map, start, &prev, &next);
- if (!rb_node && prev) {
- em = rb_entry(prev, struct extent_map, rb_node);
- if (end > em->start && start < extent_map_end(em))
- goto found;
- }
- if (!rb_node && next) {
- em = rb_entry(next, struct extent_map, rb_node);
- if (end > em->start && start < extent_map_end(em))
- goto found;
- }
- if (!rb_node) {
- em = NULL;
- goto out;
- }
- if (IS_ERR(rb_node)) {
- em = ERR_CAST(rb_node);
- goto out;
- }
- em = rb_entry(rb_node, struct extent_map, rb_node);
- if (end > em->start && start < extent_map_end(em))
- goto found;
-
- em = NULL;
- goto out;
-
-found:
- atomic_inc(&em->refs);
-out:
- return em;
+ return __lookup_extent_mapping(tree, start, len, 1);
}
/**
@@ -367,38 +418,7 @@ out:
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
- struct extent_map *em;
- struct rb_node *rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *next = NULL;
-
- rb_node = __tree_search(&tree->map, start, &prev, &next);
- if (!rb_node && prev) {
- em = rb_entry(prev, struct extent_map, rb_node);
- goto found;
- }
- if (!rb_node && next) {
- em = rb_entry(next, struct extent_map, rb_node);
- goto found;
- }
- if (!rb_node) {
- em = NULL;
- goto out;
- }
- if (IS_ERR(rb_node)) {
- em = ERR_CAST(rb_node);
- goto out;
- }
- em = rb_entry(rb_node, struct extent_map, rb_node);
- goto found;
-
- em = NULL;
- goto out;
-
-found:
- atomic_inc(&em->refs);
-out:
- return em;
+ return __lookup_extent_mapping(tree, start, len, 0);
}
/**
@@ -415,6 +435,23 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
rb_erase(&em->rb_node, &tree->map);
- em->in_tree = 0;
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_del_init(&em->list);
+ RB_CLEAR_NODE(&em->rb_node);
return ret;
}
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified)
+{
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ ASSERT(extent_map_in_tree(cur));
+ if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ list_del_init(&cur->list);
+ rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ RB_CLEAR_NODE(&cur->rb_node);
+
+ setup_extent_mapping(tree, new, modified);
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 28b44dbd1e3..b2991fd8583 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,16 +3,19 @@
#include <linux/rbtree.h>
-#define EXTENT_MAP_LAST_BYTE (u64)-4
-#define EXTENT_MAP_HOLE (u64)-3
-#define EXTENT_MAP_INLINE (u64)-2
-#define EXTENT_MAP_DELALLOC (u64)-1
+#define EXTENT_MAP_LAST_BYTE ((u64)-4)
+#define EXTENT_MAP_HOLE ((u64)-3)
+#define EXTENT_MAP_INLINE ((u64)-2)
+#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the flags field */
#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
#define EXTENT_FLAG_COMPRESSED 1
#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
+#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
struct extent_map {
struct rb_node rb_node;
@@ -20,21 +23,32 @@ struct extent_map {
/* all of these are in bytes */
u64 start;
u64 len;
+ u64 mod_start;
+ u64 mod_len;
u64 orig_start;
+ u64 orig_block_len;
+ u64 ram_bytes;
u64 block_start;
u64 block_len;
+ u64 generation;
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
- unsigned int in_tree:1;
- unsigned int compress_type:4;
+ unsigned int compress_type;
+ struct list_head list;
};
struct extent_map_tree {
struct rb_root map;
+ struct list_head modified_extents;
rwlock_t lock;
};
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+ return !RB_EMPTY_NODE(&em->rb_node);
+}
+
static inline u64 extent_map_end(struct extent_map *em)
{
if (em->start + em->len < em->start)
@@ -49,18 +63,23 @@ static inline u64 extent_map_block_end(struct extent_map *em)
return em->block_start + em->block_len;
}
-void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
+void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em);
+ struct extent_map *em, int modified);
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified);
-struct extent_map *alloc_extent_map(gfp_t mask);
+struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 4f19a3e1bf3..f46cfe45d68 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -23,16 +23,19 @@
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
+#include "volumes.h"
#include "print-tree.h"
-#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
size) - 1))
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+ PAGE_CACHE_SIZE))
+
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
- sizeof(struct btrfs_sector_sum) * \
- (r)->sectorsize - (r)->sectorsize)
+ sizeof(u32) * (r)->sectorsize)
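
The reworked MAX_CSUM_ITEMS() additionally clamps the per-item checksum count to PAGE_CACHE_SIZE. A back-of-the-envelope recomputation with assumed constants (16 KiB leaves, a 101-byte header, 25-byte item headers, 4-byte crc32c checksums), just to show the order of magnitude one csum item can cover:

#include <stdio.h>

int main(void)
{
        unsigned long leaf_data = 16384 - 101;  /* assumed BTRFS_LEAF_DATA_SIZE */
        unsigned long item_size = 25;           /* assumed sizeof(struct btrfs_item) */
        unsigned long csum_size = 4;            /* crc32c */
        unsigned long page_size = 4096;

        unsigned long max_csums = (leaf_data - item_size * 2) / csum_size - 1;
        if (max_csums > page_size)              /* the min_t() clamp added above */
                max_csums = page_size;

        /* each checksum covers one 4 KiB sector, so one item spans many MiB */
        printf("csums per item: %lu (~%lu MiB of data)\n",
               max_csums, max_csums * 4096 / (1024 * 1024));
        return 0;
}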
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -48,7 +51,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
file_key.objectid = objectid;
file_key.offset = pos;
btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
@@ -58,7 +62,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
sizeof(*item));
if (ret < 0)
goto out;
- BUG_ON(ret);
+ BUG_ON(ret); /* Can't happen */
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -79,10 +83,11 @@ out:
return ret;
}
-struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, int cow)
+static struct btrfs_csum_item *
+btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, int cow)
{
int ret;
struct btrfs_key file_key;
@@ -90,8 +95,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_csum_item *item;
struct extent_buffer *leaf;
u64 csum_offset = 0;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -115,9 +119,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
csums_in_item /= csum_size;
- if (csum_offset >= csums_in_item) {
+ if (csum_offset == csums_in_item) {
ret = -EFBIG;
goto fail;
+ } else if (csum_offset > csums_in_item) {
+ goto fail;
}
}
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
@@ -130,7 +136,6 @@ fail:
return ERR_PTR(ret);
}
-
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -148,40 +153,79 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
+static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
+{
+ kfree(bio->csum_allocated);
+}
static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
struct inode *inode, struct bio *bio,
u64 logical_offset, u32 *dst, int dio)
{
- u32 sum;
struct bio_vec *bvec = bio->bi_io_vec;
- int bio_index = 0;
+ struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+ struct btrfs_csum_item *item = NULL;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_path *path;
+ u8 *csum;
u64 offset = 0;
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
u32 diff;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
- int ret;
- struct btrfs_path *path;
- struct btrfs_csum_item *item = NULL;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int nblocks;
+ int bio_index = 0;
+ int count;
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
- if (bio->bi_size > PAGE_CACHE_SIZE * 8)
+ if (!path)
+ return -ENOMEM;
+
+ nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
+ if (!dst) {
+ if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
+ btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size,
+ GFP_NOFS);
+ if (!btrfs_bio->csum_allocated) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+ btrfs_bio->csum = btrfs_bio->csum_allocated;
+ btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
+ } else {
+ btrfs_bio->csum = btrfs_bio->csum_inline;
+ }
+ csum = btrfs_bio->csum;
+ } else {
+ csum = (u8 *)dst;
+ }
+
+ if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
path->reada = 2;
WARN_ON(bio->bi_vcnt <= 0);
- disk_bytenr = (u64)bio->bi_sector << 9;
+ /*
+ * the free space stuff is only read when it hasn't been
+ * updated in the current transaction. So, we can safely
+ * read from the commit root and sidestep a nasty deadlock
+ * between reading the free space cache and updating the csum tree.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
+
+ disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
if (dio)
offset = logical_offset;
while (bio_index < bio->bi_vcnt) {
if (!dio)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
- ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
- if (ret == 0)
+ count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
+ (u32 *)csum, nblocks);
+ if (count)
goto found;
if (!item || disk_bytenr < item_start_offset ||
@@ -190,27 +234,24 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
u32 item_size;
if (item)
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
path, disk_bytenr, 0);
if (IS_ERR(item)) {
- ret = PTR_ERR(item);
- if (ret == -ENOENT || ret == -EFBIG)
- ret = 0;
- sum = 0;
+ count = 1;
+ memset(csum, 0, csum_size);
if (BTRFS_I(inode)->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
set_extent_bits(io_tree, offset,
offset + bvec->bv_len - 1,
EXTENT_NODATASUM, GFP_NOFS);
} else {
- printk(KERN_INFO "btrfs no csum found "
- "for inode %lu start %llu\n",
- inode->i_ino,
- (unsigned long long)offset);
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "no csum found for inode %llu start %llu",
+ btrfs_ino(inode), offset);
}
item = NULL;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto found;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
@@ -232,19 +273,20 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
diff = disk_bytenr - item_start_offset;
diff = diff / root->sectorsize;
diff = diff * csum_size;
-
- read_extent_buffer(path->nodes[0], &sum,
+ count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >>
+ inode->i_sb->s_blocksize_bits);
+ read_extent_buffer(path->nodes[0], csum,
((unsigned long)item) + diff,
- csum_size);
+ csum_size * count);
found:
- if (dst)
- *dst++ = sum;
- else
- set_state_private(io_tree, offset, sum);
- disk_bytenr += bvec->bv_len;
- offset += bvec->bv_len;
- bio_index++;
- bvec++;
+ csum += count * csum_size;
+ nblocks -= count;
+ bio_index += count;
+ while (count--) {
+ disk_bytenr += bvec->bv_len;
+ offset += bvec->bv_len;
+ bvec++;
+ }
}
btrfs_free_path(path);
return 0;
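
__btrfs_lookup_bio_sums() now stores the looked-up checksums either in the bio's small inline array or, for large bios, in a kmalloc'd buffer released by the end_io callback. The same small-inline/large-heap pattern in a self-contained sketch (the struct layout and the 64-byte threshold are stand-ins for the kernel definitions):

#include <stdio.h>
#include <stdlib.h>

#define INLINE_CSUM_SIZE 64

struct io_demo {
        unsigned char inline_csum[INLINE_CSUM_SIZE];
        unsigned char *csum;            /* points at inline_csum or a heap buffer */
        unsigned char *csum_allocated;  /* non-NULL only when heap-backed */
};

static int setup_csum(struct io_demo *d, size_t nblocks, size_t csum_size)
{
        if (nblocks * csum_size > INLINE_CSUM_SIZE) {
                d->csum_allocated = malloc(nblocks * csum_size);
                if (!d->csum_allocated)
                        return -1;
                d->csum = d->csum_allocated;
        } else {
                d->csum_allocated = NULL;
                d->csum = d->inline_csum;
        }
        return 0;
}

int main(void)
{
        struct io_demo d;

        setup_csum(&d, 8, 4);                   /* 32 bytes: fits inline */
        printf("inline: %d\n", d.csum == d.inline_csum);
        free(d.csum_allocated);                 /* no-op for the inline case */
        return 0;
}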
@@ -257,28 +299,48 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
}
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u64 offset, u32 *dst)
+ struct btrfs_dio_private *dip, struct bio *bio,
+ u64 offset)
{
- return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+ int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr;
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ int ret;
+
+ len >>= inode->i_sb->s_blocksize_bits;
+ len *= csum_size;
+
+ ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
+ (u32 *)(dip->csum + len), 1);
+ return ret;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list)
+ struct list_head *list, int search_commit)
{
struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_csum_item *item;
+ LIST_HEAD(tmplist);
unsigned long offset;
int ret;
size_t size;
u64 csum_end;
- u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+ ASSERT(start == ALIGN(start, root->sectorsize) &&
+ (end + 1) == ALIGN(end + 1, root->sectorsize));
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
+ if (search_commit) {
+ path->skip_locking = 1;
+ path->reada = 2;
+ path->search_commit_root = 1;
+ }
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = start;
@@ -313,11 +375,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
- key.type != BTRFS_EXTENT_CSUM_KEY)
- break;
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.offset > end)
+ key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset > end)
break;
if (key.offset > start)
@@ -335,37 +394,41 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_csum_item);
while (start < csum_end) {
size = min_t(size_t, csum_end - start,
- MAX_ORDERED_SUM_BYTES(root));
+ MAX_ORDERED_SUM_BYTES(root));
sums = kzalloc(btrfs_ordered_sum_size(root, size),
- GFP_NOFS);
- BUG_ON(!sums);
+ GFP_NOFS);
+ if (!sums) {
+ ret = -ENOMEM;
+ goto fail;
+ }
- sector_sum = sums->sums;
sums->bytenr = start;
- sums->len = size;
+ sums->len = (int)size;
offset = (start - key.offset) >>
root->fs_info->sb->s_blocksize_bits;
offset *= csum_size;
+ size >>= root->fs_info->sb->s_blocksize_bits;
- while (size > 0) {
- read_extent_buffer(path->nodes[0],
- &sector_sum->sum,
- ((unsigned long)item) +
- offset, csum_size);
- sector_sum->bytenr = start;
-
- size -= root->sectorsize;
- start += root->sectorsize;
- offset += csum_size;
- sector_sum++;
- }
- list_add_tail(&sums->list, list);
+ read_extent_buffer(path->nodes[0],
+ sums->sums,
+ ((unsigned long)item) + offset,
+ csum_size * size);
+
+ start += root->sectorsize * size;
+ list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
}
ret = 0;
fail:
+ while (ret < 0 && !list_empty(&tmplist)) {
+ sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ list_splice_tail(&tmplist, list);
+
btrfs_free_path(path);
return ret;
}
@@ -374,24 +437,22 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig)
{
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
char *data;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
+ int index;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
u64 offset;
- u64 disk_bytenr;
WARN_ON(bio->bi_vcnt <= 0);
- sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
+ sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size),
+ GFP_NOFS);
if (!sums)
return -ENOMEM;
- sector_sum = sums->sums;
- disk_bytenr = (u64)bio->bi_sector << 9;
- sums->len = bio->bi_size;
+ sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
if (contig)
@@ -400,49 +461,48 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered);
- sums->bytenr = ordered->start;
+ BUG_ON(!ordered); /* Logic error */
+ sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
+ index = 0;
while (bio_index < bio->bi_vcnt) {
if (!contig)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
- if (!contig && (offset >= ordered->file_offset + ordered->len ||
- offset < ordered->file_offset)) {
+ if (offset >= ordered->file_offset + ordered->len ||
+ offset < ordered->file_offset) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
this_sum_bytes = 0;
btrfs_add_ordered_sum(inode, ordered, sums);
btrfs_put_ordered_extent(ordered);
- bytes_left = bio->bi_size - total_bytes;
+ bytes_left = bio->bi_iter.bi_size - total_bytes;
sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
GFP_NOFS);
- BUG_ON(!sums);
- sector_sum = sums->sums;
+ BUG_ON(!sums); /* -ENOMEM */
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered);
- sums->bytenr = ordered->start;
+ BUG_ON(!ordered); /* Logic error */
+ sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) +
+ total_bytes;
+ index = 0;
}
- data = kmap_atomic(bvec->bv_page, KM_USER0);
- sector_sum->sum = ~(u32)0;
- sector_sum->sum = btrfs_csum_data(root,
- data + bvec->bv_offset,
- sector_sum->sum,
- bvec->bv_len);
- kunmap_atomic(data, KM_USER0);
- btrfs_csum_final(sector_sum->sum,
- (char *)&sector_sum->sum);
- sector_sum->bytenr = disk_bytenr;
-
- sector_sum++;
+ data = kmap_atomic(bvec->bv_page);
+ sums->sums[index] = ~(u32)0;
+ sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
+ sums->sums[index],
+ bvec->bv_len);
+ kunmap_atomic(data);
+ btrfs_csum_final(sums->sums[index],
+ (char *)(sums->sums + index));
+
bio_index++;
+ index++;
total_bytes += bvec->bv_len;
this_sum_bytes += bvec->bv_len;
- disk_bytenr += bvec->bv_len;
offset += bvec->bv_len;
bvec++;
}
@@ -463,19 +523,16 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
* This calls btrfs_truncate_item with the correct args based on the
* overlap, and fixes up the key as required.
*/
-static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *key,
- u64 bytenr, u64 len)
+static noinline void truncate_one_csum(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
u64 csum_end;
u64 end_byte = bytenr + len;
u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
- int ret;
leaf = path->nodes[0];
csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
@@ -491,8 +548,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- ret = btrfs_truncate_item(trans, root, path, new_size, 1);
- BUG_ON(ret);
+ btrfs_truncate_item(root, path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -504,16 +560,13 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- ret = btrfs_truncate_item(trans, root, path, new_size, 0);
- BUG_ON(ret);
+ btrfs_truncate_item(root, path, new_size, 0);
key->offset = end_byte;
- ret = btrfs_set_item_key_safe(trans, root, path, key);
- BUG_ON(ret);
+ btrfs_set_item_key_safe(root, path, key);
} else {
BUG();
}
- return 0;
}
/*
@@ -529,8 +582,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 csum_end;
struct extent_buffer *leaf;
int ret;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
root = root->fs_info->csum_root;
@@ -548,10 +600,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
if (path->slots[0] == 0)
- goto out;
+ break;
path->slots[0]--;
} else if (ret < 0) {
- goto out;
+ break;
}
leaf = path->nodes[0];
@@ -576,7 +628,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
/* delete the entire item, it is inside our range */
if (key.offset >= bytenr && csum_end <= end_byte) {
ret = btrfs_del_item(trans, root, path);
- BUG_ON(ret);
+ if (ret)
+ goto out;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
@@ -618,64 +671,65 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
* item changed size or key
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
- BUG_ON(ret && ret != -EAGAIN);
+ if (ret && ret != -EAGAIN) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
key.offset = end_byte - 1;
} else {
- ret = truncate_one_csum(trans, root, path,
- &key, bytenr, len);
- BUG_ON(ret);
+ truncate_one_csum(root, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
}
+ ret = 0;
out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
{
- u64 bytenr;
- int ret;
struct btrfs_key file_key;
struct btrfs_key found_key;
- u64 next_offset;
- u64 total_bytes = 0;
- int found_next;
struct btrfs_path *path;
struct btrfs_csum_item *item;
struct btrfs_csum_item *item_end;
struct extent_buffer *leaf = NULL;
+ u64 next_offset;
+ u64 total_bytes = 0;
u64 csum_offset;
- struct btrfs_sector_sum *sector_sum;
+ u64 bytenr;
u32 nritems;
u32 ins_size;
- char *eb_map;
- char *eb_token;
- unsigned long map_len;
- unsigned long map_start;
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ int index = 0;
+ int found_next;
+ int ret;
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
- BUG_ON(!path);
- sector_sum = sums->sums;
+ if (!path)
+ return -ENOMEM;
again:
next_offset = (u64)-1;
found_next = 0;
+ bytenr = sums->bytenr + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = sector_sum->bytenr;
- bytenr = sector_sum->bytenr;
+ file_key.offset = bytenr;
btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
- item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+ item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
- leaf = path->nodes[0];
ret = 0;
+ leaf = path->nodes[0];
+ item_end = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((char *)item_end +
+ btrfs_item_size_nr(leaf, path->slots[0]));
goto found;
}
ret = PTR_ERR(item);
@@ -696,7 +750,7 @@ again:
int slot = path->slots[0] + 1;
/* we didn't find a csum item, insert one */
nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems - 1) {
+ if (!nritems || (path->slots[0] >= nritems - 1)) {
ret = btrfs_next_leaf(root, path);
if (ret == 1)
found_next = 1;
@@ -719,7 +773,7 @@ again:
* at this point, we know the tree has an item, but it isn't big
* enough yet to put our csum in. Grow it
*/
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &file_key, path,
csum_size, 1);
if (ret < 0)
@@ -742,43 +796,48 @@ again:
goto insert;
}
- if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
+ if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
csum_size) {
- u32 diff = (csum_offset + 1) * csum_size;
+ int extend_nr;
+ u64 tmp;
+ u32 diff;
+ u32 free_space;
- /*
- * is the item big enough already? we dropped our lock
- * before and need to recheck
- */
- if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
- goto csum;
+ if (btrfs_leaf_free_space(root, leaf) <
+ sizeof(struct btrfs_item) + csum_size * 2)
+ goto insert;
+
+ free_space = btrfs_leaf_free_space(root, leaf) -
+ sizeof(struct btrfs_item) - csum_size;
+ tmp = sums->len - total_bytes;
+ tmp >>= root->fs_info->sb->s_blocksize_bits;
+ WARN_ON(tmp < 1);
+
+ extend_nr = max_t(int, 1, (int)tmp);
+ diff = (csum_offset + extend_nr) * csum_size;
+ diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
- if (diff != csum_size)
- goto insert;
+ diff = min(free_space, diff);
+ diff /= csum_size;
+ diff *= csum_size;
- ret = btrfs_extend_item(trans, root, path, diff);
- BUG_ON(ret);
+ btrfs_extend_item(root, path, diff);
+ ret = 0;
goto csum;
}
insert:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
csum_offset = 0;
if (found_next) {
- u64 tmp = total_bytes + root->sectorsize;
- u64 next_sector = sector_sum->bytenr;
- struct btrfs_sector_sum *next = sector_sum + 1;
+ u64 tmp;
- while (tmp < sums->len) {
- if (next_sector + root->sectorsize != next->bytenr)
- break;
- tmp += root->sectorsize;
- next_sector = next->bytenr;
- next++;
- }
- tmp = min(tmp, next_offset - file_key.offset);
+ tmp = sums->len - total_bytes;
tmp >>= root->fs_info->sb->s_blocksize_bits;
+ tmp = min(tmp, (next_offset - file_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits);
+
tmp = max((u64)1, tmp);
tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
ins_size = csum_size * tmp;
@@ -791,63 +850,31 @@ insert:
path->leave_spinning = 0;
if (ret < 0)
goto fail_unlock;
- if (ret != 0) {
- WARN_ON(1);
+ if (WARN_ON(ret != 0))
goto fail_unlock;
- }
-csum:
leaf = path->nodes[0];
+csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
- ret = 0;
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+ btrfs_item_size_nr(leaf, path->slots[0]));
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
- item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
- item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
- btrfs_item_size_nr(leaf, path->slots[0]));
- eb_token = NULL;
-next_sector:
-
- if (!eb_token ||
- (unsigned long)item + csum_size >= map_start + map_len) {
- int err;
-
- if (eb_token)
- unmap_extent_buffer(leaf, eb_token, KM_USER1);
- eb_token = NULL;
- err = map_private_extent_buffer(leaf, (unsigned long)item,
- csum_size,
- &eb_token, &eb_map,
- &map_start, &map_len, KM_USER1);
- if (err)
- eb_token = NULL;
- }
- if (eb_token) {
- memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
- &sector_sum->sum, csum_size);
- } else {
- write_extent_buffer(leaf, &sector_sum->sum,
- (unsigned long)item, csum_size);
- }
+ ins_size = (u32)(sums->len - total_bytes) >>
+ root->fs_info->sb->s_blocksize_bits;
+ ins_size *= csum_size;
+ ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+ ins_size);
+ write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+ ins_size);
+
+ ins_size /= csum_size;
+ total_bytes += ins_size * root->sectorsize;
+ index += ins_size;
- total_bytes += root->sectorsize;
- sector_sum++;
- if (total_bytes < sums->len) {
- item = (struct btrfs_csum_item *)((char *)item +
- csum_size);
- if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
- sector_sum->bytenr) {
- bytenr = sector_sum->bytenr;
- goto next_sector;
- }
- }
- if (eb_token) {
- unmap_extent_buffer(leaf, eb_token, KM_USER1);
- eb_token = NULL;
- }
btrfs_mark_buffer_dirty(path->nodes[0]);
if (total_bytes < sums->len) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
cond_resched();
goto again;
}
@@ -858,3 +885,79 @@ out:
fail_unlock:
goto out;
}
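
The "grow the existing checksum item" branch above works out how many extra checksum slots can be squeezed into the current leaf: the number of blocks still to checksum, capped by the MAX_CSUM_ITEMS limit and by the leaf's free space, then rounded down to whole slots. A minimal user-space sketch of that arithmetic, assuming (as the kernel code does) that csum_offset * csum_size equals the current item size; the function name and the leaf_free and max_item_bytes parameters are illustrative stand-ins, not kernel API:

#include <stdint.h>

static uint32_t csum_item_growth(uint32_t item_size,      /* current item size, bytes */
                                 uint32_t csum_offset,    /* next free checksum slot */
                                 uint64_t remaining,      /* data bytes still to checksum */
                                 uint32_t blocksize,      /* e.g. 4096 */
                                 uint32_t csum_size,      /* e.g. 4 for crc32c */
                                 uint32_t leaf_free,      /* usable free bytes in the leaf */
                                 uint32_t max_item_bytes) /* MAX_CSUM_ITEMS * csum_size */
{
        uint64_t blocks = remaining / blocksize;
        uint32_t extend_nr = blocks ? (uint32_t)blocks : 1;   /* grow by at least one slot */
        uint32_t diff = (csum_offset + extend_nr) * csum_size;

        if (diff > max_item_bytes)
                diff = max_item_bytes;
        if (diff <= item_size)                 /* already at the cap */
                return 0;
        diff -= item_size;                     /* only the growth, not the whole item */
        if (diff > leaf_free)
                diff = leaf_free;
        return (diff / csum_size) * csum_size; /* whole checksum slots only */
}
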
+
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_key key;
+ u64 extent_start, extent_end;
+ u64 bytenr;
+ u8 type = btrfs_file_extent_type(leaf, fi);
+ int compress_type = btrfs_file_extent_compression(leaf, fi);
+
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_start = key.offset;
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_end = extent_start +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ size_t size;
+ size = btrfs_file_extent_inline_len(leaf, slot, fi);
+ extent_end = ALIGN(extent_start + size, root->sectorsize);
+ }
+
+ em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ em->orig_start = extent_start -
+ btrfs_file_extent_offset(leaf, fi);
+ em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (bytenr == 0) {
+ em->block_start = EXTENT_MAP_HOLE;
+ return;
+ }
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ em->block_start = bytenr;
+ em->block_len = em->orig_block_len;
+ } else {
+ bytenr += btrfs_file_extent_offset(leaf, fi);
+ em->block_start = bytenr;
+ em->block_len = em->len;
+ if (type == BTRFS_FILE_EXTENT_PREALLOC)
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ }
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ em->block_start = EXTENT_MAP_INLINE;
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ /*
+ * Initialize orig_start and block_len with the same values
+ * as in inode.c:btrfs_get_extent().
+ */
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->block_len = (u64)-1;
+ if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
+ } else {
+ btrfs_err(root->fs_info,
+ "unknown file extent item type %d, inode %llu, offset %llu, root %llu",
+ type, btrfs_ino(inode), extent_start,
+ root->root_key.objectid);
+ }
+}
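
The new btrfs_extent_item_to_extent_map() above distinguishes three cases when filling in block_start: inline extents, holes (a regular extent whose disk bytenr is zero), and real extents, where compressed extents keep the start of the whole on-disk extent while uncompressed ones add the extent offset. A self-contained sketch of just that decision; the SKETCH_ constants and names stand in for EXTENT_MAP_HOLE / EXTENT_MAP_INLINE and are illustrative only:

#include <stdint.h>

#define SKETCH_MAP_HOLE   ((uint64_t)-3)
#define SKETCH_MAP_INLINE ((uint64_t)-2)

enum sketch_extent_type { SKETCH_REG, SKETCH_PREALLOC, SKETCH_INLINE };

static uint64_t sketch_block_start(enum sketch_extent_type type,
                                   uint64_t disk_bytenr,   /* 0 means a hole */
                                   uint64_t extent_offset, /* file's offset into the extent */
                                   int compressed)
{
        if (type == SKETCH_INLINE)
                return SKETCH_MAP_INLINE;
        if (disk_bytenr == 0)
                return SKETCH_MAP_HOLE;
        /* compressed extents map the whole on-disk extent, so no offset */
        return compressed ? disk_bytenr : disk_bytenr + extent_offset;
}
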
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f447b783bb8..1f2b99cb55e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,35 +24,401 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
+#include <linux/btrfs.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
-#include "compat.h"
+#include "volumes.h"
+#include "qgroup.h"
+static struct kmem_cache *btrfs_inode_defrag_cachep;
+/*
+ * when auto defrag is enabled we
+ * queue up these defrag structs to remember which
+ * inodes need defragging passes
+ */
+struct inode_defrag {
+ struct rb_node rb_node;
+ /* objectid */
+ u64 ino;
+ /*
+ * transid where the defrag was added, we search for
+ * extents newer than this
+ */
+ u64 transid;
+
+ /* root objectid */
+ u64 root;
+
+ /* last offset we were able to defrag */
+ u64 last_offset;
+
+ /* if we've wrapped around back to zero once already */
+ int cycled;
+};
+
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+ struct inode_defrag *defrag2)
+{
+ if (defrag1->root > defrag2->root)
+ return 1;
+ else if (defrag1->root < defrag2->root)
+ return -1;
+ else if (defrag1->ino > defrag2->ino)
+ return 1;
+ else if (defrag1->ino < defrag2->ino)
+ return -1;
+ else
+ return 0;
+}
+
+/* insert a record for an inode into the defrag tree. The lock
+ * must be held already
+ *
+ * If you're inserting a record for an older transid than an
+ * existing record, the transid already in the tree is lowered
+ *
+ * If an existing record is found the defrag item you
+ * pass in is freed
+ */
+static int __btrfs_add_inode_defrag(struct inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct inode_defrag *entry;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ p = &root->fs_info->defrag_inodes.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(defrag, entry);
+ if (ret < 0)
+ p = &parent->rb_left;
+ else if (ret > 0)
+ p = &parent->rb_right;
+ else {
+ /* if we're reinserting an entry for
+ * an old defrag run, make sure to
+ * lower the transid of our existing record
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ if (defrag->last_offset > entry->last_offset)
+ entry->last_offset = defrag->last_offset;
+ return -EEXIST;
+ }
+ }
+ set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ rb_link_node(&defrag->rb_node, parent, p);
+ rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
+ return 0;
+}
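
The duplicate-key branch above implements the merge policy described in the comment: keep the oldest transid so older extents are still searched, and keep the furthest last_offset so progress is not lost. The same policy as a tiny standalone helper, using a simplified record type purely for illustration:

struct defrag_sketch {
        unsigned long long transid;     /* search extents newer than this */
        unsigned long long last_offset; /* resume defrag from here */
};

static void merge_defrag_records(struct defrag_sketch *existing,
                                 const struct defrag_sketch *incoming)
{
        if (incoming->transid < existing->transid)
                existing->transid = incoming->transid;
        if (incoming->last_offset > existing->last_offset)
                existing->last_offset = incoming->last_offset;
}
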
+
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+ if (!btrfs_test_opt(root, AUTO_DEFRAG))
+ return 0;
+
+ if (btrfs_fs_closing(root->fs_info))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * insert a defrag record for this inode if auto defrag is
+ * enabled
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct inode_defrag *defrag;
+ u64 transid;
+ int ret;
+
+ if (!__need_auto_defrag(root))
+ return 0;
+
+ if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
+ return 0;
+
+ if (trans)
+ transid = trans->transid;
+ else
+ transid = BTRFS_I(inode)->root->last_trans;
+
+ defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
+ if (!defrag)
+ return -ENOMEM;
+
+ defrag->ino = btrfs_ino(inode);
+ defrag->transid = transid;
+ defrag->root = root->root_key.objectid;
+
+ spin_lock(&root->fs_info->defrag_inodes_lock);
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+ /*
+ * If we set the IN_DEFRAG flag and then evict the inode from memory,
+ * the re-read inode will not have the IN_DEFRAG flag set. In that
+ * case, we may still find an existing defrag record in the tree.
+ */
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ if (ret)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+ spin_unlock(&root->fs_info->defrag_inodes_lock);
+ return 0;
+}
+
+/*
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
+ */
+static void btrfs_requeue_inode_defrag(struct inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (!__need_auto_defrag(root))
+ goto out;
+
+ /*
+ * Here we don't check the IN_DEFRAG flag, because we need to merge
+ * the records together.
+ */
+ spin_lock(&root->fs_info->defrag_inodes_lock);
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ spin_unlock(&root->fs_info->defrag_inodes_lock);
+ if (ret)
+ goto out;
+ return;
+out:
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defraggable inode that we want; if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
+{
+ struct inode_defrag *entry = NULL;
+ struct inode_defrag tmp;
+ struct rb_node *p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ tmp.ino = ino;
+ tmp.root = root;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ p = fs_info->defrag_inodes.rb_node;
+ while (p) {
+ parent = p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(&tmp, entry);
+ if (ret < 0)
+ p = parent->rb_left;
+ else if (ret > 0)
+ p = parent->rb_right;
+ else
+ goto out;
+ }
+
+ if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ parent = rb_next(parent);
+ if (parent)
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+ else
+ entry = NULL;
+ }
+out:
+ if (entry)
+ rb_erase(parent, &fs_info->defrag_inodes);
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return entry;
+}
+
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ node = rb_first(&fs_info->defrag_inodes);
+ while (node) {
+ rb_erase(node, &fs_info->defrag_inodes);
+ defrag = rb_entry(node, struct inode_defrag, rb_node);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+ if (need_resched()) {
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ cond_resched();
+ spin_lock(&fs_info->defrag_inodes_lock);
+ }
+
+ node = rb_first(&fs_info->defrag_inodes);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH 1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *inode_root;
+ struct inode *inode;
+ struct btrfs_key key;
+ struct btrfs_ioctl_defrag_range_args range;
+ int num_defrag;
+ int index;
+ int ret;
+
+ /* get the inode */
+ key.objectid = defrag->root;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(inode_root)) {
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+
+ key.objectid = defrag->ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto cleanup;
+ }
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ /* do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ memset(&range, 0, sizeof(range));
+ range.len = (u64)-1;
+ range.start = defrag->last_offset;
+
+ sb_start_write(fs_info->sb);
+ num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ BTRFS_DEFRAG_BATCH);
+ sb_end_write(fs_info->sb);
+ /*
+ * if we filled the whole defrag batch, there
+ * must be more work to do. Queue this defrag
+ * again
+ */
+ if (num_defrag == BTRFS_DEFRAG_BATCH) {
+ defrag->last_offset = range.start;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else if (defrag->last_offset && !defrag->cycled) {
+ /*
+ * we didn't fill our defrag batch, but
+ * we didn't start at zero. Make sure we loop
+ * around to the start of the file.
+ */
+ defrag->last_offset = 0;
+ defrag->cycled = 1;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+
+ iput(inode);
+ return 0;
+cleanup:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
+}
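
The tail of __btrfs_run_defrag_inode() above decides whether the inode goes back on the queue: a full batch means more work probably remains, and a short batch that did not start at offset zero earns exactly one wrap-around pass from the start of the file. A compact sketch of that decision; the names are illustrative and the batch size is passed in rather than taken from BTRFS_DEFRAG_BATCH:

struct defrag_cursor {
        unsigned long long last_offset;
        int cycled;
};

/* returns 1 when the record should be queued again, 0 when we are done */
static int defrag_should_requeue(struct defrag_cursor *c,
                                 int defragged, int batch,
                                 unsigned long long next_offset)
{
        if (defragged == batch) {
                c->last_offset = next_offset;  /* continue from where we stopped */
                return 1;
        }
        if (c->last_offset && !c->cycled) {
                c->last_offset = 0;            /* wrap to the start of the file, once */
                c->cycled = 1;
                return 1;
        }
        return 0;
}
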
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ u64 first_ino = 0;
+ u64 root_objectid = 0;
+
+ atomic_inc(&fs_info->defrag_running);
+ while (1) {
+ /* Pause the auto defragger. */
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING,
+ &fs_info->fs_state))
+ break;
+
+ if (!__need_auto_defrag(fs_info->tree_root))
+ break;
+
+ /* find an inode to defrag */
+ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+ first_ino);
+ if (!defrag) {
+ if (root_objectid || first_ino) {
+ root_objectid = 0;
+ first_ino = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ first_ino = defrag->ino + 1;
+ root_objectid = defrag->root;
+
+ __btrfs_run_defrag_inode(fs_info, defrag);
+ }
+ atomic_dec(&fs_info->defrag_running);
+
+ /*
+ * during unmount, we use the transaction_wait queue to
+ * wait for the defragger to stop
+ */
+ wake_up(&fs_info->transaction_wait);
+ return 0;
+}
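
btrfs_run_defrag_inodes() above walks every queued record by asking for the smallest (root, ino) pair at or after a cursor and then advancing the cursor to (root, ino + 1). A self-contained sketch of that traversal over a sorted array instead of the rb-tree; the kernel loop additionally restarts once from (0, 0) because records can be queued concurrently, which this sketch omits:

#include <stddef.h>

struct pair { unsigned long long root, ino; };

static int pair_cmp(const struct pair *a, const struct pair *b)
{
        if (a->root != b->root)
                return a->root > b->root ? 1 : -1;
        if (a->ino != b->ino)
                return a->ino > b->ino ? 1 : -1;
        return 0;
}

/* first entry >= key in a sorted array, or NULL when the cursor ran off the end */
static const struct pair *pick_next(const struct pair *set, size_t n,
                                    const struct pair *key)
{
        for (size_t i = 0; i < n; i++)
                if (pair_cmp(&set[i], key) >= 0)
                        return &set[i];
        return NULL;
}

static void walk_all(const struct pair *set, size_t n,
                     void (*handle)(const struct pair *))
{
        struct pair cursor = { 0, 0 };
        const struct pair *cur;

        while ((cur = pick_next(set, n, &cursor)) != NULL) {
                handle(cur);
                cursor.root = cur->root;
                cursor.ino = cur->ino + 1;  /* resume just past this record */
        }
}
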
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
*/
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
- int write_bytes,
+ size_t write_bytes,
struct page **prepared_pages,
struct iov_iter *i)
{
size_t copied = 0;
+ size_t total_copied = 0;
int pg = 0;
int offset = pos & (PAGE_CACHE_SIZE - 1);
- int total_copied = 0;
while (write_bytes > 0) {
size_t count = min_t(size_t,
@@ -60,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
struct page *page = prepared_pages[pg];
/*
* Copy data from userspace to the current page
- *
- * Disable pagefault to avoid recursive lock since
- * the pages are already locked
*/
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
- pagefault_enable();
/* Flush processor's dcache for this page */
flush_dcache_page(page);
@@ -87,10 +448,9 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
write_bytes -= copied;
total_copied += copied;
- /* Return to btrfs_file_aio_write to fault page */
- if (unlikely(copied == 0)) {
+ /* Return to btrfs_file_write_iter to fault page */
+ if (unlikely(copied == 0))
break;
- }
if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
offset += copied;
@@ -105,19 +465,18 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
/*
* unlocks pages after btrfs_file_write is done with them
*/
-static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
size_t i;
for (i = 0; i < num_pages; i++) {
- if (!pages[i])
- break;
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
- * clear it here
+ * clear it here. There should be no need to mark the pages
+ * accessed here, as prepare_pages should already have marked them
+ * accessed via find_or_create_page()
*/
ClearPageChecked(pages[i]);
unlock_page(pages[i]);
- mark_page_accessed(pages[i]);
page_cache_release(pages[i]);
}
}
@@ -130,17 +489,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
-static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct file *file,
- struct page **pages,
- size_t num_pages,
- loff_t pos,
- size_t write_bytes)
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+ struct page **pages, size_t num_pages,
+ loff_t pos, size_t write_bytes,
+ struct extent_state **cached)
{
int err = 0;
int i;
- struct inode *inode = fdentry(file)->d_inode;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
@@ -148,13 +503,13 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
loff_t isize = i_size_read(inode);
start_pos = pos & ~((u64)root->sectorsize - 1);
- num_bytes = (write_bytes + pos - start_pos +
- root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
- NULL);
- BUG_ON(err);
+ cached);
+ if (err)
+ return err;
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
@@ -162,13 +517,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
ClearPageChecked(p);
set_page_dirty(p);
}
- if (end_pos > isize) {
+
+ /*
+ * we've only changed i_size in ram, and we haven't updated
+ * the disk i_size. There is no need to log the inode
+ * at this time.
+ */
+ if (end_pos > isize)
i_size_write(inode, end_pos);
- /* we've only changed i_size in ram, and we haven't updated
- * the disk i_size. There is no need to log the inode
- * at this time.
- */
- }
return 0;
}
@@ -176,18 +532,20 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
* this drops all the extents in the cache that intersect the range
* [start, end]. Existing extents are split as required.
*/
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
- int skip_pinned)
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned)
{
struct extent_map *em;
struct extent_map *split = NULL;
struct extent_map *split2 = NULL;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 len = end - start + 1;
+ u64 gen;
int ret;
int testend = 1;
unsigned long flags;
int compressed = 0;
+ bool modified;
WARN_ON(end < start);
if (end == (u64)-1) {
@@ -195,11 +553,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
testend = 0;
}
while (1) {
+ int no_splits = 0;
+
+ modified = false;
if (!split)
- split = alloc_extent_map(GFP_NOFS);
+ split = alloc_extent_map();
if (!split2)
- split2 = alloc_extent_map(GFP_NOFS);
- BUG_ON(!split || !split2);
+ split2 = alloc_extent_map();
+ if (!split || !split2)
+ no_splits = 1;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -208,6 +570,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
break;
}
flags = em->flags;
+ gen = em->generation;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
if (testend && em->start + em->len >= start + len) {
free_extent_map(em);
@@ -223,31 +586,44 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
}
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- remove_extent_mapping(em_tree, em);
+ clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
+ if (no_splits)
+ goto next;
- if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- em->start < start) {
+ if (em->start < start) {
split->start = em->start;
split->len = start - em->start;
- split->orig_start = em->orig_start;
- split->block_start = em->block_start;
- if (compressed)
- split->block_len = em->block_len;
- else
- split->block_len = split->len;
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
+ } else {
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
+ split->ram_bytes = split->len;
+ }
+ split->generation = gen;
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
- ret = add_extent_mapping(em_tree, split);
- BUG_ON(ret);
+ replace_extent_mapping(em_tree, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
}
- if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- testend && em->start + em->len > start + len) {
+ if (testend && em->start + em->len > start + len) {
u64 diff = start + len - em->start;
split->start = start + len;
@@ -255,22 +631,45 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
-
- if (compressed) {
- split->block_len = em->block_len;
- split->block_start = em->block_start;
- split->orig_start = em->orig_start;
+ split->generation = gen;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
+
+ split->ram_bytes = em->ram_bytes;
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->block_start = em->block_start;
+ split->orig_start = em->orig_start;
+ } else {
+ split->block_len = split->len;
+ split->block_start = em->block_start
+ + diff;
+ split->orig_start = em->orig_start;
+ }
} else {
- split->block_len = split->len;
- split->block_start = em->block_start + diff;
+ split->ram_bytes = split->len;
split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
}
- ret = add_extent_mapping(em_tree, split);
- BUG_ON(ret);
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ ASSERT(ret == 0); /* Logic error */
+ }
free_extent_map(split);
split = NULL;
}
+next:
+ if (extent_map_in_tree(em))
+ remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for us */
@@ -282,7 +681,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
free_extent_map(split);
if (split2)
free_extent_map(split2);
- return 0;
}
/*
@@ -294,15 +692,19 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
*/
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
- u64 start, u64 end, u64 *hint_byte, int drop_cache)
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache,
+ int replace_extent,
+ u32 extent_item_size,
+ int *key_inserted)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
- struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key new_key;
+ u64 ino = btrfs_ino(inode);
u64 search_start = start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
@@ -313,28 +715,34 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
int extent_type;
int recow;
int ret;
+ int modify_tree = -1;
+ int update_refs;
+ int found = 0;
+ int leafs_visited = 0;
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
+ modify_tree = 0;
+ update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root);
while (1) {
recow = 0;
- ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
- search_start, -1);
+ ret = btrfs_lookup_file_extent(trans, root, path, ino,
+ search_start, modify_tree);
if (ret < 0)
break;
if (ret > 0 && path->slots[0] > 0 && search_start == start) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
- if (key.objectid == inode->i_ino &&
+ if (key.objectid == ino &&
key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
ret = 0;
+ leafs_visited++;
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -346,12 +754,13 @@ next_slot:
ret = 0;
break;
}
+ leafs_visited++;
leaf = path->nodes[0];
recow = 1;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid > inode->i_ino ||
+ if (key.objectid > ino ||
key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
break;
@@ -368,20 +777,35 @@ next_slot:
btrfs_file_extent_num_bytes(leaf, fi);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = key.offset +
- btrfs_file_extent_inline_len(leaf, fi);
+ btrfs_file_extent_inline_len(leaf,
+ path->slots[0], fi);
} else {
WARN_ON(1);
extent_end = search_start;
}
+ /*
+ * Don't skip extent items representing 0 byte lengths. They
+ * used to be created (due to a bug) when we hit an -ENOSPC
+ * condition while punching holes. So if we find one here, make
+ * sure we delete it; otherwise we would insert a new file extent
+ * item with the same key (offset) as that 0 byte length file
+ * extent item in the call to setup_items_for_insert() later
+ * in this function.
+ */
+ if (extent_end == key.offset && extent_end >= search_start)
+ goto delete_extent_item;
+
if (extent_end <= search_start) {
path->slots[0]++;
goto next_slot;
}
+ found = 1;
search_start = max(key.offset, start);
- if (recow) {
- btrfs_release_path(root, path);
+ if (recow || !modify_tree) {
+ modify_tree = -1;
+ btrfs_release_path(path);
continue;
}
@@ -391,14 +815,17 @@ next_slot:
*/
if (start > key.offset && end < extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = start;
ret = btrfs_duplicate_item(trans, root, path,
&new_key);
if (ret == -EAGAIN) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
continue;
}
if (ret < 0)
@@ -419,14 +846,13 @@ next_slot:
extent_end - start);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0) {
ret = btrfs_inc_extent_ref(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset);
- BUG_ON(ret);
- *hint_byte = disk_bytenr;
+ start - extent_offset, 1);
+ BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
}
@@ -435,21 +861,22 @@ next_slot:
* | -------- extent -------- |
*/
if (start <= key.offset && end < extent_end) {
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
+ btrfs_set_item_key_safe(root, path, &new_key);
extent_offset += end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - end);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, end - key.offset);
- *hint_byte = disk_bytenr;
- }
break;
}
@@ -460,15 +887,16 @@ next_slot:
*/
if (start > key.offset && end >= extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, extent_end - start);
- *hint_byte = disk_bytenr;
- }
if (end == extent_end)
break;
@@ -481,6 +909,7 @@ next_slot:
* | ------ extent ------ |
*/
if (start <= key.offset && end >= extent_end) {
+delete_extent_item:
if (del_nr == 0) {
del_slot = path->slots[0];
del_nr = 1;
@@ -489,21 +918,21 @@ next_slot:
del_nr++;
}
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ if (update_refs &&
+ extent_type == BTRFS_FILE_EXTENT_INLINE) {
inode_sub_bytes(inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
root->sectorsize);
- } else if (disk_bytenr > 0) {
+ } else if (update_refs && disk_bytenr > 0) {
ret = btrfs_free_extent(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
- extent_offset);
- BUG_ON(ret);
+ extent_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
- *hint_byte = disk_bytenr;
}
if (end == extent_end)
@@ -516,23 +945,83 @@ next_slot:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ break;
+ }
del_nr = 0;
del_slot = 0;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
continue;
}
BUG_ON(1);
}
- if (del_nr > 0) {
+ if (!ret && del_nr > 0) {
+ /*
+ * Set path->slots[0] to first slot, so that after the delete
+ * if items are moved off from our leaf to its immediate left or
+ * right neighbor leaves, we end up with a correct and adjusted
+ * path->slots[0] for our insertion (if replace_extent != 0).
+ */
+ path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
}
+ leaf = path->nodes[0];
+ /*
+ * If btrfs_del_items() was called, it might have deleted a leaf, in
+ * which case it unlocked our path, so check path->locks[0] matches a
+ * write lock.
+ */
+ if (!ret && replace_extent && leafs_visited == 1 &&
+ (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+ path->locks[0] == BTRFS_WRITE_LOCK) &&
+ btrfs_leaf_free_space(root, leaf) >=
+ sizeof(struct btrfs_item) + extent_item_size) {
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+ if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+ struct btrfs_key slot_key;
+
+ btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+ if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+ path->slots[0]++;
+ }
+ setup_items_for_insert(root, path, &key,
+ &extent_item_size,
+ extent_item_size,
+ sizeof(struct btrfs_item) +
+ extent_item_size, 1);
+ *key_inserted = 1;
+ }
+
+ if (!replace_extent || !(*key_inserted))
+ btrfs_release_path(path);
+ if (drop_end)
+ *drop_end = found ? min(end, extent_end) : end;
+ return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
+ drop_cache, 0, 0, NULL);
btrfs_free_path(path);
return ret;
}
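
__btrfs_drop_extents(), used by the wrapper above, and the ASCII diagrams inside it cover four ways the drop range can overlap an existing file extent: strictly inside (split the item in two), overlapping the front (move the key offset forward), overlapping the back (shrink num_bytes), and fully covering (delete the item). The same case analysis as a standalone classifier, treating both ranges as half-open [start, end); the enum names are illustrative only:

enum drop_case {
        DROP_NONE,        /* ranges do not overlap */
        DROP_SPLIT,       /* drop range strictly inside: split the extent */
        DROP_TRIM_FRONT,  /* overlap at the front: bump the extent's start */
        DROP_TRIM_BACK,   /* overlap at the back: shrink the extent */
        DROP_WHOLE,       /* extent fully covered: delete the item */
};

static enum drop_case classify_drop(unsigned long long start,
                                    unsigned long long end,
                                    unsigned long long ext_start,
                                    unsigned long long ext_end)
{
        if (end <= ext_start || start >= ext_end)
                return DROP_NONE;
        if (start > ext_start && end < ext_end)
                return DROP_SPLIT;
        if (start <= ext_start && end < ext_end)
                return DROP_TRIM_FRONT;
        if (start > ext_start && end >= ext_end)
                return DROP_TRIM_BACK;
        return DROP_WHOLE;
}
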
@@ -597,26 +1086,27 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int del_slot = 0;
int recow;
int ret;
-
- btrfs_drop_extent_cache(inode, start, end - 1, 0);
+ u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
again:
recow = 0;
split = start;
- key.objectid = inode->i_ino;
+ key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = split;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
if (ret > 0 && path->slots[0] > 0)
path->slots[0]--;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- BUG_ON(key.objectid != inode->i_ino ||
- key.type != BTRFS_EXTENT_DATA_KEY);
+ BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
BUG_ON(btrfs_file_extent_type(leaf, fi) !=
@@ -633,18 +1123,22 @@ again:
other_start = 0;
other_end = start;
if (extent_mergeable(leaf, path->slots[0] - 1,
- inode->i_ino, bytenr, orig_offset,
+ ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
+ btrfs_set_item_key_safe(root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - end);
btrfs_set_file_extent_offset(leaf, fi,
end - orig_offset);
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
btrfs_mark_buffer_dirty(leaf);
@@ -656,18 +1150,22 @@ again:
other_start = end;
other_end = 0;
if (extent_mergeable(leaf, path->slots[0] + 1,
- inode->i_ino, bytenr, orig_offset,
+ ino, bytenr, orig_offset,
&other_start, &other_end)) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
path->slots[0]++;
new_key.offset = start;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
+ btrfs_set_item_key_safe(root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
@@ -684,20 +1182,25 @@ again:
new_key.offset = split;
ret = btrfs_duplicate_item(trans, root, path, &new_key);
if (ret == -EAGAIN) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
split - key.offset);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
@@ -705,8 +1208,8 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- inode->i_ino, orig_offset);
- BUG_ON(ret);
+ ino, orig_offset, 1);
+ BUG_ON(ret); /* -ENOMEM */
if (split == start) {
key.offset = start;
@@ -721,10 +1224,10 @@ again:
other_start = end;
other_end = 0;
if (extent_mergeable(leaf, path->slots[0] + 1,
- inode->i_ino, bytenr, orig_offset,
+ ino, bytenr, orig_offset,
&other_start, &other_end)) {
if (recow) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
extent_end = other_end;
@@ -732,16 +1235,16 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- inode->i_ino, orig_offset);
- BUG_ON(ret);
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
other_start = 0;
other_end = start;
if (extent_mergeable(leaf, path->slots[0] - 1,
- inode->i_ino, bytenr, orig_offset,
+ ino, bytenr, orig_offset,
&other_start, &other_end)) {
if (recow) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
key.offset = other_start;
@@ -749,26 +1252,31 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- inode->i_ino, orig_offset);
- BUG_ON(ret);
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
}
if (del_nr == 0) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
btrfs_mark_buffer_dirty(leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
out:
btrfs_free_path(path);
@@ -779,11 +1287,13 @@ out:
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
-static int prepare_uptodate_page(struct page *page, u64 pos)
+static int prepare_uptodate_page(struct page *page, u64 pos,
+ bool force_uptodate)
{
int ret = 0;
- if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
+ if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
+ !PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
if (ret)
return ret;
@@ -797,37 +1307,21 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
}
/*
- * this gets pages into the page cache and locks them down, it also properly
- * waits for data=ordered extents to finish before allowing the pages to be
- * modified.
+ * this just gets pages into the page cache and locks them down.
*/
-static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
- struct page **pages, size_t num_pages,
- loff_t pos, unsigned long first_index,
- unsigned long last_index, size_t write_bytes)
+static noinline int prepare_pages(struct inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos,
+ size_t write_bytes, bool force_uptodate)
{
- struct extent_state *cached_state = NULL;
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
- struct inode *inode = fdentry(file)->d_inode;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
- int faili = 0;
- u64 start_pos;
- u64 last_pos;
-
- start_pos = pos & ~((u64)root->sectorsize - 1);
- last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
-
- if (start_pos > inode->i_size) {
- err = btrfs_cont_expand(inode, start_pos);
- if (err)
- return err;
- }
+ int faili;
- memset(pages, 0, num_pages * sizeof(struct page *));
-again:
for (i = 0; i < num_pages; i++) {
- pages[i] = grab_cache_page(inode->i_mapping, index + i);
+ pages[i] = find_or_create_page(inode->i_mapping, index + i,
+ mask | __GFP_WRITE);
if (!pages[i]) {
faili = i - 1;
err = -ENOMEM;
@@ -835,10 +1329,11 @@ again:
}
if (i == 0)
- err = prepare_uptodate_page(pages[i], pos);
+ err = prepare_uptodate_page(pages[i], pos,
+ force_uptodate);
if (i == num_pages - 1)
err = prepare_uptodate_page(pages[i],
- pos + write_bytes);
+ pos + write_bytes, false);
if (err) {
page_cache_release(pages[i]);
faili = i - 1;
@@ -846,206 +1341,242 @@ again:
}
wait_on_page_writeback(pages[i]);
}
- err = 0;
+
+ return 0;
+fail:
+ while (faili >= 0) {
+ unlock_page(pages[faili]);
+ page_cache_release(pages[faili]);
+ faili--;
+ }
+ return err;
+
+}
+
+/*
+ * This function locks the extent and properly waits for data=ordered extents
+ * to finish before allowing the pages to be modified if needed.
+ *
+ * The return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - need to re-prepare the pages
+ * any other negative value - something went wrong
+ */
+static noinline int
+lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos,
+ u64 *lockstart, u64 *lockend,
+ struct extent_state **cached_state)
+{
+ u64 start_pos;
+ u64 last_pos;
+ int i;
+ int ret = 0;
+
+ start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+ last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
+
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, 0, &cached_state,
- GFP_NOFS);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- last_pos - 1);
+ start_pos, last_pos, 0, cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start_pos,
+ last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->len > start_pos &&
- ordered->file_offset < last_pos) {
- btrfs_put_ordered_extent(ordered);
+ ordered->file_offset <= last_pos) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1,
- &cached_state, GFP_NOFS);
+ start_pos, last_pos,
+ cached_state, GFP_NOFS);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- btrfs_wait_ordered_range(inode, start_pos,
- last_pos - start_pos);
- goto again;
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ return -EAGAIN;
}
if (ordered)
btrfs_put_ordered_extent(ordered);
clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
- GFP_NOFS);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, &cached_state,
- GFP_NOFS);
+ last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, cached_state, GFP_NOFS);
+ *lockstart = start_pos;
+ *lockend = last_pos;
+ ret = 1;
}
+
for (i = 0; i < num_pages; i++) {
- clear_page_dirty_for_io(pages[i]);
+ if (clear_page_dirty_for_io(pages[i]))
+ account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
- return 0;
-fail:
- while (faili >= 0) {
- unlock_page(pages[faili]);
- page_cache_release(pages[faili]);
- faili--;
- }
- return err;
+ return ret;
}
-static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+ size_t *write_bytes)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct page **pages = NULL;
- struct iov_iter i;
- loff_t *ppos = &iocb->ki_pos;
- loff_t start_pos;
- ssize_t num_written = 0;
- ssize_t err = 0;
- size_t count;
- size_t ocount;
- int ret = 0;
- int nrptrs;
- unsigned long first_index;
- unsigned long last_index;
- int will_write;
- int buffered = 0;
- int copied = 0;
- int dirty_pages = 0;
-
- will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
- (file->f_flags & O_DIRECT));
-
- start_pos = pos;
-
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
- mutex_lock(&inode->i_mutex);
-
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err)
- goto out;
- count = ocount;
+ struct btrfs_ordered_extent *ordered;
+ u64 lockstart, lockend;
+ u64 num_bytes;
+ int ret;
- current->backing_dev_info = inode->i_mapping->backing_dev_info;
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
- goto out;
+ ret = btrfs_start_nocow_write(root);
+ if (!ret)
+ return -ENOSPC;
- if (count == 0)
- goto out;
+ lockstart = round_down(pos, root->sectorsize);
+ lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
- err = file_remove_suid(file);
- if (err)
- goto out;
+ while (1) {
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (!ordered) {
+ break;
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
- /*
- * If BTRFS flips readonly due to some impossible error
- * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
- * although we have opened a file as writable, we have
- * to stop this write operation to ensure FS consistency.
- */
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
- err = -EROFS;
- goto out;
+ num_bytes = lockend - lockstart + 1;
+ ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
+ if (ret <= 0) {
+ ret = 0;
+ btrfs_end_nocow_write(root);
+ } else {
+ *write_bytes = min_t(size_t, *write_bytes,
+ num_bytes - pos + lockstart);
}
- file_update_time(file);
- BTRFS_I(inode)->sequence++;
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
- if (unlikely(file->f_flags & O_DIRECT)) {
- num_written = generic_file_direct_write(iocb, iov, &nr_segs,
- pos, ppos, count,
- ocount);
- /*
- * the generic O_DIRECT will update in-memory i_size after the
- * DIOs are done. But our endio handlers that update the on
- * disk i_size never update past the in memory i_size. So we
- * need one more update here to catch any additions to the
- * file
- */
- if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- mark_inode_dirty(inode);
- }
+ return ret;
+}
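
check_can_nocow() above rounds the byte range of the write out to sector boundaries before locking it, and afterwards clamps write_bytes to however much of that range can really be written without COW. A worked sketch of the rounding and the clamp, assuming a 4096 byte sector size; nocow_bytes plays the role of the length can_nocow_extent() reports back, and all names are illustrative:

#include <stdint.h>

static uint64_t round_down_u64(uint64_t x, uint64_t a) { return x - (x % a); }
static uint64_t round_up_u64(uint64_t x, uint64_t a)   { return round_down_u64(x + a - 1, a); }

static uint64_t clamp_nocow_write(uint64_t pos, uint64_t write_bytes,
                                  uint64_t sectorsize, uint64_t nocow_bytes)
{
        uint64_t lockstart = round_down_u64(pos, sectorsize);
        uint64_t usable = nocow_bytes - (pos - lockstart); /* NOCOW bytes from pos on */

        return write_bytes < usable ? write_bytes : usable;
}

/*
 * Example: pos = 6144, write_bytes = 10000, sectorsize = 4096 gives
 * lockstart = 4096 and lockend = round_up(16144, 4096) - 1 = 16383;
 * if only 8192 bytes are NOCOW-able from lockstart, the write is
 * clamped to 8192 - (6144 - 4096) = 6144 bytes.
 */
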
- if (num_written < 0) {
- ret = num_written;
- num_written = 0;
- goto out;
- } else if (num_written == count) {
- /* pick up pos changes done by the generic code */
- pos = *ppos;
- goto out;
- }
- /*
- * We are going to do buffered for the rest of the range, so we
- * need to make sure to invalidate the buffered pages when we're
- * done.
- */
- buffered = 1;
- pos += num_written;
- }
+static noinline ssize_t __btrfs_buffered_write(struct file *file,
+ struct iov_iter *i,
+ loff_t pos)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page **pages = NULL;
+ struct extent_state *cached_state = NULL;
+ u64 release_bytes = 0;
+ u64 lockstart;
+ u64 lockend;
+ unsigned long first_index;
+ size_t num_written = 0;
+ int nrptrs;
+ int ret = 0;
+ bool only_release_metadata = false;
+ bool force_page_uptodate = false;
+ bool need_unlock;
- iov_iter_init(&i, iov, nr_segs, count, num_written);
- nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+ nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
(sizeof(struct page *)));
+ nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+ nrptrs = max(nrptrs, 8);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* generic_write_checks can change our pos */
- start_pos = pos;
+ if (!pages)
+ return -ENOMEM;
first_index = pos >> PAGE_CACHE_SHIFT;
- last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
- while (iov_iter_count(&i) > 0) {
+ while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
- size_t write_bytes = min(iov_iter_count(&i),
+ size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
size_t num_pages = (write_bytes + offset +
PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t reserve_bytes;
+ size_t dirty_pages;
+ size_t copied;
WARN_ON(num_pages > nrptrs);
- memset(pages, 0, sizeof(struct page *) * nrptrs);
/*
* Fault pages before locking them in prepare_pages
* to avoid recursive lock
*/
- if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
+ if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
ret = -EFAULT;
- goto out;
+ break;
+ }
+
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = btrfs_check_data_free_space(inode, reserve_bytes);
+ if (ret == -ENOSPC &&
+ (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC))) {
+ ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret > 0) {
+ only_release_metadata = true;
+ /*
+ * our prealloc extent may be smaller than
+ * write_bytes, so scale down.
+ */
+ num_pages = (write_bytes + offset +
+ PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = 0;
+ } else {
+ ret = -ENOSPC;
+ }
}
- ret = btrfs_delalloc_reserve_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
if (ret)
- goto out;
+ break;
- ret = prepare_pages(root, file, pages, num_pages,
- pos, first_index, last_index,
- write_bytes);
+ ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
- btrfs_delalloc_release_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
- goto out;
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode,
+ reserve_bytes);
+ else
+ btrfs_end_nocow_write(root);
+ break;
+ }
+
+ release_bytes = reserve_bytes;
+ need_unlock = false;
+again:
+ /*
+ * This is going to setup the pages array with the number of
+ * pages we want, so we don't really need to worry about the
+ * contents of pages from loop to loop
+ */
+ ret = prepare_pages(inode, pages, num_pages,
+ pos, write_bytes,
+ force_page_uptodate);
+ if (ret)
+ break;
+
+ ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
+ pos, &lockstart, &lockend,
+ &cached_state);
+ if (ret < 0) {
+ if (ret == -EAGAIN)
+ goto again;
+ break;
+ } else if (ret > 0) {
+ need_unlock = true;
+ ret = 0;
}
copied = btrfs_copy_from_user(pos, num_pages,
- write_bytes, pages, &i);
+ write_bytes, pages, i);
/*
* if we have trouble faulting in the pages, fall
@@ -1054,56 +1585,225 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
if (copied < write_bytes)
nrptrs = 1;
- if (copied == 0)
+ if (copied == 0) {
+ force_page_uptodate = true;
dirty_pages = 0;
- else
+ } else {
+ force_page_uptodate = false;
dirty_pages = (copied + offset +
PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT;
+ }
+ /*
+ * If we had a short copy we need to release the excess delalloc
+ * bytes we reserved. We need to increment outstanding_extents
+ * because btrfs_delalloc_release_space will decrement it, but
+ * we still have an outstanding extent for the chunk we actually
+ * managed to copy.
+ */
if (num_pages > dirty_pages) {
- if (copied > 0)
- atomic_inc(
- &BTRFS_I(inode)->outstanding_extents);
- btrfs_delalloc_release_space(inode,
- (num_pages - dirty_pages) <<
- PAGE_CACHE_SHIFT);
+ release_bytes = (num_pages - dirty_pages) <<
+ PAGE_CACHE_SHIFT;
+ if (copied > 0) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode,
+ release_bytes);
+ else
+ btrfs_delalloc_release_space(inode,
+ release_bytes);
}
- if (copied > 0) {
- dirty_and_release_pages(NULL, root, file, pages,
- dirty_pages, pos, copied);
+ release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
+
+ if (copied > 0)
+ ret = btrfs_dirty_pages(root, inode, pages,
+ dirty_pages, pos, copied,
+ NULL);
+ if (need_unlock)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state,
+ GFP_NOFS);
+ if (ret) {
+ btrfs_drop_pages(pages, num_pages);
+ break;
}
- btrfs_drop_pages(pages, num_pages);
+ release_bytes = 0;
+ if (only_release_metadata)
+ btrfs_end_nocow_write(root);
- if (copied > 0) {
- if (will_write) {
- filemap_fdatawrite_range(inode->i_mapping, pos,
- pos + copied - 1);
- } else {
- balance_dirty_pages_ratelimited_nr(
- inode->i_mapping,
- dirty_pages);
- if (dirty_pages <
- (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
- btrfs_btree_balance_dirty(root, 1);
- btrfs_throttle(root);
- }
+ if (only_release_metadata && copied > 0) {
+ u64 lockstart = round_down(pos, root->sectorsize);
+ u64 lockend = lockstart +
+ (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+ set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_NORESERVE, NULL,
+ NULL, GFP_NOFS);
+ only_release_metadata = false;
}
+ btrfs_drop_pages(pages, num_pages);
+
+ cond_resched();
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ btrfs_btree_balance_dirty(root);
+
pos += copied;
num_written += copied;
+ }
- cond_resched();
+ kfree(pages);
+
+ if (release_bytes) {
+ if (only_release_metadata) {
+ btrfs_end_nocow_write(root);
+ btrfs_delalloc_release_metadata(inode, release_bytes);
+ } else {
+ btrfs_delalloc_release_space(inode, release_bytes);
+ }
+ }
+
+ return num_written ? num_written : ret;
+}
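
When the copy from user space comes up short, the loop above releases the reservation for pages that will never be dirtied: dirty_pages is how many pages `copied` bytes starting at `offset` actually touch, and everything reserved beyond that is handed back. The same arithmetic in plain C, assuming 4096 byte pages; names are illustrative:

#include <stddef.h>

#define SKETCH_PAGE_SIZE 4096u

static size_t pages_dirtied(size_t copied, size_t offset_in_first_page)
{
        if (copied == 0)
                return 0;
        return (copied + offset_in_first_page + SKETCH_PAGE_SIZE - 1) /
               SKETCH_PAGE_SIZE;
}

static size_t bytes_to_release(size_t reserved_pages, size_t dirty_pages)
{
        if (reserved_pages <= dirty_pages)
                return 0;
        return (reserved_pages - dirty_pages) * SKETCH_PAGE_SIZE;
}

/*
 * Example: a write that reserved 4 pages but faulted after copying 5000
 * bytes at offset 100 dirtied pages_dirtied(5000, 100) = 2 pages, so
 * bytes_to_release(4, 2) = 8192 bytes go back to the reservation.
 */
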
+
+static ssize_t __btrfs_direct_write(struct kiocb *iocb,
+ struct iov_iter *from,
+ loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t written;
+ ssize_t written_buffered;
+ loff_t endbyte;
+ int err;
+
+ written = generic_file_direct_write(iocb, from, pos);
+
+ if (written < 0 || !iov_iter_count(from))
+ return written;
+
+ pos += written;
+ written_buffered = __btrfs_buffered_write(file, from, pos);
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
}
+ endbyte = pos + written_buffered - 1;
+ err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ if (err)
+ goto out;
+ written += written_buffered;
+ iocb->ki_pos = pos + written_buffered;
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
out:
- mutex_unlock(&inode->i_mutex);
- if (ret)
- err = ret;
+ return written ? written : err;
+}
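
__btrfs_direct_write() above falls back to the buffered path for whatever the O_DIRECT write could not complete, flushes that tail, and then drops the now stale page cache over it. A small sketch of the byte and page-index accounting involved, assuming 4096 byte pages; the struct and function names are illustrative:

#include <stdint.h>

#define SKETCH_PAGE_SHIFT 12  /* 4096 byte pages assumed */

struct dio_fallback {
        int64_t total_written;  /* direct part plus buffered tail */
        int64_t new_pos;        /* resulting file position */
        int64_t first_page;     /* page range to invalidate ... */
        int64_t last_page;      /* ... after writing the tail back */
};

static struct dio_fallback direct_then_buffered(int64_t pos,
                                                int64_t direct_written,
                                                int64_t buffered_written)
{
        struct dio_fallback r;
        int64_t buffered_pos = pos + direct_written;        /* the tail starts here */
        int64_t endbyte = buffered_pos + buffered_written - 1;

        r.total_written = direct_written + buffered_written;
        r.new_pos = buffered_pos + buffered_written;
        r.first_page = buffered_pos >> SKETCH_PAGE_SHIFT;
        r.last_page = endbyte >> SKETCH_PAGE_SHIFT;
        return r;
}
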
- kfree(pages);
- *ppos = pos;
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_fs_time(inode->i_sb);
+ if (!timespec_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ if (!timespec_equal(&inode->i_ctime, &now))
+ inode->i_ctime = now;
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 start_pos;
+ u64 end_pos;
+ ssize_t num_written = 0;
+ ssize_t err = 0;
+ size_t count = iov_iter_count(from);
+ bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+ loff_t pos = iocb->ki_pos;
+
+ mutex_lock(&inode->i_mutex);
+
+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+
+ if (count == 0) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+
+ iov_iter_truncate(from, count);
+
+ err = file_remove_suid(file);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+
+ /*
+ * If BTRFS flips readonly due to some impossible error
+ * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+ * although we have opened a file as writable, we have
+ * to stop this write operation to ensure FS consistency.
+ */
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ mutex_unlock(&inode->i_mutex);
+ err = -EROFS;
+ goto out;
+ }
+
+ /*
+	 * We reserve space for updating the inode when we reserve space for the
+	 * extent we are going to write, so any ENOSPC is reported there. We
+	 * don't need to start yet another transaction to update the inode as
+	 * we will update the inode when we finish writing whatever data we
+	 * write.
+ */
+ update_time_for_write(inode);
+
+ start_pos = round_down(pos, root->sectorsize);
+ if (start_pos > i_size_read(inode)) {
+ /* Expand hole size to cover write data, preventing empty gap */
+ end_pos = round_up(pos + count, root->sectorsize);
+ err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ }
+
+ if (sync)
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ num_written = __btrfs_direct_write(iocb, from, pos);
+ } else {
+ num_written = __btrfs_buffered_write(file, from, pos);
+ if (num_written > 0)
+ iocb->ki_pos = pos + num_written;
+ }
+
+ mutex_unlock(&inode->i_mutex);
/*
* we want to make sure fsync finds this change
@@ -1116,45 +1816,22 @@ out:
* this will either be one more than the running transaction
* or the generation used for the next transaction if there isn't
* one running right now.
+ *
+ * We also have to set last_sub_trans to the current log transid,
+ * otherwise subsequent syncs to a file that's been synced in this
+	 * transaction will appear to have already occurred.
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
-
- if (num_written > 0 && will_write) {
- struct btrfs_trans_handle *trans;
-
- err = btrfs_wait_ordered_range(inode, start_pos, num_written);
- if (err)
+ BTRFS_I(inode)->last_sub_trans = root->log_transid;
+ if (num_written > 0) {
+ err = generic_write_sync(file, pos, num_written);
+ if (err < 0)
num_written = err;
-
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- num_written = PTR_ERR(trans);
- goto done;
- }
- mutex_lock(&inode->i_mutex);
- ret = btrfs_log_dentry_safe(trans, root,
- file->f_dentry);
- mutex_unlock(&inode->i_mutex);
- if (ret == 0) {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0)
- btrfs_end_transaction(trans, root);
- else
- btrfs_commit_transaction(trans, root);
- } else if (ret != BTRFS_NO_LOG_SYNC) {
- btrfs_commit_transaction(trans, root);
- } else {
- btrfs_end_transaction(trans, root);
- }
- }
- if (file->f_flags & O_DIRECT && buffered) {
- invalidate_mapping_pages(inode->i_mapping,
- start_pos >> PAGE_CACHE_SHIFT,
- (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
- }
}
-done:
+
+ if (sync)
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+out:
current->backing_dev_info = NULL;
return num_written ? num_written : err;
}
@@ -1167,9 +1844,22 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place.
*/
- if (BTRFS_I(inode)->ordered_data_close) {
- BTRFS_I(inode)->ordered_data_close = 0;
- btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags)) {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We need to block on a committing transaction to keep us from
+		 * throwing an ordered operation onto the list and causing
+ * something like sync to deadlock trying to flush out this
+ * inode.
+ */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
+ btrfs_end_transaction(trans, root);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
}
@@ -1189,41 +1879,81 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
* important optimization for directories because holding the mutex prevents
* new operations on the dir while we write to disk.
*/
-int btrfs_sync_file(struct file *file, int datasync)
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
struct btrfs_trans_handle *trans;
+ struct btrfs_log_ctx ctx;
+ int ret = 0;
+ bool full_sync = 0;
+ trace_btrfs_sync_file(file, datasync);
- /* we wait first, since the writeback may change the inode */
- root->log_batch++;
- /* the VFS called filemap_fdatawrite for us */
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- root->log_batch++;
+ /*
+	 * We write the dirty pages in the range and wait until they complete
+	 * outside of the ->i_mutex so that multiple tasks can flush dirty
+	 * pages concurrently, which improves performance. See
+	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
+ */
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+	 * We flush the dirty pages again to make sure that no dirty pages in
+	 * the range are left behind.
+ */
+ atomic_inc(&root->log_batch);
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ }
+ atomic_inc(&root->log_batch);
/*
* check the transaction that last modified this inode
* and see if its already been committed
*/
- if (!BTRFS_I(inode)->last_trans)
+ if (!BTRFS_I(inode)->last_trans) {
+ mutex_unlock(&inode->i_mutex);
goto out;
+ }
/*
* if the last transaction that changed this file was before
* the current transaction, we can bail out now without any
* syncing
*/
- mutex_lock(&root->fs_info->trans_mutex);
- if (BTRFS_I(inode)->last_trans <=
+ smp_mb();
+ if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+ BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
BTRFS_I(inode)->last_trans = 0;
- mutex_unlock(&root->fs_info->trans_mutex);
+
+ /*
+		 * We've had everything committed since the last time we were
+ * modified so clear this flag in case it was set for whatever
+ * reason, it's no longer relevant.
+ */
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ mutex_unlock(&inode->i_mutex);
goto out;
}
- mutex_unlock(&root->fs_info->trans_mutex);
/*
* ok we haven't committed the transaction yet, lets do a commit
@@ -1231,15 +1961,32 @@ int btrfs_sync_file(struct file *file, int datasync)
if (file->private_data)
btrfs_ioctl_trans_end(file);
+ /*
+ * We use start here because we will need to wait on the IO to complete
+ * in btrfs_sync_log, which could require joining a transaction (for
+ * example checking cross references in the nocow path). If we use join
+ * here we could get into a situation where we're waiting on IO to
+ * happen that is blocked on a transaction trying to commit. With start
+ * we inc the extwriter counter, so we wait for all extwriters to exit
+ * before we start blocking join'ers. This comment is to keep somebody
+ * from thinking they are super smart and changing this to
+ * btrfs_join_transaction *cough*Josef*cough*.
+ */
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ mutex_unlock(&inode->i_mutex);
goto out;
}
+ trans->sync = true;
- ret = btrfs_log_dentry_safe(trans, root, dentry);
- if (ret < 0)
- goto out;
+ btrfs_init_log_ctx(&ctx);
+
+ ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
+ if (ret < 0) {
+ /* Fallthrough and commit/free transaction. */
+ ret = 1;
+ }
/* we've logged all the items and now have a consistent
* version of the file in the log. It is possible that
@@ -1251,29 +1998,37 @@ int btrfs_sync_file(struct file *file, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&inode->i_mutex);
if (ret != BTRFS_NO_LOG_SYNC) {
- if (ret > 0) {
- ret = btrfs_commit_transaction(trans, root);
- } else {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0)
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
+ if (!ret) {
ret = btrfs_end_transaction(trans, root);
- else
- ret = btrfs_commit_transaction(trans, root);
+ goto out;
+ }
+ }
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start,
+ end - start + 1);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
}
+ ret = btrfs_commit_transaction(trans, root);
} else {
ret = btrfs_end_transaction(trans, root);
}
- mutex_lock(&dentry->d_inode->i_mutex);
out:
return ret > 0 ? -EIO : ret;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = btrfs_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -1285,38 +2040,474 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
file_accessed(filp);
vma->vm_ops = &btrfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
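+
+/*
+ * Illustrative summary: hole_mergeable() reports whether the file extent item
+ * at @slot is an existing hole (a regular extent with a zero disk_bytenr)
+ * that either ends at @start or begins at @end, so that fill_holes() below
+ * can extend that item instead of inserting a new one.
+ */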
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+ int slot, u64 start, u64 end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
+ if (btrfs_file_extent_disk_bytenr(leaf, fi))
+ return 0;
+
+ if (key.offset == end)
+ return 1;
+ if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+ return 1;
+ return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct btrfs_path *path, u64 offset, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct extent_map *hole_em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct btrfs_key key;
+ int ret;
+
+ if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ goto out;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ return ret;
+ BUG_ON(!ret);
+
+ leaf = path->nodes[0];
+ if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]--;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+ end - offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+
+ if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]++;
+ key.offset = offset;
+ btrfs_set_item_key_safe(root, path, &key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+ offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+ 0, 0, end - offset, 0, end - offset,
+ 0, 0, 0);
+ if (ret)
+ return ret;
+
+out:
+ btrfs_release_path(path);
+
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ } else {
+ hole_em->start = offset;
+ hole_em->len = end - offset;
+ hole_em->ram_bytes = hole_em->len;
+ hole_em->orig_start = offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
+ hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = trans->transid;
+
+ do {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, hole_em, 1);
+ write_unlock(&em_tree->lock);
+ } while (ret == -EEXIST);
+ free_extent_map(hole_em);
+ if (ret)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+
+ return 0;
+}
+
+/*
+ * Find a hole extent on the given inode and change start/len to the end of the
+ * hole extent (a hole/vacuum extent whose em->start <= start &&
+ * em->start + em->len > start).
+ * When a hole extent is found, return 1 and modify start/len.
+ */
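+/*
+ * Worked example (illustrative): for a file laid out as
+ * [data 0-4K][hole 4K-16K][data 16K-], a call with *start = 8K and
+ * *len = 16K returns 1 and leaves *start = 16K, *len = 8K, i.e. the
+ * requested range is clamped to begin at the first non-hole byte.
+ */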
+static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ return ret;
+ }
+
+	/* Hole or vacuum extent (only exists in no-holes mode) */
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ ret = 1;
+ *len = em->start + em->len > *start + *len ?
+ 0 : *start + *len - em->start - em->len;
+ *start = em->start + em->len;
+ }
+ free_extent_map(em);
+ return ret;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *rsv;
+ struct btrfs_trans_handle *trans;
+ u64 lockstart;
+ u64 lockend;
+ u64 tail_start;
+ u64 tail_len;
+ u64 orig_start = offset;
+ u64 cur_offset;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ u64 drop_end;
+ int ret = 0;
+ int err = 0;
+ int rsv_count;
+ bool same_page;
+ bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
+ u64 ino_size;
+
+ ret = btrfs_wait_ordered_range(inode, offset, len);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+ ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ /* Already in a large hole */
+ ret = 0;
+ goto out_only_mutex;
+ }
+
+	lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+ lockend = round_down(offset + len,
+ BTRFS_I(inode)->root->sectorsize) - 1;
+ same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
+ /*
+ * We needn't truncate any page which is beyond the end of the file
+ * because we are sure there is no data there.
+ */
+ /*
+ * Only do this if we are in the same page and we aren't doing the
+ * entire page.
+ */
+ if (same_page && len < PAGE_CACHE_SIZE) {
+ if (offset < ino_size)
+ ret = btrfs_truncate_page(inode, offset, len, 0);
+ goto out_only_mutex;
+ }
+
+ /* zero back part of the first page */
+ if (offset < ino_size) {
+ ret = btrfs_truncate_page(inode, offset, 0, 0);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+	/* Check the aligned pages after the first unaligned page.
+	 * If offset != orig_start, the first unaligned page and
+	 * several pages following it are already inside a hole,
+	 * so the extra check can be skipped. */
+ if (offset == orig_start) {
+ /* after truncate page, check hole again */
+ len = offset + len - lockstart;
+ offset = lockstart;
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ ret = 0;
+ goto out_only_mutex;
+ }
+ lockstart = offset;
+ }
+
+ /* Check the tail unaligned part is in a hole */
+ tail_start = lockend + 1;
+ tail_len = offset + len - tail_start;
+ if (tail_len) {
+ ret = find_first_non_hole(inode, &tail_start, &tail_len);
+ if (unlikely(ret < 0))
+ goto out_only_mutex;
+ if (!ret) {
+ /* zero the front end of the last page */
+ if (tail_start + tail_len < ino_size) {
+ ret = btrfs_truncate_page(inode,
+ tail_start + tail_len, 0, 1);
+ if (ret)
+ goto out_only_mutex;
+ }
+ }
+ }
+
+ if (lockend < lockstart) {
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+ }
+
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ truncate_pagecache_range(inode, lockstart, lockend);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, &cached_state);
+ ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+ /*
+ * We need to make sure we have no ordered extents in this range
+ * and nobody raced in and read a page in this range, if we did
+ * we need to try again.
+ */
+ if ((!ordered ||
+ (ordered->file_offset + ordered->len <= lockstart ||
+ ordered->file_offset > lockend)) &&
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state, GFP_NOFS);
+ ret = btrfs_wait_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+ rsv->failfast = 1;
+
+ /*
+ * 1 - update the inode
+ * 1 - removing the extents in the range
+ * 1 - adding the hole extent if no_holes isn't set
+ */
+ rsv_count = no_holes ? 2 : 3;
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ min_size);
+ BUG_ON(ret);
+ trans->block_rsv = rsv;
+
+ cur_offset = lockstart;
+ len = lockend - cur_offset;
+ while (cur_offset < lockend) {
+ ret = __btrfs_drop_extents(trans, root, inode, path,
+ cur_offset, lockend + 1,
+ &drop_end, 1, 0, 0, NULL);
+ if (ret != -ENOSPC)
+ break;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+ if (cur_offset < ino_size) {
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_end);
+ if (ret) {
+ err = ret;
+ break;
+ }
+ }
+
+ cur_offset = drop_end;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ rsv, min_size);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
+
+ ret = find_first_non_hole(inode, &cur_offset, &len);
+ if (unlikely(ret < 0))
+ break;
+ if (ret && !len) {
+ ret = 0;
+ break;
+ }
+ }
+
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ /*
+ * Don't insert file hole extent item if it's for a range beyond eof
+	 * (because it's useless) or if it represents a zero-byte range (when
+ * cur_offset == drop_end).
+ */
+ if (cur_offset < ino_size && cur_offset < drop_end) {
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+ }
+
+out_trans:
+ if (!trans)
+ goto out_free;
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+out_free:
+ btrfs_free_path(path);
+ btrfs_free_block_rsv(root, rsv);
+out:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+out_only_mutex:
+ mutex_unlock(&inode->i_mutex);
+ if (ret && !err)
+ err = ret;
+ return err;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
- u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em;
+ int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
- alloc_start = offset & ~mask;
- alloc_end = (offset + len + mask) & ~mask;
+ alloc_start = round_down(offset, blocksize);
+ alloc_end = round_up(offset + len, blocksize);
- /* We only support the FALLOC_FL_KEEP_SIZE mode */
- if (mode & ~FALLOC_FL_KEEP_SIZE)
+	/* Make sure we aren't being given some crap mode */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return btrfs_punch_hole(inode, offset, len);
+
/*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
+ * Make sure we have enough space before we do the
+ * allocation.
*/
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+ if (ret)
+ return ret;
+ if (root->fs_info->quota_enabled) {
+ ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
+ if (ret)
+ goto out_reserve_fail;
+ }
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
@@ -1324,12 +2515,27 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out;
if (alloc_start > inode->i_size) {
- ret = btrfs_cont_expand(inode, alloc_start);
+ ret = btrfs_cont_expand(inode, i_size_read(inode),
+ alloc_start);
+ if (ret)
+ goto out;
+ } else {
+ /*
+ * If we are fallocating from the end of the file onward we
+ * need to zero out the end of the page if i_size lands in the
+ * middle of a page.
+ */
+ ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
if (ret)
goto out;
}
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ ret = btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
if (ret)
goto out;
@@ -1341,7 +2547,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state, GFP_NOFS);
+ locked_end, 0, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
@@ -1355,8 +2561,10 @@ static long btrfs_fallocate(struct file *file, int mode,
* we can't wait on the range with the transaction
* running or with the extent lock held
*/
- btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
+ ret = btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
} else {
if (ordered)
btrfs_put_ordered_extent(ordered);
@@ -1366,11 +2574,21 @@ static long btrfs_fallocate(struct file *file, int mode,
cur_offset = alloc_start;
while (1) {
+ u64 actual_end;
+
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
- BUG_ON(IS_ERR(em) || !em);
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ break;
+ }
last_byte = min(extent_map_end(em), alloc_end);
- last_byte = (last_byte + mask) & ~mask;
+ actual_end = min_t(u64, extent_map_end(em), offset + len);
+ last_byte = ALIGN(last_byte, blocksize);
+
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
@@ -1379,10 +2597,21 @@ static long btrfs_fallocate(struct file *file, int mode,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
+
if (ret < 0) {
free_extent_map(em);
break;
}
+ } else if (actual_end > inode->i_size &&
+ !(mode & FALLOC_FL_KEEP_SIZE)) {
+ /*
+ * We didn't need to allocate any more space, but we
+ * still extended the size of the file so we need to
+ * update i_size.
+ */
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end, NULL);
}
free_extent_map(em);
@@ -1394,20 +2623,113 @@ static long btrfs_fallocate(struct file *file, int mode,
}
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
-
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
+ if (root->fs_info->quota_enabled)
+ btrfs_qgroup_free(root, alloc_end - alloc_start);
+out_reserve_fail:
+ /* Let go of our reservation. */
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
return ret;
}
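+
+/*
+ * Helper for btrfs_file_llseek() below: walk extent maps from *offset under
+ * the extent lock and stop at the first hole or prealloc extent for
+ * SEEK_HOLE, or at the first real extent for SEEK_DATA, writing the result
+ * back through *offset.
+ */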
+static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map *em = NULL;
+ struct extent_state *cached_state = NULL;
+ u64 lockstart = *offset;
+ u64 lockend = i_size_read(inode);
+ u64 start = *offset;
+ u64 len = i_size_read(inode);
+ int ret = 0;
+
+ lockend = max_t(u64, root->sectorsize, lockend);
+ if (lockend <= lockstart)
+ lockend = lockstart + root->sectorsize;
+
+ lockend--;
+ len = lockend - lockstart + 1;
+
+ len = max_t(u64, len, root->sectorsize);
+ if (inode->i_size == 0)
+ return -ENXIO;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+ &cached_state);
+
+ while (start < inode->i_size) {
+ em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ em = NULL;
+ break;
+ }
+
+ if (whence == SEEK_HOLE &&
+ (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ break;
+ else if (whence == SEEK_DATA &&
+ (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ break;
+
+ start = em->start + em->len;
+ free_extent_map(em);
+ em = NULL;
+ cond_resched();
+ }
+ free_extent_map(em);
+ if (!ret) {
+ if (whence == SEEK_DATA && start >= inode->i_size)
+ ret = -ENXIO;
+ else
+ *offset = min_t(loff_t, start, inode->i_size);
+ }
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ return ret;
+}
+
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+
+ mutex_lock(&inode->i_mutex);
+ switch (whence) {
+ case SEEK_END:
+ case SEEK_CUR:
+ offset = generic_file_llseek(file, offset, whence);
+ goto out;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ if (offset >= i_size_read(inode)) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
+ }
+
+ ret = find_desired_extent(inode, &offset, whence);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
const struct file_operations btrfs_file_operations = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
+ .llseek = btrfs_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
- .aio_write = btrfs_file_aio_write,
+ .write_iter = btrfs_file_write_iter,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
.release = btrfs_release_file,
@@ -1418,3 +2740,21 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_ioctl,
#endif
};
+
+void btrfs_auto_defrag_exit(void)
+{
+ if (btrfs_inode_defrag_cachep)
+ kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+ btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+ sizeof(struct inode_defrag), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_inode_defrag_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a0390657451..2b0a627cb5f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,22 +20,25 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/math64.h>
+#include <linux/ratelimit.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
+#include "extent_io.h"
+#include "inode-map.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
-static void recalculate_thresholds(struct btrfs_block_group_cache
- *block_group);
-static int link_free_space(struct btrfs_block_group_cache *block_group,
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
-struct inode *lookup_free_space_inode(struct btrfs_root *root,
- struct btrfs_block_group_cache
- *block_group, struct btrfs_path *path)
+static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 offset)
{
struct btrfs_key key;
struct btrfs_key location;
@@ -45,22 +48,15 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
struct inode *inode = NULL;
int ret;
- spin_lock(&block_group->lock);
- if (block_group->inode)
- inode = igrab(block_group->inode);
- spin_unlock(&block_group->lock);
- if (inode)
- return inode;
-
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
+ key.offset = offset;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ERR_PTR(-ENOENT);
}
@@ -69,7 +65,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header);
btrfs_free_space_key(leaf, header, &disk_key);
btrfs_disk_key_to_cpu(&location, &disk_key);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
if (!inode)
@@ -81,8 +77,41 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
return ERR_PTR(-ENOENT);
}
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+
+ return inode;
+}
+
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_block_group_cache
+ *block_group, struct btrfs_path *path)
+{
+ struct inode *inode = NULL;
+ u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
+ spin_lock(&block_group->lock);
+ if (block_group->inode)
+ inode = igrab(block_group->inode);
+ spin_unlock(&block_group->lock);
+ if (inode)
+ return inode;
+
+ inode = __lookup_free_space_inode(root, path,
+ block_group->key.objectid);
+ if (IS_ERR(inode))
+ return inode;
+
spin_lock(&block_group->lock);
- if (!root->fs_info->closing) {
+	if ((BTRFS_I(inode)->flags & flags) != flags) {
+ btrfs_info(root->fs_info,
+ "Old style space inode found, converting.");
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
+ BTRFS_INODE_NODATACOW;
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ }
+
+ if (!block_group->iref) {
block_group->inode = igrab(inode);
block_group->iref = 1;
}
@@ -91,27 +120,27 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
return inode;
}
-int create_free_space_inode(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_path *path)
+static int __create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 ino, u64 offset)
{
struct btrfs_key key;
struct btrfs_disk_key disk_key;
struct btrfs_free_space_header *header;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- u64 objectid;
+ u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
- ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
- if (ret < 0)
- return ret;
-
- ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+ ret = btrfs_insert_empty_inode(trans, root, path, ino);
if (ret)
return ret;
+ /* We inline crc's for the free disk space cache */
+ if (ino != BTRFS_FREE_INO_OBJECTID)
+ flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
@@ -124,23 +153,21 @@ int create_free_space_inode(struct btrfs_root *root,
btrfs_set_inode_uid(leaf, inode_item, 0);
btrfs_set_inode_gid(leaf, inode_item, 0);
btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
- BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+ btrfs_set_inode_flags(leaf, inode_item, flags);
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
- btrfs_set_inode_block_group(leaf, inode_item,
- block_group->key.objectid);
+ btrfs_set_inode_block_group(leaf, inode_item, offset);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
+ key.offset = offset;
key.type = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
leaf = path->nodes[0];
@@ -149,29 +176,54 @@ int create_free_space_inode(struct btrfs_root *root,
memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
btrfs_set_free_space_key(leaf, header, &disk_key);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return 0;
}
+int create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ int ret;
+ u64 ino;
+
+ ret = btrfs_find_free_objectid(root, &ino);
+ if (ret < 0)
+ return ret;
+
+ return __create_free_space_inode(root, trans, path, ino,
+ block_group->key.objectid);
+}
+
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+{
+ u64 needed_bytes;
+ int ret;
+
+ /* 1 for slack space, 1 for updating the inode */
+ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
+ btrfs_calc_trans_metadata_size(root, 1);
+
+ spin_lock(&rsv->lock);
+ if (rsv->reserved < needed_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+ spin_unlock(&rsv->lock);
+ return ret;
+}
+
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
struct inode *inode)
{
- loff_t oldsize;
int ret = 0;
- trans->block_rsv = root->orphan_block_rsv;
- ret = btrfs_block_rsv_check(trans, root,
- root->orphan_block_rsv,
- 0, 5);
- if (ret)
- return ret;
-
- oldsize = i_size_read(inode);
btrfs_i_size_write(inode, 0);
- truncate_pagecache(inode, oldsize, 0);
+ truncate_pagecache(inode, 0);
/*
* We don't need an orphan item because truncating the free space cache
@@ -180,11 +232,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
if (ret) {
- WARN_ON(1);
+ btrfs_abort_transaction(trans, root, ret);
return ret;
}
- return btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+
+ return ret;
}
static int readahead_cache(struct inode *inode)
@@ -206,604 +262,1017 @@ static int readahead_cache(struct inode *inode)
return 0;
}
-int load_free_space_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group)
+struct io_ctl {
+ void *cur, *orig;
+ struct page *page;
+ struct page **pages;
+ struct btrfs_root *root;
+ unsigned long size;
+ int index;
+ int num_pages;
+ unsigned check_crcs:1;
+};
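+
+/*
+ * Rough on-disk layout of a space cache file, as implied by the io_ctl
+ * helpers below:
+ *
+ *   page 0:      [one u32 crc per page][u64 generation][packed entries ...]
+ *   later pages: [packed free space entries ...] or [one full-page bitmap]
+ *
+ * When check_crcs is not set, a single u64 slot takes the place of the crc
+ * area in front of the generation.
+ */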
+
+static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
+ struct btrfs_root *root, int write)
+{
+ int num_pages;
+ int check_crcs = 0;
+
+ num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+ check_crcs = 1;
+
+ /* Make sure we can fit our crcs into the first page */
+ if (write && check_crcs &&
+ (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+ return -ENOSPC;
+
+ memset(io_ctl, 0, sizeof(struct io_ctl));
+
+ io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
+ if (!io_ctl->pages)
+ return -ENOMEM;
+
+ io_ctl->num_pages = num_pages;
+ io_ctl->root = root;
+ io_ctl->check_crcs = check_crcs;
+
+ return 0;
+}
+
+static void io_ctl_free(struct io_ctl *io_ctl)
+{
+ kfree(io_ctl->pages);
+}
+
+static void io_ctl_unmap_page(struct io_ctl *io_ctl)
+{
+ if (io_ctl->cur) {
+ kunmap(io_ctl->page);
+ io_ctl->cur = NULL;
+ io_ctl->orig = NULL;
+ }
+}
+
+static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
+{
+ ASSERT(io_ctl->index < io_ctl->num_pages);
+ io_ctl->page = io_ctl->pages[io_ctl->index++];
+ io_ctl->cur = kmap(io_ctl->page);
+ io_ctl->orig = io_ctl->cur;
+ io_ctl->size = PAGE_CACHE_SIZE;
+ if (clear)
+ memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
+}
+
+static void io_ctl_drop_pages(struct io_ctl *io_ctl)
+{
+ int i;
+
+ io_ctl_unmap_page(io_ctl);
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ if (io_ctl->pages[i]) {
+ ClearPageChecked(io_ctl->pages[i]);
+ unlock_page(io_ctl->pages[i]);
+ page_cache_release(io_ctl->pages[i]);
+ }
+ }
+}
+
+static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
+ int uptodate)
{
- struct btrfs_root *root = fs_info->tree_root;
- struct inode *inode;
- struct btrfs_free_space_header *header;
- struct extent_buffer *leaf;
struct page *page;
- struct btrfs_path *path;
- u32 *checksums = NULL, *crc;
- char *disk_crcs = NULL;
- struct btrfs_key key;
- struct list_head bitmaps;
- u64 num_entries;
- u64 num_bitmaps;
- u64 generation;
- u32 cur_crc = ~(u32)0;
- pgoff_t index = 0;
- unsigned long first_page_offset;
- int num_checksums;
- int ret = 0;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ int i;
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ page = find_or_create_page(inode->i_mapping, i, mask);
+ if (!page) {
+ io_ctl_drop_pages(io_ctl);
+ return -ENOMEM;
+ }
+ io_ctl->pages[i] = page;
+ if (uptodate && !PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "error reading free space cache");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
+ }
+ }
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ clear_page_dirty_for_io(io_ctl->pages[i]);
+ set_page_extent_mapped(io_ctl->pages[i]);
+ }
+
+ return 0;
+}
+
+static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
+{
+ __le64 *val;
+
+ io_ctl_map_page(io_ctl, 1);
/*
- * If we're unmounting then just return, since this does a search on the
- * normal root and not the commit root and we could deadlock.
+ * Skip the csum areas. If we don't check crcs then we just have a
+ * 64bit chunk at the front of the first page.
*/
- smp_mb();
- if (fs_info->closing)
- return 0;
+ if (io_ctl->check_crcs) {
+ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
+ } else {
+ io_ctl->cur += sizeof(u64);
+ io_ctl->size -= sizeof(u64) * 2;
+ }
+
+ val = io_ctl->cur;
+ *val = cpu_to_le64(generation);
+ io_ctl->cur += sizeof(u64);
+}
+
+static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
+{
+ __le64 *gen;
/*
- * If this block group has been marked to be cleared for one reason or
- * another then we can't trust the on disk cache, so just return.
+ * Skip the crc area. If we don't check crcs then we just have a 64bit
+ * chunk at the front of the first page.
*/
- spin_lock(&block_group->lock);
- if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
- spin_unlock(&block_group->lock);
+ if (io_ctl->check_crcs) {
+ io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+ io_ctl->size -= sizeof(u64) +
+ (sizeof(u32) * io_ctl->num_pages);
+ } else {
+ io_ctl->cur += sizeof(u64);
+ io_ctl->size -= sizeof(u64) * 2;
+ }
+
+ gen = io_ctl->cur;
+ if (le64_to_cpu(*gen) != generation) {
+ printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
+ "(%Lu) does not match inode (%Lu)\n", *gen,
+ generation);
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+ io_ctl->cur += sizeof(u64);
+ return 0;
+}
+
+static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
+{
+ u32 *tmp;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (!io_ctl->check_crcs) {
+ io_ctl_unmap_page(io_ctl);
+ return;
+ }
+
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;
+
+ crc = btrfs_csum_data(io_ctl->orig + offset, crc,
+ PAGE_CACHE_SIZE - offset);
+ btrfs_csum_final(crc, (char *)&crc);
+ io_ctl_unmap_page(io_ctl);
+ tmp = kmap(io_ctl->pages[0]);
+ tmp += index;
+ *tmp = crc;
+ kunmap(io_ctl->pages[0]);
+}
+
+static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
+{
+ u32 *tmp, val;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (!io_ctl->check_crcs) {
+ io_ctl_map_page(io_ctl, 0);
return 0;
}
- spin_unlock(&block_group->lock);
- INIT_LIST_HEAD(&bitmaps);
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;
- path = btrfs_alloc_path();
- if (!path)
+ tmp = kmap(io_ctl->pages[0]);
+ tmp += index;
+ val = *tmp;
+ kunmap(io_ctl->pages[0]);
+
+ io_ctl_map_page(io_ctl, 0);
+ crc = btrfs_csum_data(io_ctl->orig + offset, crc,
+ PAGE_CACHE_SIZE - offset);
+ btrfs_csum_final(crc, (char *)&crc);
+ if (val != crc) {
+ printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
+ "space cache\n");
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
+ void *bitmap)
+{
+ struct btrfs_free_space_entry *entry;
+
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ entry = io_ctl->cur;
+ entry->offset = cpu_to_le64(offset);
+ entry->bytes = cpu_to_le64(bytes);
+ entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
+ BTRFS_FREE_SPACE_EXTENT;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
return 0;
- inode = lookup_free_space_inode(root, block_group, path);
- if (IS_ERR(inode)) {
- btrfs_free_path(path);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+
+ /* No more pages to map */
+ if (io_ctl->index >= io_ctl->num_pages)
return 0;
+
+ /* map the next page */
+ io_ctl_map_page(io_ctl, 1);
+ return 0;
+}
+
+static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
+{
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ /*
+ * If we aren't at the start of the current page, unmap this one and
+ * map the next one if there is any left.
+ */
+ if (io_ctl->cur != io_ctl->orig) {
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index >= io_ctl->num_pages)
+ return -ENOSPC;
+ io_ctl_map_page(io_ctl, 0);
}
- /* Nothing in the space cache, goodbye */
- if (!i_size_read(inode)) {
- btrfs_free_path(path);
- goto out;
+ memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index < io_ctl->num_pages)
+ io_ctl_map_page(io_ctl, 0);
+ return 0;
+}
+
+static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
+{
+ /*
+ * If we're not on the boundary we know we've modified the page and we
+ * need to crc the page.
+ */
+ if (io_ctl->cur != io_ctl->orig)
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ else
+ io_ctl_unmap_page(io_ctl);
+
+ while (io_ctl->index < io_ctl->num_pages) {
+ io_ctl_map_page(io_ctl, 1);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ }
+}
+
+static int io_ctl_read_entry(struct io_ctl *io_ctl,
+ struct btrfs_free_space *entry, u8 *type)
+{
+ struct btrfs_free_space_entry *e;
+ int ret;
+
+ if (!io_ctl->cur) {
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+ }
+
+ e = io_ctl->cur;
+ entry->offset = le64_to_cpu(e->offset);
+ entry->bytes = le64_to_cpu(e->bytes);
+ *type = e->type;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+ return 0;
+
+ io_ctl_unmap_page(io_ctl);
+
+ return 0;
+}
+
+static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
+ struct btrfs_free_space *entry)
+{
+ int ret;
+
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+
+ memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
+ io_ctl_unmap_page(io_ctl);
+
+ return 0;
+}
+
+/*
+ * Since we attach pinned extents after the fact we can have contiguous sections
+ * of free space that are split up in entries. This poses a problem with the
+ * tree logging stuff since it could have allocated across what appears to be 2
+ * entries since we would have merged the entries when adding the pinned extents
+ * back to the free space cache. So run through the space cache that we just
+ * loaded and merge contiguous entries. This will make the log replay stuff not
+ * blow up and it will make for nicer allocator behavior.
+ */
+static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *e, *prev = NULL;
+ struct rb_node *n;
+
+again:
+ spin_lock(&ctl->tree_lock);
+ for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+ e = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (!prev)
+ goto next;
+ if (e->bitmap || prev->bitmap)
+ goto next;
+ if (prev->offset + prev->bytes == e->offset) {
+ unlink_free_space(ctl, prev);
+ unlink_free_space(ctl, e);
+ prev->bytes += e->bytes;
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ link_free_space(ctl, prev);
+ prev = NULL;
+ spin_unlock(&ctl->tree_lock);
+ goto again;
+ }
+next:
+ prev = e;
}
+ spin_unlock(&ctl->tree_lock);
+}
+
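+/*
+ * Roughly: read the free space header item for @offset to get the entry and
+ * bitmap counts plus the generation, verify the per-page crcs and the
+ * generation via the io_ctl helpers, link plain extent entries as they are
+ * read and bitmaps once all entries are in, then merge adjacent entries with
+ * merge_space_tree().
+ */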
+static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_path *path, u64 offset)
+{
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ struct io_ctl io_ctl;
+ struct btrfs_key key;
+ struct btrfs_free_space *e, *n;
+ struct list_head bitmaps;
+ u64 num_entries;
+ u64 num_bitmaps;
+ u64 generation;
+ u8 type;
+ int ret = 0;
+
+ INIT_LIST_HEAD(&bitmaps);
+
+ /* Nothing in the space cache, goodbye */
+ if (!i_size_read(inode))
+ return 0;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
+ key.offset = offset;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret) {
- btrfs_free_path(path);
- goto out;
+ if (ret < 0)
+ return 0;
+ else if (ret > 0) {
+ btrfs_release_path(path);
+ return 0;
}
+ ret = -1;
+
leaf = path->nodes[0];
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
num_entries = btrfs_free_space_entries(leaf, header);
num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
generation = btrfs_free_space_generation(leaf, header);
- btrfs_free_path(path);
+ btrfs_release_path(path);
+
+ if (!BTRFS_I(inode)->generation) {
+ btrfs_info(root->fs_info,
+			"The free space cache file (%llu) is invalid, skipping it\n",
+ offset);
+ return 0;
+ }
if (BTRFS_I(inode)->generation != generation) {
- printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
- " not match free space cache generation (%llu) for "
- "block group %llu\n",
- (unsigned long long)BTRFS_I(inode)->generation,
- (unsigned long long)generation,
- (unsigned long long)block_group->key.objectid);
- goto free_cache;
+ btrfs_err(root->fs_info,
+ "free space inode generation (%llu) "
+ "did not match free space cache generation (%llu)",
+ BTRFS_I(inode)->generation, generation);
+ return 0;
}
if (!num_entries)
- goto out;
+ return 0;
- /* Setup everything for doing checksumming */
- num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
- checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
- if (!checksums)
- goto out;
- first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
- disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
- if (!disk_crcs)
- goto out;
+ ret = io_ctl_init(&io_ctl, inode, root, 0);
+ if (ret)
+ return ret;
ret = readahead_cache(inode);
- if (ret) {
- ret = 0;
+ if (ret)
goto out;
- }
- while (1) {
- struct btrfs_free_space_entry *entry;
- struct btrfs_free_space *e;
- void *addr;
- unsigned long offset = 0;
- unsigned long start_offset = 0;
- int need_loop = 0;
+ ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
+ if (ret)
+ goto out;
- if (!num_entries && !num_bitmaps)
- break;
+ ret = io_ctl_check_crc(&io_ctl, 0);
+ if (ret)
+ goto free_cache;
- if (index == 0) {
- start_offset = first_page_offset;
- offset = start_offset;
- }
+ ret = io_ctl_check_generation(&io_ctl, generation);
+ if (ret)
+ goto free_cache;
- page = grab_cache_page(inode->i_mapping, index);
- if (!page) {
- ret = 0;
+ while (num_entries) {
+ e = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
+ if (!e)
goto free_cache;
- }
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- page_cache_release(page);
- printk(KERN_ERR "btrfs: error reading free "
- "space cache: %llu\n",
- (unsigned long long)
- block_group->key.objectid);
- goto free_cache;
- }
- }
- addr = kmap(page);
-
- if (index == 0) {
- u64 *gen;
-
- memcpy(disk_crcs, addr, first_page_offset);
- gen = addr + (sizeof(u32) * num_checksums);
- if (*gen != BTRFS_I(inode)->generation) {
- printk(KERN_ERR "btrfs: space cache generation"
- " (%llu) does not match inode (%llu) "
- "for block group %llu\n",
- (unsigned long long)*gen,
- (unsigned long long)
- BTRFS_I(inode)->generation,
- (unsigned long long)
- block_group->key.objectid);
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
- goto free_cache;
- }
- crc = (u32 *)disk_crcs;
- }
- entry = addr + start_offset;
-
- /* First lets check our crc before we do anything fun */
- cur_crc = ~(u32)0;
- cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
- PAGE_CACHE_SIZE - start_offset);
- btrfs_csum_final(cur_crc, (char *)&cur_crc);
- if (cur_crc != *crc) {
- printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
- "block group %llu\n", index,
- (unsigned long long)block_group->key.objectid);
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
+ ret = io_ctl_read_entry(&io_ctl, e, &type);
+ if (ret) {
+ kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
- crc++;
- while (1) {
- if (!num_entries)
- break;
+ if (!e->bytes) {
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
- need_loop = 1;
- e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
- if (!e) {
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
+ if (type == BTRFS_FREE_SPACE_EXTENT) {
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ spin_unlock(&ctl->tree_lock);
+ if (ret) {
+ btrfs_err(root->fs_info,
+ "Duplicate entries in free space cache, dumping");
+ kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
-
- e->offset = le64_to_cpu(entry->offset);
- e->bytes = le64_to_cpu(entry->bytes);
- if (!e->bytes) {
- kunmap(page);
- kfree(e);
- unlock_page(page);
- page_cache_release(page);
+ } else {
+ ASSERT(num_bitmaps);
+ num_bitmaps--;
+ e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!e->bitmap) {
+ kmem_cache_free(
+ btrfs_free_space_cachep, e);
goto free_cache;
}
-
- if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
- spin_lock(&block_group->tree_lock);
- ret = link_free_space(block_group, e);
- spin_unlock(&block_group->tree_lock);
- BUG_ON(ret);
- } else {
- e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
- if (!e->bitmap) {
- kunmap(page);
- kfree(e);
- unlock_page(page);
- page_cache_release(page);
- goto free_cache;
- }
- spin_lock(&block_group->tree_lock);
- ret = link_free_space(block_group, e);
- block_group->total_bitmaps++;
- recalculate_thresholds(block_group);
- spin_unlock(&block_group->tree_lock);
- list_add_tail(&e->list, &bitmaps);
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ ctl->total_bitmaps++;
+ ctl->op->recalc_thresholds(ctl);
+ spin_unlock(&ctl->tree_lock);
+ if (ret) {
+ btrfs_err(root->fs_info,
+ "Duplicate entries in free space cache, dumping");
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
}
-
- num_entries--;
- offset += sizeof(struct btrfs_free_space_entry);
- if (offset + sizeof(struct btrfs_free_space_entry) >=
- PAGE_CACHE_SIZE)
- break;
- entry++;
+ list_add_tail(&e->list, &bitmaps);
}
- /*
- * We read an entry out of this page, we need to move on to the
- * next page.
- */
- if (need_loop) {
- kunmap(page);
- goto next;
- }
+ num_entries--;
+ }
- /*
- * We add the bitmaps at the end of the entries in order that
- * the bitmap entries are added to the cache.
- */
- e = list_entry(bitmaps.next, struct btrfs_free_space, list);
+ io_ctl_unmap_page(&io_ctl);
+
+ /*
+ * We add the bitmaps at the end of the entries in order that
+ * the bitmap entries are added to the cache.
+ */
+ list_for_each_entry_safe(e, n, &bitmaps, list) {
list_del_init(&e->list);
- memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
- kunmap(page);
- num_bitmaps--;
-next:
- unlock_page(page);
- page_cache_release(page);
- index++;
+ ret = io_ctl_read_bitmap(&io_ctl, e);
+ if (ret)
+ goto free_cache;
}
+ io_ctl_drop_pages(&io_ctl);
+ merge_space_tree(ctl);
ret = 1;
out:
- kfree(checksums);
- kfree(disk_crcs);
- iput(inode);
+ io_ctl_free(&io_ctl);
return ret;
-
free_cache:
- /* This cache is bogus, make sure it gets cleared */
- spin_lock(&block_group->lock);
- block_group->disk_cache_state = BTRFS_DC_CLEAR;
- spin_unlock(&block_group->lock);
- btrfs_remove_free_space_cache(block_group);
+ io_ctl_drop_pages(&io_ctl);
+ __btrfs_remove_free_space_cache(ctl);
goto out;
}
-int btrfs_write_out_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_path *path)
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
{
- struct btrfs_free_space_header *header;
- struct extent_buffer *leaf;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_root *root = fs_info->tree_root;
struct inode *inode;
- struct rb_node *node;
- struct list_head *pos, *n;
- struct page *page;
- struct extent_state *cached_state = NULL;
- struct list_head bitmap_list;
- struct btrfs_key key;
- u64 bytes = 0;
- u32 *crc, *checksums;
- pgoff_t index = 0, last_index = 0;
- unsigned long first_page_offset;
- int num_checksums;
- int entries = 0;
- int bitmaps = 0;
+ struct btrfs_path *path;
int ret = 0;
+ bool matched;
+ u64 used = btrfs_block_group_used(&block_group->item);
- root = root->fs_info->tree_root;
-
- INIT_LIST_HEAD(&bitmap_list);
-
+ /*
+ * If this block group has been marked to be cleared for one reason or
+ * another then we can't trust the on disk cache, so just return.
+ */
spin_lock(&block_group->lock);
- if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
spin_unlock(&block_group->lock);
return 0;
}
spin_unlock(&block_group->lock);
- inode = lookup_free_space_inode(root, block_group, path);
- if (IS_ERR(inode))
+ path = btrfs_alloc_path();
+ if (!path)
return 0;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
- if (!i_size_read(inode)) {
- iput(inode);
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode)) {
+ btrfs_free_path(path);
return 0;
}
- node = rb_first(&block_group->free_space_offset);
- if (!node) {
- iput(inode);
- return 0;
+ /* We may have converted the inode and made the cache invalid. */
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+ spin_unlock(&block_group->lock);
+ btrfs_free_path(path);
+ goto out;
}
+ spin_unlock(&block_group->lock);
- last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
- filemap_write_and_wait(inode->i_mapping);
- btrfs_wait_ordered_range(inode, inode->i_size &
- ~(root->sectorsize - 1), (u64)-1);
-
- /* We need a checksum per page. */
- num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
- crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
- if (!crc) {
- iput(inode);
- return 0;
- }
+ ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
+ path, block_group->key.objectid);
+ btrfs_free_path(path);
+ if (ret <= 0)
+ goto out;
- /* Since the first page has all of our checksums and our generation we
- * need to calculate the offset into the page that we can start writing
- * our entries.
- */
- first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+ spin_lock(&ctl->tree_lock);
+ matched = (ctl->free_space == (block_group->key.offset - used -
+ block_group->bytes_super));
+ spin_unlock(&ctl->tree_lock);
- /*
- * Lock all pages first so we can lock the extent safely.
- *
- * NOTE: Because we hold the ref the entire time we're going to write to
- * the page find_get_page should never fail, so we don't do a check
- * after find_get_page at this point. Just putting this here so people
- * know and don't freak out.
- */
- while (index <= last_index) {
- page = grab_cache_page(inode->i_mapping, index);
- if (!page) {
- pgoff_t i = 0;
-
- while (i < index) {
- page = find_get_page(inode->i_mapping, i);
- unlock_page(page);
- page_cache_release(page);
- page_cache_release(page);
- i++;
- }
- goto out_free;
- }
- index++;
+ if (!matched) {
+ __btrfs_remove_free_space_cache(ctl);
+ btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
+ block_group->key.objectid);
+ ret = -1;
}
+out:
+ if (ret < 0) {
+ /* This cache is bogus, make sure it gets cleared */
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_unlock(&block_group->lock);
+ ret = 0;
- index = 0;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state, GFP_NOFS);
-
- /* Write out the extent entries */
- do {
- struct btrfs_free_space_entry *entry;
- void *addr;
- unsigned long offset = 0;
- unsigned long start_offset = 0;
-
- if (index == 0) {
- start_offset = first_page_offset;
- offset = start_offset;
- }
-
- page = find_get_page(inode->i_mapping, index);
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
+ block_group->key.objectid);
+ }
- addr = kmap(page);
- entry = addr + start_offset;
+ iput(inode);
+ return ret;
+}
- memset(addr, 0, PAGE_CACHE_SIZE);
- while (1) {
- struct btrfs_free_space *e;
+static noinline_for_stack
+int write_cache_extent_entries(struct io_ctl *io_ctl,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ int *entries, int *bitmaps,
+ struct list_head *bitmap_list)
+{
+ int ret;
+ struct btrfs_free_cluster *cluster = NULL;
+ struct rb_node *node = rb_first(&ctl->free_space_offset);
- e = rb_entry(node, struct btrfs_free_space, offset_index);
- entries++;
+ /* Get the cluster for this block_group if it exists */
+ if (block_group && !list_empty(&block_group->cluster_list)) {
+ cluster = list_entry(block_group->cluster_list.next,
+ struct btrfs_free_cluster,
+ block_group_list);
+ }
- entry->offset = cpu_to_le64(e->offset);
- entry->bytes = cpu_to_le64(e->bytes);
- if (e->bitmap) {
- entry->type = BTRFS_FREE_SPACE_BITMAP;
- list_add_tail(&e->list, &bitmap_list);
- bitmaps++;
- } else {
- entry->type = BTRFS_FREE_SPACE_EXTENT;
- }
- node = rb_next(node);
- if (!node)
- break;
- offset += sizeof(struct btrfs_free_space_entry);
- if (offset + sizeof(struct btrfs_free_space_entry) >=
- PAGE_CACHE_SIZE)
- break;
- entry++;
- }
- *crc = ~(u32)0;
- *crc = btrfs_csum_data(root, addr + start_offset, *crc,
- PAGE_CACHE_SIZE - start_offset);
- kunmap(page);
+ if (!node && cluster) {
+ node = rb_first(&cluster->root);
+ cluster = NULL;
+ }
- btrfs_csum_final(*crc, (char *)crc);
- crc++;
+ /* Write out the extent entries */
+ while (node) {
+ struct btrfs_free_space *e;
- bytes += PAGE_CACHE_SIZE;
+ e = rb_entry(node, struct btrfs_free_space, offset_index);
+ *entries += 1;
- ClearPageChecked(page);
- set_page_extent_mapped(page);
- SetPageUptodate(page);
- set_page_dirty(page);
+ ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
+ e->bitmap);
+ if (ret)
+ goto fail;
- /*
- * We need to release our reference we got for grab_cache_page,
- * except for the first page which will hold our checksums, we
- * do that below.
- */
- if (index != 0) {
- unlock_page(page);
- page_cache_release(page);
+ if (e->bitmap) {
+ list_add_tail(&e->list, bitmap_list);
+ *bitmaps += 1;
+ }
+ node = rb_next(node);
+ if (!node && cluster) {
+ node = rb_first(&cluster->root);
+ cluster = NULL;
}
-
- page_cache_release(page);
-
- index++;
- } while (node);
-
- /* Write out the bitmaps */
- list_for_each_safe(pos, n, &bitmap_list) {
- void *addr;
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
-
- page = find_get_page(inode->i_mapping, index);
-
- addr = kmap(page);
- memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
- *crc = ~(u32)0;
- *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
- kunmap(page);
- btrfs_csum_final(*crc, (char *)crc);
- crc++;
- bytes += PAGE_CACHE_SIZE;
-
- ClearPageChecked(page);
- set_page_extent_mapped(page);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- page_cache_release(page);
- page_cache_release(page);
- list_del_init(&entry->list);
- index++;
- }
-
- /* Zero out the rest of the pages just to make sure */
- while (index <= last_index) {
- void *addr;
-
- page = find_get_page(inode->i_mapping, index);
-
- addr = kmap(page);
- memset(addr, 0, PAGE_CACHE_SIZE);
- kunmap(page);
- ClearPageChecked(page);
- set_page_extent_mapped(page);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- page_cache_release(page);
- page_cache_release(page);
- bytes += PAGE_CACHE_SIZE;
- index++;
- }
-
- btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
-
- /* Write the checksums and trans id to the first page */
- {
- void *addr;
- u64 *gen;
-
- page = find_get_page(inode->i_mapping, 0);
-
- addr = kmap(page);
- memcpy(addr, checksums, sizeof(u32) * num_checksums);
- gen = addr + (sizeof(u32) * num_checksums);
- *gen = trans->transid;
- kunmap(page);
- ClearPageChecked(page);
- set_page_extent_mapped(page);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- page_cache_release(page);
- page_cache_release(page);
}
- BTRFS_I(inode)->generation = trans->transid;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ return 0;
+fail:
+ return -ENOSPC;
+}
- filemap_write_and_wait(inode->i_mapping);
+static noinline_for_stack int
+update_cache_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path, u64 offset,
+ int entries, int bitmaps)
+{
+ struct btrfs_key key;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
+ key.offset = offset;
key.type = 0;
- ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- ret = 0;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
- goto out_free;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
+ goto fail;
}
leaf = path->nodes[0];
if (ret > 0) {
struct btrfs_key found_key;
- BUG_ON(!path->slots[0]);
+ ASSERT(path->slots[0]);
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
- found_key.offset != block_group->key.objectid) {
- ret = 0;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL,
- GFP_NOFS);
- btrfs_release_path(root, path);
- goto out_free;
+ found_key.offset != offset) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ NULL, GFP_NOFS);
+ btrfs_release_path(path);
+ goto fail;
}
}
+
+ BTRFS_I(inode)->generation = trans->transid;
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
btrfs_set_free_space_entries(leaf, header, entries);
btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
btrfs_set_free_space_generation(leaf, header, trans->transid);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- ret = 1;
+ return 0;
-out_free:
- if (ret == 0) {
- invalidate_inode_pages2_range(inode->i_mapping, 0, index);
+fail:
+ return -1;
+}
+
+static noinline_for_stack int
+write_pinned_extent_entries(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct io_ctl *io_ctl,
+ int *entries)
+{
+ u64 start, extent_start, extent_end, len;
+ struct extent_io_tree *unpin = NULL;
+ int ret;
+
+ if (!block_group)
+ return 0;
+
+ /*
+ * We want to add any pinned extents to our free space cache
+ * so we don't leak the space
+ *
+ * We shouldn't have switched the pinned extents yet so this is the
+ * right one
+ */
+ unpin = root->fs_info->pinned_extents;
+
+ start = block_group->key.objectid;
+
+ while (start < block_group->key.objectid + block_group->key.offset) {
+ ret = find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL);
+ if (ret)
+ return 0;
+
+ /* This pinned extent is out of our range */
+ if (extent_start >= block_group->key.objectid +
+ block_group->key.offset)
+ return 0;
+
+ extent_start = max(extent_start, start);
+ extent_end = min(block_group->key.objectid +
+ block_group->key.offset, extent_end + 1);
+ len = extent_end - extent_start;
+
+ *entries += 1;
+ ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
+ if (ret)
+ return -ENOSPC;
+
+ start = extent_end;
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+ int ret;
+
+ /* Write out the bitmaps */
+ list_for_each_safe(pos, n, bitmap_list) {
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+
+ ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
+ if (ret)
+ return -ENOSPC;
+ list_del_init(&entry->list);
+ }
+
+ return 0;
+}
+
+static int flush_dirty_cache(struct inode *inode)
+{
+ int ret;
+
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (ret)
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
+
+ return ret;
+}
+
+static void noinline_for_stack
+cleanup_write_cache_enospc(struct inode *inode,
+ struct io_ctl *io_ctl,
+ struct extent_state **cached_state,
+ struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+
+ list_for_each_safe(pos, n, bitmap_list) {
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+ list_del_init(&entry->list);
+ }
+ io_ctl_drop_pages(io_ctl);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, cached_state,
+ GFP_NOFS);
+}
+
+/**
+ * __btrfs_write_out_cache - write out cached info to an inode
+ * @root - the root the inode belongs to
+ * @ctl - the free space cache we are going to write out
+ * @block_group - the block_group for this cache if it belongs to a block_group
+ * @trans - the trans handle
+ * @path - the path to use
+ * @offset - the offset for the key we'll insert
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount. This will return 0 if it was successful in writing the cache out,
+ * and -1 if it was not.
+ */
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 offset)
+{
+ struct extent_state *cached_state = NULL;
+ struct io_ctl io_ctl;
+ LIST_HEAD(bitmap_list);
+ int entries = 0;
+ int bitmaps = 0;
+ int ret;
+
+ if (!i_size_read(inode))
+ return -1;
+
+ ret = io_ctl_init(&io_ctl, inode, root, 1);
+ if (ret)
+ return -1;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+ down_write(&block_group->data_rwsem);
spin_lock(&block_group->lock);
- block_group->disk_cache_state = BTRFS_DC_ERROR;
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ up_write(&block_group->data_rwsem);
+ BTRFS_I(inode)->generation = 0;
+ ret = 0;
+ goto out;
+ }
spin_unlock(&block_group->lock);
+ }
+
+ /* Lock all pages first so we can lock the extent safely. */
+ io_ctl_prepare_pages(&io_ctl, inode, 0);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ 0, &cached_state);
+
+ io_ctl_set_generation(&io_ctl, trans->transid);
+
+ /* Write out the extent entries in the free space cache */
+ ret = write_cache_extent_entries(&io_ctl, ctl,
+ block_group, &entries, &bitmaps,
+ &bitmap_list);
+ if (ret)
+ goto out_nospc;
+
+ /*
+ * Some spaces that are freed in the current transaction are pinned;
+ * they will be added into the free space cache after the transaction
+ * is committed, so we shouldn't lose them.
+ */
+ ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+ if (ret)
+ goto out_nospc;
+
+ /* At last, we write out all the bitmaps. */
+ ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ if (ret)
+ goto out_nospc;
+
+ /* Zero out the rest of the pages just to make sure */
+ io_ctl_zero_remaining_pages(&io_ctl);
+
+ /* Everything is written out, now we dirty the pages in the file. */
+ ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ 0, i_size_read(inode), &cached_state);
+ if (ret)
+ goto out_nospc;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+ /*
+ * Release the pages and unlock the extent, we will flush
+ * them out later
+ */
+ io_ctl_drop_pages(&io_ctl);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
+ goto out;
+
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
+ entries, bitmaps);
+out:
+ io_ctl_free(&io_ctl);
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
- kfree(checksums);
btrfs_update_inode(trans, root, inode);
+ return ret;
+
+out_nospc:
+ cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+
+ goto out;
+}
+
+int btrfs_write_out_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct inode *inode;
+ int ret = 0;
+
+ root = root->fs_info->tree_root;
+
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ spin_unlock(&block_group->lock);
+
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode))
+ return 0;
+
+ ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
+ path, block_group->key.objectid);
+ if (ret) {
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+ spin_unlock(&block_group->lock);
+ ret = 0;
+#ifdef DEBUG
+ btrfs_err(root->fs_info,
+ "failed to write free space cache for block group %llu",
+ block_group->key.objectid);
+#endif
+ }
+
iput(inode);
return ret;
}
-static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
+static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
u64 offset)
{
- BUG_ON(offset < bitmap_start);
+ ASSERT(offset >= bitmap_start);
offset -= bitmap_start;
- return (unsigned long)(div64_u64(offset, sectorsize));
+ return (unsigned long)(div_u64(offset, unit));
}
-static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
+static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
- return (unsigned long)(div64_u64(bytes, sectorsize));
+ return (unsigned long)(div_u64(bytes, unit));
}
-static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
+static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
u64 offset)
{
u64 bitmap_start;
u64 bytes_per_bitmap;
- bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
- bitmap_start = offset - block_group->key.objectid;
+ bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
+ bitmap_start = offset - ctl->start;
bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
bitmap_start *= bytes_per_bitmap;
- bitmap_start += block_group->key.objectid;
+ bitmap_start += ctl->start;
return bitmap_start;
}
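(Illustrative aside, not part of the patch: the three helpers above reduce all bitmap addressing to integer arithmetic on ctl->unit, ctl->start and BITS_PER_BITMAP. A minimal userspace sketch of that arithmetic, with assumed example values, behaves as follows.)

#include <stdio.h>
#include <stdint.h>

#define BITS_PER_BITMAP (4096 * 8)	/* assumed: one page worth of bits */

/* byte offset -> bit index within one bitmap (mirrors offset_to_bit) */
static unsigned long offset_to_bit(uint64_t bitmap_start, uint32_t unit, uint64_t offset)
{
	return (unsigned long)((offset - bitmap_start) / unit);
}

/* round an offset down to the start of the bitmap covering it
 * (mirrors offset_to_bitmap, measured from ctl->start) */
static uint64_t offset_to_bitmap(uint64_t ctl_start, uint32_t unit, uint64_t offset)
{
	uint64_t bytes_per_bitmap = (uint64_t)BITS_PER_BITMAP * unit;
	uint64_t nth = (offset - ctl_start) / bytes_per_bitmap;

	return nth * bytes_per_bitmap + ctl_start;
}

int main(void)
{
	uint32_t unit = 4096;			/* assumed sector size */
	uint64_t ctl_start = 1024ULL << 20;	/* assumed block group start */
	uint64_t offset = ctl_start + 300ULL * 1024 * 1024;
	uint64_t bmp = offset_to_bitmap(ctl_start, unit, offset);

	/* with these values each bitmap covers 128 MiB, so the offset lands
	 * in the third bitmap, 11264 bits in */
	printf("bitmap start %llu, bit %lu\n", (unsigned long long)bmp,
	       offset_to_bit(bmp, unit, offset));
	return 0;
}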
@@ -838,10 +1307,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
* logically.
*/
if (bitmap) {
- WARN_ON(info->bitmap);
+ if (info->bitmap) {
+ WARN_ON_ONCE(1);
+ return -EEXIST;
+ }
p = &(*p)->rb_right;
} else {
- WARN_ON(!info->bitmap);
+ if (!info->bitmap) {
+ WARN_ON_ONCE(1);
+ return -EEXIST;
+ }
p = &(*p)->rb_left;
}
}
@@ -861,10 +1336,10 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
* offset.
*/
static struct btrfs_free_space *
-tree_search_offset(struct btrfs_block_group_cache *block_group,
+tree_search_offset(struct btrfs_free_space_ctl *ctl,
u64 offset, int bitmap_only, int fuzzy)
{
- struct rb_node *n = block_group->free_space_offset.rb_node;
+ struct rb_node *n = ctl->free_space_offset.rb_node;
struct btrfs_free_space *entry, *prev = NULL;
/* find entry that is closest to the 'offset' */
@@ -910,18 +1385,13 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
* if previous extent entry covers the offset,
* we should return it instead of the bitmap entry
*/
- n = &entry->offset_index;
- while (1) {
- n = rb_prev(n);
- if (!n)
- break;
+ n = rb_prev(&entry->offset_index);
+ if (n) {
prev = rb_entry(n, struct btrfs_free_space,
offset_index);
- if (!prev->bitmap) {
- if (prev->offset + prev->bytes > offset)
- entry = prev;
- break;
- }
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ entry = prev;
}
}
return entry;
@@ -937,7 +1407,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
if (n) {
entry = rb_entry(n, struct btrfs_free_space,
offset_index);
- BUG_ON(entry->offset > offset);
+ ASSERT(entry->offset <= offset);
} else {
if (fuzzy)
return entry;
@@ -947,21 +1417,15 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
}
if (entry->bitmap) {
- n = &entry->offset_index;
- while (1) {
- n = rb_prev(n);
- if (!n)
- break;
+ n = rb_prev(&entry->offset_index);
+ if (n) {
prev = rb_entry(n, struct btrfs_free_space,
offset_index);
- if (!prev->bitmap) {
- if (prev->offset + prev->bytes > offset)
- return prev;
- break;
- }
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ return prev;
}
- if (entry->offset + BITS_PER_BITMAP *
- block_group->sectorsize > offset)
+ if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
return entry;
} else if (entry->offset + entry->bytes > offset)
return entry;
@@ -972,7 +1436,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
while (1) {
if (entry->bitmap) {
if (entry->offset + BITS_PER_BITMAP *
- block_group->sectorsize > offset)
+ ctl->unit > offset)
break;
} else {
if (entry->offset + entry->bytes > offset)
@@ -988,42 +1452,49 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
}
static inline void
-__unlink_free_space(struct btrfs_block_group_cache *block_group,
+__unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- rb_erase(&info->offset_index, &block_group->free_space_offset);
- block_group->free_extents--;
+ rb_erase(&info->offset_index, &ctl->free_space_offset);
+ ctl->free_extents--;
}
-static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- __unlink_free_space(block_group, info);
- block_group->free_space -= info->bytes;
+ __unlink_free_space(ctl, info);
+ ctl->free_space -= info->bytes;
}
-static int link_free_space(struct btrfs_block_group_cache *block_group,
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
int ret = 0;
- BUG_ON(!info->bitmap && !info->bytes);
- ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
+ ASSERT(info->bytes || info->bitmap);
+ ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
&info->offset_index, (info->bitmap != NULL));
if (ret)
return ret;
- block_group->free_space += info->bytes;
- block_group->free_extents++;
+ ctl->free_space += info->bytes;
+ ctl->free_extents++;
return ret;
}
-static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
+ struct btrfs_block_group_cache *block_group = ctl->private;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->key.offset;
+ u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+
+ max_bitmaps = max(max_bitmaps, 1);
+
+ ASSERT(ctl->total_bitmaps <= max_bitmaps);
/*
* The goal is to keep the total amount of memory used per 1gb of space
@@ -1041,10 +1512,10 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
* sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
* we add more bitmaps.
*/
- bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+ bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
if (bitmap_bytes >= max_bytes) {
- block_group->extents_thresh = 0;
+ ctl->extents_thresh = 0;
return;
}
@@ -1055,144 +1526,180 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
extent_bytes = max_bytes - bitmap_bytes;
extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
- block_group->extents_thresh =
+ ctl->extents_thresh =
div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}
-static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
- struct btrfs_free_space *info, u64 offset,
- u64 bytes)
+static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ u64 offset, u64 bytes)
{
- unsigned long start, end;
- unsigned long i;
+ unsigned long start, count;
- start = offset_to_bit(info->offset, block_group->sectorsize, offset);
- end = start + bytes_to_bits(bytes, block_group->sectorsize);
- BUG_ON(end > BITS_PER_BITMAP);
+ start = offset_to_bit(info->offset, ctl->unit, offset);
+ count = bytes_to_bits(bytes, ctl->unit);
+ ASSERT(start + count <= BITS_PER_BITMAP);
- for (i = start; i < end; i++)
- clear_bit(i, info->bitmap);
+ bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
- block_group->free_space -= bytes;
}
-static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ __bitmap_clear_bits(ctl, info, offset, bytes);
+ ctl->free_space -= bytes;
+}
+
+static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes)
{
- unsigned long start, end;
- unsigned long i;
+ unsigned long start, count;
- start = offset_to_bit(info->offset, block_group->sectorsize, offset);
- end = start + bytes_to_bits(bytes, block_group->sectorsize);
- BUG_ON(end > BITS_PER_BITMAP);
+ start = offset_to_bit(info->offset, ctl->unit, offset);
+ count = bytes_to_bits(bytes, ctl->unit);
+ ASSERT(start + count <= BITS_PER_BITMAP);
- for (i = start; i < end; i++)
- set_bit(i, info->bitmap);
+ bitmap_set(info->bitmap, start, count);
info->bytes += bytes;
- block_group->free_space += bytes;
+ ctl->free_space += bytes;
}
-static int search_bitmap(struct btrfs_block_group_cache *block_group,
+/*
+ * If we cannot find a suitable extent, we will use bytes to record
+ * the size of the max extent.
+ */
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
u64 *bytes)
{
unsigned long found_bits = 0;
+ unsigned long max_bits = 0;
unsigned long bits, i;
unsigned long next_zero;
+ unsigned long extent_bits;
- i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
+ i = offset_to_bit(bitmap_info->offset, ctl->unit,
max_t(u64, *offset, bitmap_info->offset));
- bits = bytes_to_bits(*bytes, block_group->sectorsize);
+ bits = bytes_to_bits(*bytes, ctl->unit);
- for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
- i < BITS_PER_BITMAP;
- i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+ for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
next_zero = find_next_zero_bit(bitmap_info->bitmap,
BITS_PER_BITMAP, i);
- if ((next_zero - i) >= bits) {
- found_bits = next_zero - i;
+ extent_bits = next_zero - i;
+ if (extent_bits >= bits) {
+ found_bits = extent_bits;
break;
+ } else if (extent_bits > max_bits) {
+ max_bits = extent_bits;
}
i = next_zero;
}
if (found_bits) {
- *offset = (u64)(i * block_group->sectorsize) +
- bitmap_info->offset;
- *bytes = (u64)(found_bits) * block_group->sectorsize;
+ *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
+ *bytes = (u64)(found_bits) * ctl->unit;
return 0;
}
+ *bytes = (u64)(max_bits) * ctl->unit;
return -1;
}
-static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
- *block_group, u64 *offset,
- u64 *bytes, int debug)
+/* Cache the size of the max extent in bytes */
+static struct btrfs_free_space *
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+ unsigned long align, u64 *max_extent_size)
{
struct btrfs_free_space *entry;
struct rb_node *node;
+ u64 tmp;
+ u64 align_off;
int ret;
- if (!block_group->free_space_offset.rb_node)
- return NULL;
+ if (!ctl->free_space_offset.rb_node)
+ goto out;
- entry = tree_search_offset(block_group,
- offset_to_bitmap(block_group, *offset),
- 0, 1);
+ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
if (!entry)
- return NULL;
+ goto out;
for (node = &entry->offset_index; node; node = rb_next(node)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
- if (entry->bytes < *bytes)
+ if (entry->bytes < *bytes) {
+ if (entry->bytes > *max_extent_size)
+ *max_extent_size = entry->bytes;
+ continue;
+ }
+
+ /* make sure the space returned is big enough
+ * to match our requested alignment
+ */
+ if (*bytes >= align) {
+ tmp = entry->offset - ctl->start + align - 1;
+ do_div(tmp, align);
+ tmp = tmp * align + ctl->start;
+ align_off = tmp - entry->offset;
+ } else {
+ align_off = 0;
+ tmp = entry->offset;
+ }
+
+ if (entry->bytes < *bytes + align_off) {
+ if (entry->bytes > *max_extent_size)
+ *max_extent_size = entry->bytes;
continue;
+ }
if (entry->bitmap) {
- ret = search_bitmap(block_group, entry, offset, bytes);
- if (!ret)
+ u64 size = *bytes;
+
+ ret = search_bitmap(ctl, entry, &tmp, &size);
+ if (!ret) {
+ *offset = tmp;
+ *bytes = size;
return entry;
+ } else if (size > *max_extent_size) {
+ *max_extent_size = size;
+ }
continue;
}
- *offset = entry->offset;
- *bytes = entry->bytes;
+ *offset = tmp;
+ *bytes = entry->bytes - align_off;
return entry;
}
-
+out:
return NULL;
}
-static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset)
{
- u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
- int max_bitmaps = (int)div64_u64(block_group->key.offset +
- bytes_per_bg - 1, bytes_per_bg);
- BUG_ON(block_group->total_bitmaps >= max_bitmaps);
-
- info->offset = offset_to_bitmap(block_group, offset);
+ info->offset = offset_to_bitmap(ctl, offset);
info->bytes = 0;
- link_free_space(block_group, info);
- block_group->total_bitmaps++;
+ INIT_LIST_HEAD(&info->list);
+ link_free_space(ctl, info);
+ ctl->total_bitmaps++;
- recalculate_thresholds(block_group);
+ ctl->op->recalc_thresholds(ctl);
}
-static void free_bitmap(struct btrfs_block_group_cache *block_group,
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info)
{
- unlink_free_space(block_group, bitmap_info);
+ unlink_free_space(ctl, bitmap_info);
kfree(bitmap_info->bitmap);
- kfree(bitmap_info);
- block_group->total_bitmaps--;
- recalculate_thresholds(block_group);
+ kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
+ ctl->total_bitmaps--;
+ ctl->op->recalc_thresholds(ctl);
}
-static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
+static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info,
u64 *offset, u64 *bytes)
{
@@ -1201,40 +1708,35 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro
int ret;
again:
- end = bitmap_info->offset +
- (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
+ end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
/*
- * XXX - this can go away after a few releases.
- *
- * since the only user of btrfs_remove_free_space is the tree logging
- * stuff, and the only way to test that is under crash conditions, we
- * want to have this debug stuff here just in case somethings not
- * working. Search the bitmap for the space we are trying to use to
- * make sure its actually there. If its not there then we need to stop
- * because something has gone wrong.
+ * We need to search for bits in this bitmap. We could only cover some
+ * of the extent in this bitmap thanks to how we add space, so we need
+ * to search for as much as it as we can and clear that amount, and then
+ * go searching for the next bit.
*/
search_start = *offset;
- search_bytes = *bytes;
+ search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
- ret = search_bitmap(block_group, bitmap_info, &search_start,
- &search_bytes);
- BUG_ON(ret < 0 || search_start != *offset);
+ ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
+ if (ret < 0 || search_start != *offset)
+ return -EINVAL;
- if (*offset > bitmap_info->offset && *offset + *bytes > end) {
- bitmap_clear_bits(block_group, bitmap_info, *offset,
- end - *offset + 1);
- *bytes -= end - *offset + 1;
- *offset = end + 1;
- } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
- bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
- *bytes = 0;
- }
+ /* We may have found more bits than what we need */
+ search_bytes = min(search_bytes, *bytes);
+
+ /* Cannot clear past the end of the bitmap */
+ search_bytes = min(search_bytes, end - search_start + 1);
+
+ bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+ *offset += search_bytes;
+ *bytes -= search_bytes;
if (*bytes) {
struct rb_node *next = rb_next(&bitmap_info->offset_index);
if (!bitmap_info->bytes)
- free_bitmap(block_group, bitmap_info);
+ free_bitmap(ctl, bitmap_info);
/*
* no entry after this bitmap, but we still have bytes to
@@ -1260,70 +1762,149 @@ again:
* everything over again.
*/
search_start = *offset;
- search_bytes = *bytes;
- ret = search_bitmap(block_group, bitmap_info, &search_start,
+ search_bytes = ctl->unit;
+ ret = search_bitmap(ctl, bitmap_info, &search_start,
&search_bytes);
if (ret < 0 || search_start != *offset)
return -EAGAIN;
goto again;
} else if (!bitmap_info->bytes)
- free_bitmap(block_group, bitmap_info);
+ free_bitmap(ctl, bitmap_info);
return 0;
}
-static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
- struct btrfs_free_space *info)
+static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
{
- struct btrfs_free_space *bitmap_info;
- int added = 0;
- u64 bytes, offset, end;
- int ret;
+ u64 bytes_to_set = 0;
+ u64 end;
+
+ end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
+
+ bytes_to_set = min(end - offset, bytes);
+
+ bitmap_set_bits(ctl, info, offset, bytes_to_set);
+
+ return bytes_to_set;
+
+}
+
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_block_group_cache *block_group = ctl->private;
/*
* If we are below the extents threshold then we can add this as an
* extent, and don't have to deal with the bitmap
*/
- if (block_group->free_extents < block_group->extents_thresh &&
- info->bytes > block_group->sectorsize * 4)
- return 0;
+ if (ctl->free_extents < ctl->extents_thresh) {
+ /*
+ * If this block group has some small extents we don't want to
+ * use up all of our free slots in the cache with them, we want
+ * to reserve them for larger extents, however if we have plenty
+ * of cache left then go ahead and add them, no sense in adding
+ * the overhead of a bitmap if we don't have to.
+ */
+ if (info->bytes <= block_group->sectorsize * 4) {
+ if (ctl->free_extents * 2 <= ctl->extents_thresh)
+ return false;
+ } else {
+ return false;
+ }
+ }
/*
- * some block groups are so tiny they can't be enveloped by a bitmap, so
- * don't even bother to create a bitmap for this
+ * The original block groups from mkfs can be really small, like 8
+ * megabytes, so don't bother with a bitmap for those entries. However
+ * some block groups can be smaller than what a bitmap would cover but
+ * are still large enough that they could overflow the 32k memory limit,
+ * so still allow those block groups to have a bitmap
+ * entry.
*/
- if (BITS_PER_BITMAP * block_group->sectorsize >
- block_group->key.offset)
- return 0;
+ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
+ return false;
+
+ return true;
+}
+
+static struct btrfs_free_space_op free_space_op = {
+ .recalc_thresholds = recalculate_thresholds,
+ .use_bitmap = use_bitmap,
+};
+
+static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_free_space *bitmap_info;
+ struct btrfs_block_group_cache *block_group = NULL;
+ int added = 0;
+ u64 bytes, offset, bytes_added;
+ int ret;
bytes = info->bytes;
offset = info->offset;
+ if (!ctl->op->use_bitmap(ctl, info))
+ return 0;
+
+ if (ctl->op == &free_space_op)
+ block_group = ctl->private;
again:
- bitmap_info = tree_search_offset(block_group,
- offset_to_bitmap(block_group, offset),
+ /*
+ * Since we link bitmaps right into the cluster we need to see if we
+ * have a cluster here, and if so and it has our bitmap we need to add
+ * the free space to that bitmap.
+ */
+ if (block_group && !list_empty(&block_group->cluster_list)) {
+ struct btrfs_free_cluster *cluster;
+ struct rb_node *node;
+ struct btrfs_free_space *entry;
+
+ cluster = list_entry(block_group->cluster_list.next,
+ struct btrfs_free_cluster,
+ block_group_list);
+ spin_lock(&cluster->lock);
+ node = rb_first(&cluster->root);
+ if (!node) {
+ spin_unlock(&cluster->lock);
+ goto no_cluster_bitmap;
+ }
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!entry->bitmap) {
+ spin_unlock(&cluster->lock);
+ goto no_cluster_bitmap;
+ }
+
+ if (entry->offset == offset_to_bitmap(ctl, offset)) {
+ bytes_added = add_bytes_to_bitmap(ctl, entry,
+ offset, bytes);
+ bytes -= bytes_added;
+ offset += bytes_added;
+ }
+ spin_unlock(&cluster->lock);
+ if (!bytes) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+no_cluster_bitmap:
+ bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!bitmap_info) {
- BUG_ON(added);
+ ASSERT(added == 0);
goto new_bitmap;
}
- end = bitmap_info->offset +
- (u64)(BITS_PER_BITMAP * block_group->sectorsize);
-
- if (offset >= bitmap_info->offset && offset + bytes > end) {
- bitmap_set_bits(block_group, bitmap_info, offset,
- end - offset);
- bytes -= end - offset;
- offset = end;
- added = 0;
- } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
- bitmap_set_bits(block_group, bitmap_info, offset, bytes);
- bytes = 0;
- } else {
- BUG();
- }
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+ bytes -= bytes_added;
+ offset += bytes_added;
+ added = 0;
if (!bytes) {
ret = 1;
@@ -1333,19 +1914,19 @@ again:
new_bitmap:
if (info && info->bitmap) {
- add_new_bitmap(block_group, info, offset);
+ add_new_bitmap(ctl, info, offset);
added = 1;
info = NULL;
goto again;
} else {
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
/* no pre-allocated info, allocate a new one */
if (!info) {
- info = kzalloc(sizeof(struct btrfs_free_space),
- GFP_NOFS);
+ info = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
if (!info) {
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
ret = -ENOMEM;
goto out;
}
@@ -1353,7 +1934,7 @@ new_bitmap:
/* allocate the bitmap */
info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
if (!info->bitmap) {
ret = -ENOMEM;
goto out;
@@ -1365,13 +1946,13 @@ out:
if (info) {
if (info->bitmap)
kfree(info->bitmap);
- kfree(info);
+ kmem_cache_free(btrfs_free_space_cachep, info);
}
return ret;
}
-bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
+static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
struct btrfs_free_space *left_info;
@@ -1385,54 +1966,54 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
* are adding, if there is remove that struct and add a new one to
* cover the entire range
*/
- right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
+ right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
if (right_info && rb_prev(&right_info->offset_index))
left_info = rb_entry(rb_prev(&right_info->offset_index),
struct btrfs_free_space, offset_index);
else
- left_info = tree_search_offset(block_group, offset - 1, 0, 0);
+ left_info = tree_search_offset(ctl, offset - 1, 0, 0);
if (right_info && !right_info->bitmap) {
if (update_stat)
- unlink_free_space(block_group, right_info);
+ unlink_free_space(ctl, right_info);
else
- __unlink_free_space(block_group, right_info);
+ __unlink_free_space(ctl, right_info);
info->bytes += right_info->bytes;
- kfree(right_info);
+ kmem_cache_free(btrfs_free_space_cachep, right_info);
merged = true;
}
if (left_info && !left_info->bitmap &&
left_info->offset + left_info->bytes == offset) {
if (update_stat)
- unlink_free_space(block_group, left_info);
+ unlink_free_space(ctl, left_info);
else
- __unlink_free_space(block_group, left_info);
+ __unlink_free_space(ctl, left_info);
info->offset = left_info->offset;
info->bytes += left_info->bytes;
- kfree(left_info);
+ kmem_cache_free(btrfs_free_space_cachep, left_info);
merged = true;
}
return merged;
}
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes)
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+ u64 offset, u64 bytes)
{
struct btrfs_free_space *info;
int ret = 0;
- info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+ info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
if (!info)
return -ENOMEM;
info->offset = offset;
info->bytes = bytes;
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
- if (try_merge_free_space(block_group, info, true))
+ if (try_merge_free_space(ctl, info, true))
goto link;
/*
@@ -1440,7 +2021,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
* extent then we know we're going to have to allocate a new extent, so
* before we do that see if we need to drop this into a bitmap
*/
- ret = insert_into_bitmap(block_group, info);
+ ret = insert_into_bitmap(ctl, info);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -1448,15 +2029,15 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
goto out;
}
link:
- ret = link_free_space(block_group, info);
+ ret = link_free_space(ctl, info);
if (ret)
- kfree(info);
+ kmem_cache_free(btrfs_free_space_cachep, info);
out:
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
if (ret) {
- printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
- BUG_ON(ret == -EEXIST);
+ printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret);
+ ASSERT(ret != -EEXIST);
}
return ret;
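(Illustrative aside, not part of the patch: __btrfs_add_free_space() above first lets try_merge_free_space() absorb neighbouring extent entries before anything is linked or pushed into a bitmap. A small standalone sketch of that merge rule, with made-up offsets, looks like this.)

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical entries: the range being freed and its two neighbours */
	uint64_t offset = 200, bytes = 50;		/* [200, 250) comes back */
	uint64_t left_off = 120, left_bytes = 80;	/* [120, 200) */
	uint64_t right_off = 250, right_bytes = 30;	/* [250, 280) */

	/* the right neighbour starts exactly where the freed range ends */
	if (right_off == offset + bytes)
		bytes += right_bytes;

	/* the left neighbour ends exactly where the freed range starts */
	if (left_off + left_bytes == offset) {
		offset = left_off;
		bytes += left_bytes;
	}

	/* prints "merged entry: [120, 280)" */
	printf("merged entry: [%llu, %llu)\n",
	       (unsigned long long)offset, (unsigned long long)(offset + bytes));
	return 0;
}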
@@ -1465,116 +2046,89 @@ out:
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
- struct btrfs_free_space *next_info = NULL;
- int ret = 0;
+ int ret;
+ bool re_search = false;
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
again:
- info = tree_search_offset(block_group, offset, 0, 0);
+ ret = 0;
+ if (!bytes)
+ goto out_lock;
+
+ info = tree_search_offset(ctl, offset, 0, 0);
if (!info) {
/*
* oops didn't find an extent that matched the space we wanted
* to remove, look for a bitmap instead
*/
- info = tree_search_offset(block_group,
- offset_to_bitmap(block_group, offset),
+ info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info) {
- WARN_ON(1);
- goto out_lock;
- }
- }
-
- if (info->bytes < bytes && rb_next(&info->offset_index)) {
- u64 end;
- next_info = rb_entry(rb_next(&info->offset_index),
- struct btrfs_free_space,
- offset_index);
-
- if (next_info->bitmap)
- end = next_info->offset + BITS_PER_BITMAP *
- block_group->sectorsize - 1;
- else
- end = next_info->offset + next_info->bytes;
-
- if (next_info->bytes < bytes ||
- next_info->offset > offset || offset > end) {
- printk(KERN_CRIT "Found free space at %llu, size %llu,"
- " trying to use %llu\n",
- (unsigned long long)info->offset,
- (unsigned long long)info->bytes,
- (unsigned long long)bytes);
- WARN_ON(1);
- ret = -EINVAL;
+ /*
+ * If we found a partial bit of our free space in a
+ * bitmap but then couldn't find the other part this may
+ * be a problem, so WARN about it.
+ */
+ WARN_ON(re_search);
goto out_lock;
}
-
- info = next_info;
}
- if (info->bytes == bytes) {
- unlink_free_space(block_group, info);
- if (info->bitmap) {
- kfree(info->bitmap);
- block_group->total_bitmaps--;
- }
- kfree(info);
- goto out_lock;
- }
+ re_search = false;
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info);
+ if (offset == info->offset) {
+ u64 to_free = min(bytes, info->bytes);
- if (!info->bitmap && info->offset == offset) {
- unlink_free_space(block_group, info);
- info->offset += bytes;
- info->bytes -= bytes;
- link_free_space(block_group, info);
- goto out_lock;
- }
+ info->bytes -= to_free;
+ info->offset += to_free;
+ if (info->bytes) {
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
+ } else {
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ }
- if (!info->bitmap && info->offset <= offset &&
- info->offset + info->bytes >= offset + bytes) {
- u64 old_start = info->offset;
- /*
- * we're freeing space in the middle of the info,
- * this can happen during tree log replay
- *
- * first unlink the old info and then
- * insert it again after the hole we're creating
- */
- unlink_free_space(block_group, info);
- if (offset + bytes < info->offset + info->bytes) {
- u64 old_end = info->offset + info->bytes;
+ offset += to_free;
+ bytes -= to_free;
+ goto again;
+ } else {
+ u64 old_end = info->bytes + info->offset;
- info->offset = offset + bytes;
- info->bytes = old_end - info->offset;
- ret = link_free_space(block_group, info);
+ info->bytes = offset - info->offset;
+ ret = link_free_space(ctl, info);
WARN_ON(ret);
if (ret)
goto out_lock;
- } else {
- /* the hole we're creating ends at the end
- * of the info struct, just free the info
- */
- kfree(info);
- }
- spin_unlock(&block_group->tree_lock);
- /* step two, insert a new info struct to cover
- * anything before the hole
- */
- ret = btrfs_add_free_space(block_group, old_start,
- offset - old_start);
- WARN_ON(ret);
- goto out;
+ /* Not enough bytes in this entry to satisfy us */
+ if (old_end < offset + bytes) {
+ bytes -= old_end - offset;
+ offset = old_end;
+ goto again;
+ } else if (old_end == offset + bytes) {
+ /* all done */
+ goto out_lock;
+ }
+ spin_unlock(&ctl->tree_lock);
+
+ ret = btrfs_add_free_space(block_group, offset + bytes,
+ old_end - (offset + bytes));
+ WARN_ON(ret);
+ goto out;
+ }
}
- ret = remove_from_bitmap(block_group, info, &offset, &bytes);
- if (ret == -EAGAIN)
+ ret = remove_from_bitmap(ctl, info, &offset, &bytes);
+ if (ret == -EAGAIN) {
+ re_search = true;
goto again;
- BUG_ON(ret);
+ }
out_lock:
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
out:
return ret;
}
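(Illustrative aside, not part of the patch: the rewritten btrfs_remove_free_space() above handles a removal that starts inside an extent entry by keeping the front piece, re-adding any tail piece, and looping again if the removal runs past the entry. A tiny sketch of that case analysis, with assumed numbers:)

#include <stdio.h>
#include <stdint.h>

/* removal of [offset, offset + bytes) from a free extent [start, start + len) */
static void remove_range(uint64_t start, uint64_t len, uint64_t offset, uint64_t bytes)
{
	uint64_t end = start + len;
	uint64_t rm_end = offset + bytes;

	if (offset > start)
		printf("keep front piece  [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)offset);
	if (rm_end < end)
		printf("re-add tail piece [%llu, %llu)\n",
		       (unsigned long long)rm_end, (unsigned long long)end);
	if (rm_end > end)
		printf("continue removing [%llu, %llu) from the next entry\n",
		       (unsigned long long)end, (unsigned long long)rm_end);
}

int main(void)
{
	/* assumed example: 4 KiB removed from the middle of a 1 MiB extent */
	remove_range(0, 1 << 20, 64 << 10, 4 << 10);
	return 0;
}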
@@ -1582,38 +2136,43 @@ out:
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
u64 bytes)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
struct rb_node *n;
int count = 0;
- for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
+ for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
- if (info->bytes >= bytes)
+ if (info->bytes >= bytes && !block_group->ro)
count++;
- printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
- (unsigned long long)info->offset,
- (unsigned long long)info->bytes,
+ btrfs_crit(block_group->fs_info,
+ "entry offset %llu, bytes %llu, bitmap %s",
+ info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
- printk(KERN_INFO "block group has cluster?: %s\n",
+ btrfs_info(block_group->fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
- printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
- "\n", count);
+ btrfs_info(block_group->fs_info,
+ "%d blocks of free space at or bigger than bytes is", count);
}
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
{
- struct btrfs_free_space *info;
- struct rb_node *n;
- u64 ret = 0;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- for (n = rb_first(&block_group->free_space_offset); n;
- n = rb_next(n)) {
- info = rb_entry(n, struct btrfs_free_space, offset_index);
- ret += info->bytes;
- }
+ spin_lock_init(&ctl->tree_lock);
+ ctl->unit = block_group->sectorsize;
+ ctl->start = block_group->key.objectid;
+ ctl->private = block_group;
+ ctl->op = &free_space_op;
- return ret;
+ /*
+ * we only want to have 32k of ram per block group for keeping
+ * track of free space, and if we pass 1/2 of that we want to
+ * start converting things over to using bitmaps
+ */
+ ctl->extents_thresh = ((1024 * 32) / 2) /
+ sizeof(struct btrfs_free_space);
}
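(Illustrative aside, not part of the patch: the 32k budget described in the comment above works out to a concrete entry count. A trivial sketch of the same calculation, assuming a 64-byte struct btrfs_free_space on a 64-bit build:)

#include <stdio.h>

int main(void)
{
	/* assumed entry size; the real value depends on the kernel config */
	unsigned long entry_size = 64;
	unsigned long extents_thresh = ((1024 * 32) / 2) / entry_size;

	/* prints 256: past ~256 extent entries the cache converts to bitmaps */
	printf("extents_thresh = %lu\n", extents_thresh);
	return 0;
}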
/*
@@ -1627,32 +2186,31 @@ __btrfs_return_cluster_to_free_space(
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
struct rb_node *node;
- bool bitmap;
spin_lock(&cluster->lock);
if (cluster->block_group != block_group)
goto out;
- bitmap = cluster->points_to_bitmap;
cluster->block_group = NULL;
cluster->window_start = 0;
list_del_init(&cluster->block_group_list);
- cluster->points_to_bitmap = false;
-
- if (bitmap)
- goto out;
node = rb_first(&cluster->root);
while (node) {
+ bool bitmap;
+
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
- BUG_ON(entry->bitmap);
- try_merge_free_space(block_group, entry, false);
- tree_insert_offset(&block_group->free_space_offset,
- entry->offset, &entry->offset_index, 0);
+
+ bitmap = (entry->bitmap != NULL);
+ if (!bitmap)
+ try_merge_free_space(ctl, entry, false);
+ tree_insert_offset(&ctl->free_space_offset,
+ entry->offset, &entry->offset_index, bitmap);
}
cluster->root = RB_ROOT;
@@ -1662,14 +2220,42 @@ out:
return 0;
}
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+static void __btrfs_remove_free_space_cache_locked(
+ struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
struct rb_node *node;
+
+ while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ free_bitmap(ctl, info);
+ }
+ if (need_resched()) {
+ spin_unlock(&ctl->tree_lock);
+ cond_resched();
+ spin_lock(&ctl->tree_lock);
+ }
+ }
+}
+
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+ spin_lock(&ctl->tree_lock);
+ __btrfs_remove_free_space_cache_locked(ctl);
+ spin_unlock(&ctl->tree_lock);
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_cluster *cluster;
struct list_head *head;
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
while ((head = block_group->cluster_list.next) !=
&block_group->cluster_list) {
cluster = list_entry(head, struct btrfs_free_cluster,
@@ -1678,58 +2264,57 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
WARN_ON(cluster->block_group != block_group);
__btrfs_return_cluster_to_free_space(block_group, cluster);
if (need_resched()) {
- spin_unlock(&block_group->tree_lock);
- cond_resched();
- spin_lock(&block_group->tree_lock);
- }
- }
-
- while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
- info = rb_entry(node, struct btrfs_free_space, offset_index);
- unlink_free_space(block_group, info);
- if (info->bitmap)
- kfree(info->bitmap);
- kfree(info);
- if (need_resched()) {
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
cond_resched();
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
}
}
+ __btrfs_remove_free_space_cache_locked(ctl);
+ spin_unlock(&ctl->tree_lock);
- spin_unlock(&block_group->tree_lock);
}
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes, u64 empty_size)
+ u64 offset, u64 bytes, u64 empty_size,
+ u64 *max_extent_size)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry = NULL;
u64 bytes_search = bytes + empty_size;
u64 ret = 0;
+ u64 align_gap = 0;
+ u64 align_gap_len = 0;
- spin_lock(&block_group->tree_lock);
- entry = find_free_space(block_group, &offset, &bytes_search, 0);
+ spin_lock(&ctl->tree_lock);
+ entry = find_free_space(ctl, &offset, &bytes_search,
+ block_group->full_stripe_len, max_extent_size);
if (!entry)
goto out;
ret = offset;
if (entry->bitmap) {
- bitmap_clear_bits(block_group, entry, offset, bytes);
+ bitmap_clear_bits(ctl, entry, offset, bytes);
if (!entry->bytes)
- free_bitmap(block_group, entry);
+ free_bitmap(ctl, entry);
} else {
- unlink_free_space(block_group, entry);
- entry->offset += bytes;
- entry->bytes -= bytes;
+ unlink_free_space(ctl, entry);
+ align_gap_len = offset - entry->offset;
+ align_gap = entry->offset;
+
+ entry->offset = offset + bytes;
+ WARN_ON(entry->bytes < bytes + align_gap_len);
+
+ entry->bytes -= bytes + align_gap_len;
if (!entry->bytes)
- kfree(entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
else
- link_free_space(block_group, entry);
+ link_free_space(ctl, entry);
}
-
out:
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
+ if (align_gap_len)
+ __btrfs_add_free_space(ctl, align_gap, align_gap_len);
return ret;
}
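(Illustrative aside, not part of the patch: btrfs_find_space_for_alloc() above now rounds the returned offset up to block_group->full_stripe_len, measured from ctl->start, and hands the skipped gap straight back to the free-space tree. The rounding itself is the same arithmetic find_free_space() uses; a standalone sketch with assumed values:)

#include <stdio.h>
#include <stdint.h>

/* round entry_offset up to the next 'align' boundary measured from ctl_start */
static uint64_t align_up_from(uint64_t ctl_start, uint64_t entry_offset, uint64_t align)
{
	uint64_t tmp = entry_offset - ctl_start + align - 1;

	tmp /= align;				/* do_div() in the kernel */
	return tmp * align + ctl_start;
}

int main(void)
{
	/* assumed: 1 GiB block group start, 64 KiB stripe alignment */
	uint64_t ctl_start = 1024ULL << 20;
	uint64_t align = 64 << 10;
	uint64_t entry_offset = ctl_start + 100000;	/* deliberately unaligned */
	uint64_t aligned = align_up_from(ctl_start, entry_offset, align);

	/* the 31072-byte gap in front of 'aligned' is re-added as free space */
	printf("aligned offset +%llu, gap %llu bytes\n",
	       (unsigned long long)(aligned - ctl_start),
	       (unsigned long long)(aligned - entry_offset));
	return 0;
}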
@@ -1745,6 +2330,7 @@ int btrfs_return_cluster_to_free_space(
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster)
{
+ struct btrfs_free_space_ctl *ctl;
int ret;
/* first, get a safe pointer to the block group */
@@ -1763,10 +2349,12 @@ int btrfs_return_cluster_to_free_space(
atomic_inc(&block_group->count);
spin_unlock(&cluster->lock);
+ ctl = block_group->free_space_ctl;
+
/* now return any extents the cluster had on it */
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
/* finally drop our ref */
btrfs_put_block_group(block_group);
@@ -1775,50 +2363,28 @@ int btrfs_return_cluster_to_free_space(
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
- u64 bytes, u64 min_start)
+ struct btrfs_free_space *entry,
+ u64 bytes, u64 min_start,
+ u64 *max_extent_size)
{
- struct btrfs_free_space *entry;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int err;
u64 search_start = cluster->window_start;
u64 search_bytes = bytes;
u64 ret = 0;
- spin_lock(&block_group->tree_lock);
- spin_lock(&cluster->lock);
-
- if (!cluster->points_to_bitmap)
- goto out;
-
- if (cluster->block_group != block_group)
- goto out;
-
- /*
- * search_start is the beginning of the bitmap, but at some point it may
- * be a good idea to point to the actual start of the free area in the
- * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
- * to 1 to make sure we get the bitmap entry
- */
- entry = tree_search_offset(block_group,
- offset_to_bitmap(block_group, search_start),
- 1, 0);
- if (!entry || !entry->bitmap)
- goto out;
-
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(block_group, entry, &search_start,
- &search_bytes);
- if (err)
- goto out;
+ err = search_bitmap(ctl, entry, &search_start, &search_bytes);
+ if (err) {
+ if (search_bytes > *max_extent_size)
+ *max_extent_size = search_bytes;
+ return 0;
+ }
ret = search_start;
- bitmap_clear_bits(block_group, entry, ret, bytes);
- if (entry->bytes == 0)
- free_bitmap(block_group, entry);
-out:
- spin_unlock(&cluster->lock);
- spin_unlock(&block_group->tree_lock);
+ __bitmap_clear_bits(ctl, entry, ret, bytes);
return ret;
}
@@ -1830,16 +2396,13 @@ out:
*/
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
- u64 min_start)
+ u64 min_start, u64 *max_extent_size)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry = NULL;
struct rb_node *node;
u64 ret = 0;
- if (cluster->points_to_bitmap)
- return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
- min_start);
-
spin_lock(&cluster->lock);
if (bytes > cluster->max_size)
goto out;
@@ -1852,11 +2415,12 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
goto out;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ while (1) {
+ if (entry->bytes < bytes && entry->bytes > *max_extent_size)
+ *max_extent_size = entry->bytes;
- while(1) {
- if (entry->bytes < bytes || entry->offset < min_start) {
- struct rb_node *node;
-
+ if (entry->bytes < bytes ||
+ (!entry->bitmap && entry->offset < min_start)) {
node = rb_next(&entry->offset_index);
if (!node)
break;
@@ -1864,10 +2428,27 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
offset_index);
continue;
}
- ret = entry->offset;
- entry->offset += bytes;
- entry->bytes -= bytes;
+ if (entry->bitmap) {
+ ret = btrfs_alloc_from_bitmap(block_group,
+ cluster, entry, bytes,
+ cluster->window_start,
+ max_extent_size);
+ if (ret == 0) {
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ }
+ cluster->window_start += bytes;
+ } else {
+ ret = entry->offset;
+
+ entry->offset += bytes;
+ entry->bytes -= bytes;
+ }
if (entry->bytes == 0)
rb_erase(&entry->offset_index, &cluster->root);
@@ -1879,15 +2460,20 @@ out:
if (!ret)
return 0;
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
- block_group->free_space -= bytes;
+ ctl->free_space -= bytes;
if (entry->bytes == 0) {
- block_group->free_extents--;
- kfree(entry);
+ ctl->free_extents--;
+ if (entry->bitmap) {
+ kfree(entry->bitmap);
+ ctl->total_bitmaps--;
+ ctl->op->recalc_thresholds(ctl);
+ }
+ kmem_cache_free(btrfs_free_space_cachep, entry);
}
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
return ret;
}
@@ -1895,30 +2481,30 @@ out:
static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
- u64 offset, u64 bytes, u64 min_bytes)
+ u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
unsigned long next_zero;
unsigned long i;
- unsigned long search_bits;
- unsigned long total_bits;
+ unsigned long want_bits;
+ unsigned long min_bits;
unsigned long found_bits;
unsigned long start = 0;
unsigned long total_found = 0;
- bool found = false;
+ int ret;
- i = offset_to_bit(entry->offset, block_group->sectorsize,
+ i = offset_to_bit(entry->offset, ctl->unit,
max_t(u64, offset, entry->offset));
- search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
- total_bits = bytes_to_bits(bytes, block_group->sectorsize);
+ want_bits = bytes_to_bits(bytes, ctl->unit);
+ min_bits = bytes_to_bits(min_bytes, ctl->unit);
again:
found_bits = 0;
- for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
- i < BITS_PER_BITMAP;
- i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+ for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
next_zero = find_next_zero_bit(entry->bitmap,
BITS_PER_BITMAP, i);
- if (next_zero - i >= search_bits) {
+ if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
break;
}
@@ -1926,77 +2512,218 @@ again:
}
if (!found_bits)
- return -1;
+ return -ENOSPC;
- if (!found) {
+ if (!total_found) {
start = i;
- found = true;
+ cluster->max_size = 0;
}
total_found += found_bits;
- if (cluster->max_size < found_bits * block_group->sectorsize)
- cluster->max_size = found_bits * block_group->sectorsize;
+ if (cluster->max_size < found_bits * ctl->unit)
+ cluster->max_size = found_bits * ctl->unit;
- if (total_found < total_bits) {
- i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
- if (i - start > total_bits * 2) {
- total_found = 0;
- cluster->max_size = 0;
- found = false;
- }
+ if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+ i = next_zero + 1;
goto again;
}
- cluster->window_start = start * block_group->sectorsize +
- entry->offset;
- cluster->points_to_bitmap = true;
+ cluster->window_start = start * ctl->unit + entry->offset;
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 1);
+ ASSERT(!ret); /* -EEXIST; Logic error */
+
+ trace_btrfs_setup_cluster(block_group, cluster,
+ total_found * ctl->unit, 1);
+ return 0;
+}
+
+/*
+ * This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least 'bytes' total bytes, at least one
+ * extent of cont1_bytes, and other extents of at least min_bytes.
+ */
+static noinline int
+setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ struct list_head *bitmaps, u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *first = NULL;
+ struct btrfs_free_space *entry = NULL;
+ struct btrfs_free_space *last;
+ struct rb_node *node;
+ u64 window_free;
+ u64 max_extent;
+ u64 total_size = 0;
+
+ entry = tree_search_offset(ctl, offset, 0, 1);
+ if (!entry)
+ return -ENOSPC;
+
+ /*
+ * We don't want bitmaps, so just move along until we find a normal
+ * extent entry.
+ */
+ while (entry->bitmap || entry->bytes < min_bytes) {
+ if (entry->bitmap && list_empty(&entry->list))
+ list_add_tail(&entry->list, bitmaps);
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ return -ENOSPC;
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ }
+
+ window_free = entry->bytes;
+ max_extent = entry->bytes;
+ first = entry;
+ last = entry;
+
+ for (node = rb_next(&entry->offset_index); node;
+ node = rb_next(&entry->offset_index)) {
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ if (entry->bitmap) {
+ if (list_empty(&entry->list))
+ list_add_tail(&entry->list, bitmaps);
+ continue;
+ }
+
+ if (entry->bytes < min_bytes)
+ continue;
+
+ last = entry;
+ window_free += entry->bytes;
+ if (entry->bytes > max_extent)
+ max_extent = entry->bytes;
+ }
+
+ if (window_free < bytes || max_extent < cont1_bytes)
+ return -ENOSPC;
+
+ cluster->window_start = first->offset;
+
+ node = &first->offset_index;
+
+ /*
+ * now we've found our entries, pull them out of the free space
+ * cache and put them into the cluster rbtree
+ */
+ do {
+ int ret;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ node = rb_next(&entry->offset_index);
+ if (entry->bitmap || entry->bytes < min_bytes)
+ continue;
+
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 0);
+ total_size += entry->bytes;
+ ASSERT(!ret); /* -EEXIST; Logic error */
+ } while (node && entry != last);
+ cluster->max_size = max_extent;
+ trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
return 0;
}
/*
+ * This specifically looks for bitmaps that may work in the cluster; we assume
+ * that we have already failed to find extents that will work.
+ */
+static noinline int
+setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ struct list_head *bitmaps, u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ int ret = -ENOSPC;
+ u64 bitmap_offset = offset_to_bitmap(ctl, offset);
+
+ if (ctl->total_bitmaps == 0)
+ return -ENOSPC;
+
+ /*
+ * The bitmap that covers offset won't be in the list unless offset
+ * is just its start offset.
+ */
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+ if (entry->offset != bitmap_offset) {
+ entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (entry && list_empty(&entry->list))
+ list_add(&entry->list, bitmaps);
+ }
+
+ list_for_each_entry(entry, bitmaps, list) {
+ if (entry->bytes < bytes)
+ continue;
+ ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
+ bytes, cont1_bytes, min_bytes);
+ if (!ret)
+ return 0;
+ }
+
+ /*
+ * The bitmaps list has all the bitmaps that record free space
+ * starting after offset, so no more search is required.
+ */
+ return -ENOSPC;
+}
+
+/*
* here we try to find a cluster of blocks in a block group. The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
* We might not find them all in one contiguous area.
*
* returns zero and sets up cluster if things worked out, otherwise
* it returns -enospc
*/
-int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_find_space_cluster(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size)
{
- struct btrfs_free_space *entry = NULL;
- struct rb_node *node;
- struct btrfs_free_space *next;
- struct btrfs_free_space *last = NULL;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry, *tmp;
+ LIST_HEAD(bitmaps);
u64 min_bytes;
- u64 window_start;
- u64 window_free;
- u64 max_extent = 0;
- bool found_bitmap = false;
+ u64 cont1_bytes;
int ret;
- /* for metadata, allow allocates with more holes */
+ /*
+ * Choose the minimum extent size we'll require for this
+ * cluster. For SSD_SPREAD, don't allow any fragmentation.
+	 * For metadata, allow allocations with smaller extents. For
+ * data, keep it dense.
+ */
if (btrfs_test_opt(root, SSD_SPREAD)) {
- min_bytes = bytes + empty_size;
+ cont1_bytes = min_bytes = bytes + empty_size;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
- /*
- * we want to do larger allocations when we are
- * flushing out the delayed refs, it helps prevent
- * making more work as we go along.
- */
- if (trans->transaction->delayed_refs.flushing)
- min_bytes = max(bytes, (bytes + empty_size) >> 1);
- else
- min_bytes = max(bytes, (bytes + empty_size) >> 4);
- } else
- min_bytes = max(bytes, (bytes + empty_size) >> 2);
+ cont1_bytes = bytes;
+ min_bytes = block_group->sectorsize;
+ } else {
+ cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+ min_bytes = block_group->sectorsize;
+ }
+
+ spin_lock(&ctl->tree_lock);
+
+ /*
+ * If we know we don't have enough space to make a cluster don't even
+ * bother doing all the work to try and find one.
+ */
+ if (ctl->free_space < bytes) {
+ spin_unlock(&ctl->tree_lock);
+ return -ENOSPC;
+ }
- spin_lock(&block_group->tree_lock);
spin_lock(&cluster->lock);
/* someone already found a cluster, hooray */
@@ -2004,153 +2731,547 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
ret = 0;
goto out;
}
-again:
- entry = tree_search_offset(block_group, offset, found_bitmap, 1);
- if (!entry) {
- ret = -ENOSPC;
- goto out;
+
+ trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+ min_bytes);
+
+ INIT_LIST_HEAD(&bitmaps);
+ ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
+ bytes + empty_size,
+ cont1_bytes, min_bytes);
+ if (ret)
+ ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
+ offset, bytes + empty_size,
+ cont1_bytes, min_bytes);
+
+ /* Clear our temporary list */
+ list_for_each_entry_safe(entry, tmp, &bitmaps, list)
+ list_del_init(&entry->list);
+
+ if (!ret) {
+ atomic_inc(&block_group->count);
+ list_add_tail(&cluster->block_group_list,
+ &block_group->cluster_list);
+ cluster->block_group = block_group;
+ } else {
+ trace_btrfs_failed_cluster_setup(block_group);
}
+out:
+ spin_unlock(&cluster->lock);
+ spin_unlock(&ctl->tree_lock);
- /*
- * If found_bitmap is true, we exhausted our search for extent entries,
- * and we just want to search all of the bitmaps that we can find, and
- * ignore any extent entries we find.
- */
- while (entry->bitmap || found_bitmap ||
- (!entry->bitmap && entry->bytes < min_bytes)) {
- struct rb_node *node = rb_next(&entry->offset_index);
-
- if (entry->bitmap && entry->bytes > bytes + empty_size) {
- ret = btrfs_bitmap_cluster(block_group, entry, cluster,
- offset, bytes + empty_size,
- min_bytes);
- if (!ret)
- goto got_it;
- }
+ return ret;
+}
- if (!node) {
- ret = -ENOSPC;
- goto out;
- }
- entry = rb_entry(node, struct btrfs_free_space, offset_index);
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+ spin_lock_init(&cluster->lock);
+ spin_lock_init(&cluster->refill_lock);
+ cluster->root = RB_ROOT;
+ cluster->max_size = 0;
+ INIT_LIST_HEAD(&cluster->block_group_list);
+ cluster->block_group = NULL;
+}
+
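+/*
+ * Discard one free range and hand it back to the free space cache. The
+ * caller has already removed the range from the cache; it is accounted as
+ * reserved in the block group while the discard runs so the space accounting
+ * stays consistent, and it is re-added to the cache once the discard is done.
+ */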
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 bytes,
+ u64 reserved_start, u64 reserved_bytes)
+{
+ struct btrfs_space_info *space_info = block_group->space_info;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ int ret;
+ int update = 0;
+ u64 trimmed = 0;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (!block_group->ro) {
+ block_group->reserved += reserved_bytes;
+ space_info->bytes_reserved += reserved_bytes;
+ update = 1;
}
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
- /*
- * We already searched all the extent entries from the passed in offset
- * to the end and didn't find enough space for the cluster, and we also
- * didn't find any bitmaps that met our criteria, just go ahead and exit
- */
- if (found_bitmap) {
- ret = -ENOSPC;
- goto out;
+ ret = btrfs_error_discard_extent(fs_info->extent_root,
+ start, bytes, &trimmed);
+ if (!ret)
+ *total_trimmed += trimmed;
+
+ btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+ if (update) {
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (block_group->ro)
+ space_info->bytes_readonly += reserved_bytes;
+ block_group->reserved -= reserved_bytes;
+ space_info->bytes_reserved -= reserved_bytes;
+ spin_unlock(&space_info->lock);
+ spin_unlock(&block_group->lock);
}
- cluster->points_to_bitmap = false;
- window_start = entry->offset;
- window_free = entry->bytes;
- last = entry;
- max_extent = entry->bytes;
+ return ret;
+}
- while (1) {
- /* out window is just right, lets fill it */
- if (window_free >= bytes + empty_size)
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret = 0;
+ u64 extent_start;
+ u64 extent_bytes;
+ u64 bytes;
+
+ while (start < end) {
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen) {
+ spin_unlock(&ctl->tree_lock);
break;
+ }
- node = rb_next(&last->offset_index);
- if (!node) {
- if (found_bitmap)
- goto again;
- ret = -ENOSPC;
- goto out;
+ entry = tree_search_offset(ctl, start, 0, 1);
+ if (!entry) {
+ spin_unlock(&ctl->tree_lock);
+ break;
}
- next = rb_entry(node, struct btrfs_free_space, offset_index);
- /*
- * we found a bitmap, so if this search doesn't result in a
- * cluster, we know to go and search again for the bitmaps and
- * start looking for space there
- */
- if (next->bitmap) {
- if (!found_bitmap)
- offset = next->offset;
- found_bitmap = true;
- last = next;
- continue;
+ /* skip bitmaps */
+ while (entry->bitmap) {
+ node = rb_next(&entry->offset_index);
+ if (!node) {
+ spin_unlock(&ctl->tree_lock);
+ goto out;
+ }
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
}
- /*
- * we haven't filled the empty size and the window is
- * very large. reset and try again
- */
- if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
- next->offset - window_start > (bytes + empty_size) * 2) {
- entry = next;
- window_start = entry->offset;
- window_free = entry->bytes;
- last = entry;
- max_extent = entry->bytes;
- } else {
- last = next;
- window_free += next->bytes;
- if (entry->bytes > max_extent)
- max_extent = entry->bytes;
+ if (entry->offset >= end) {
+ spin_unlock(&ctl->tree_lock);
+ break;
+ }
+
+ extent_start = entry->offset;
+ extent_bytes = entry->bytes;
+ start = max(start, extent_start);
+ bytes = min(extent_start + extent_bytes, end) - start;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ goto next;
+ }
+
+ unlink_free_space(ctl, entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+
+ spin_unlock(&ctl->tree_lock);
+
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ extent_start, extent_bytes);
+ if (ret)
+ break;
+next:
+ start += bytes;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
}
+
+ cond_resched();
}
+out:
+ return ret;
+}
- cluster->window_start = entry->offset;
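+/*
+ * Like trim_no_bitmap() above, but for bitmap entries. Each bitmap covers
+ * BITS_PER_BITMAP * ctl->unit bytes, so when a bitmap has nothing left to
+ * trim in the requested range the search skips ahead by that stride.
+ */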
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ int ret = 0;
+ int ret2;
+ u64 bytes;
+ u64 offset = offset_to_bitmap(ctl, start);
- /*
- * now we've found our entries, pull them out of the free space
- * cache and put them into the cluster rbtree
- *
- * The cluster includes an rbtree, but only uses the offset index
- * of each free space cache entry.
- */
- while (1) {
- node = rb_next(&entry->offset_index);
- if (entry->bitmap && node) {
- entry = rb_entry(node, struct btrfs_free_space,
- offset_index);
- continue;
- } else if (entry->bitmap && !node) {
+ while (offset < end) {
+ bool next_bitmap = false;
+
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen) {
+ spin_unlock(&ctl->tree_lock);
break;
}
- rb_erase(&entry->offset_index, &block_group->free_space_offset);
- ret = tree_insert_offset(&cluster->root, entry->offset,
- &entry->offset_index, 0);
- BUG_ON(ret);
+ entry = tree_search_offset(ctl, offset, 1, 0);
+ if (!entry) {
+ spin_unlock(&ctl->tree_lock);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = minlen;
+ ret2 = search_bitmap(ctl, entry, &start, &bytes);
+ if (ret2 || start >= end) {
+ spin_unlock(&ctl->tree_lock);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = min(bytes, end - start);
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ goto next;
+ }
+
+ bitmap_clear_bits(ctl, entry, start, bytes);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+
+ spin_unlock(&ctl->tree_lock);
+
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ start, bytes);
+ if (ret)
+ break;
+next:
+ if (next_bitmap) {
+ offset += BITS_PER_BITMAP * ctl->unit;
+ } else {
+ start += bytes;
+ if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+ offset += BITS_PER_BITMAP * ctl->unit;
+ }
- if (!node || entry == last)
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
break;
+ }
- entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ cond_resched();
}
- cluster->max_size = max_extent;
-got_it:
- ret = 0;
- atomic_inc(&block_group->count);
- list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
- cluster->block_group = block_group;
+ return ret;
+}
+
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+ if (ret)
+ return ret;
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+
+ return ret;
+}
+
+/*
+ * Find the left-most item in the cache tree, and then return the
+ * smallest inode number in the item.
+ *
+ * Note: the returned inode number may not be the smallest one in
+ * the tree, if the left-most item is a bitmap.
+ */
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
+{
+ struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
+ struct btrfs_free_space *entry = NULL;
+ u64 ino = 0;
+
+ spin_lock(&ctl->tree_lock);
+
+ if (RB_EMPTY_ROOT(&ctl->free_space_offset))
+ goto out;
+
+ entry = rb_entry(rb_first(&ctl->free_space_offset),
+ struct btrfs_free_space, offset_index);
+
+ if (!entry->bitmap) {
+ ino = entry->offset;
+
+ unlink_free_space(ctl, entry);
+ entry->offset++;
+ entry->bytes--;
+ if (!entry->bytes)
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ else
+ link_free_space(ctl, entry);
+ } else {
+ u64 offset = 0;
+ u64 count = 1;
+ int ret;
+
+ ret = search_bitmap(ctl, entry, &offset, &count);
+ /* Logic error; Should be empty if it can't find anything */
+ ASSERT(!ret);
+
+ ino = offset;
+ bitmap_clear_bits(ctl, entry, offset, 1);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+ }
out:
- spin_unlock(&cluster->lock);
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
+
+ return ino;
+}
+
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ struct inode *inode = NULL;
+
+ spin_lock(&root->cache_lock);
+ if (root->cache_inode)
+ inode = igrab(root->cache_inode);
+ spin_unlock(&root->cache_lock);
+ if (inode)
+ return inode;
+
+ inode = __lookup_free_space_inode(root, path, 0);
+ if (IS_ERR(inode))
+ return inode;
+
+ spin_lock(&root->cache_lock);
+ if (!btrfs_fs_closing(root->fs_info))
+ root->cache_inode = igrab(inode);
+ spin_unlock(&root->cache_lock);
+
+ return inode;
+}
+
+int create_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ return __create_free_space_inode(root, trans, path,
+ BTRFS_FREE_INO_OBJECTID, 0);
+}
+
+int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_path *path;
+ struct inode *inode;
+ int ret = 0;
+ u64 root_gen = btrfs_root_generation(&root->root_item);
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
+ /*
+ * If we're unmounting then just return, since this does a search on the
+ * normal root and not the commit root and we could deadlock.
+ */
+ if (btrfs_fs_closing(fs_info))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return 0;
+
+ inode = lookup_free_ino_inode(root, path);
+ if (IS_ERR(inode))
+ goto out;
+
+ if (root_gen != BTRFS_I(inode)->generation)
+ goto out_put;
+
+ ret = __load_free_space_cache(root, inode, ctl, path, 0);
+
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "failed to load free ino cache for root %llu",
+ root->root_key.objectid);
+out_put:
+ iput(inode);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct inode *inode)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ int ret;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
+ ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
+ if (ret) {
+ btrfs_delalloc_release_metadata(inode, inode->i_size);
+#ifdef DEBUG
+ btrfs_err(root->fs_info,
+ "failed to write free ino cache for root %llu",
+ root->root_key.objectid);
+#endif
+ }
return ret;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
- * simple code to zero out a cluster
+ * Use this if you need to make a bitmap or extent entry specifically.  It
+ * doesn't do any of the merging that add_free_space does; it acts a lot like
+ * how the free space cache loading works, so you can produce really weird
+ * configurations.
*/
-void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+ u64 offset, u64 bytes, bool bitmap)
{
- spin_lock_init(&cluster->lock);
- spin_lock_init(&cluster->refill_lock);
- cluster->root = RB_ROOT;
- cluster->max_size = 0;
- cluster->points_to_bitmap = false;
- INIT_LIST_HEAD(&cluster->block_group_list);
- cluster->block_group = NULL;
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *info = NULL, *bitmap_info;
+ void *map = NULL;
+ u64 bytes_added;
+ int ret;
+
+again:
+ if (!info) {
+ info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+ if (!info)
+ return -ENOMEM;
+ }
+
+ if (!bitmap) {
+ spin_lock(&ctl->tree_lock);
+ info->offset = offset;
+ info->bytes = bytes;
+ ret = link_free_space(ctl, info);
+ spin_unlock(&ctl->tree_lock);
+ if (ret)
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ return ret;
+ }
+
+ if (!map) {
+ map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!map) {
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ return -ENOMEM;
+ }
+ }
+
+ spin_lock(&ctl->tree_lock);
+ bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!bitmap_info) {
+ info->bitmap = map;
+ map = NULL;
+ add_new_bitmap(ctl, info, offset);
+ bitmap_info = info;
+ }
+
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+ bytes -= bytes_added;
+ offset += bytes_added;
+ spin_unlock(&ctl->tree_lock);
+
+ if (bytes)
+ goto again;
+
+ if (map)
+ kfree(map);
+ return 0;
}
+/*
+ * Checks to see if the given range is in the free space cache. This is really
+ * just used to check the absence of space, so if there is free space in the
+ * range at all we will return 1.
+ */
+int test_check_exists(struct btrfs_block_group_cache *cache,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *info;
+ int ret = 0;
+
+ spin_lock(&ctl->tree_lock);
+ info = tree_search_offset(ctl, offset, 0, 0);
+ if (!info) {
+ info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!info)
+ goto out;
+ }
+
+have_info:
+ if (info->bitmap) {
+ u64 bit_off, bit_bytes;
+ struct rb_node *n;
+ struct btrfs_free_space *tmp;
+
+ bit_off = offset;
+ bit_bytes = ctl->unit;
+ ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+ if (!ret) {
+ if (bit_off == offset) {
+ ret = 1;
+ goto out;
+ } else if (bit_off > offset &&
+ offset + bytes > bit_off) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+ n = rb_prev(&info->offset_index);
+ while (n) {
+ tmp = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (tmp->offset + tmp->bytes < offset)
+ break;
+ if (offset + bytes < tmp->offset) {
+ n = rb_prev(&info->offset_index);
+ continue;
+ }
+ info = tmp;
+ goto have_info;
+ }
+
+ n = rb_next(&info->offset_index);
+ while (n) {
+ tmp = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (offset + bytes < tmp->offset)
+ break;
+ if (tmp->offset + tmp->bytes < offset) {
+ n = rb_next(&info->offset_index);
+ continue;
+ }
+ info = tmp;
+ goto have_info;
+ }
+
+ goto out;
+ }
+
+ if (info->offset == offset) {
+ ret = 1;
+ goto out;
+ }
+
+ if (offset > info->offset && offset < info->offset + info->bytes)
+ ret = 1;
+out:
+ spin_unlock(&ctl->tree_lock);
+ return ret;
+}
+#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e49ca5c321b..0cf4977ef70 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,25 @@ struct btrfs_free_space {
struct list_head list;
};
+struct btrfs_free_space_ctl {
+ spinlock_t tree_lock;
+ struct rb_root free_space_offset;
+ u64 free_space;
+ int extents_thresh;
+ int free_extents;
+ int total_bitmaps;
+ int unit;
+ u64 start;
+ struct btrfs_free_space_op *op;
+ void *private;
+};
+
+struct btrfs_free_space_op {
+ void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
+ bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
+};
+
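+/*
+ * The owner of a ctl plugs its sizing policy in through the op table.
+ * Illustrative sketch only (my_recalc and my_use_bitmap are made-up names):
+ *
+ *	static struct btrfs_free_space_op my_op = {
+ *		.recalc_thresholds	= my_recalc,
+ *		.use_bitmap		= my_use_bitmap,
+ *	};
+ *	ctl->op = &my_op;
+ *
+ * In this series the free-ino cache in inode-map.c wires up free_ino_op and
+ * pinned_free_ino_op this way.
+ */
+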
struct inode *lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_block_group_cache
*block_group, struct btrfs_path *path);
@@ -35,9 +54,10 @@ int create_free_space_inode(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
struct inode *inode);
int load_free_space_cache(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
@@ -45,27 +65,60 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
- u64 bytenr, u64 size);
+
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_path *path);
+int create_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path);
+int load_free_ino_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct inode *inode);
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+ u64 bytenr, u64 size);
+static inline int
+btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size)
+{
+ return __btrfs_add_free_space(block_group->free_space_ctl,
+ bytenr, size);
+}
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
u64 bytenr, u64 size);
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
- *block_group);
+ *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes, u64 empty_size);
+ u64 offset, u64 bytes, u64 empty_size,
+ u64 *max_extent_size);
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
-int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_find_space_cluster(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size);
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
- u64 min_start);
+ u64 min_start, u64 *max_extent_size);
int btrfs_return_cluster_to_free_space(
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster);
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen);
+
+/* Support functions for running our sanity tests */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+ u64 offset, u64 bytes, bool bitmap);
+int test_check_exists(struct btrfs_block_group_cache *cache,
+ u64 offset, u64 bytes);
+#endif
+
#endif
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
new file mode 100644
index 00000000000..85889aa82c6
--- /dev/null
+++ b/fs/btrfs/hash.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include "hash.h"
+
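+/* One crc32c transform shared by all callers, allocated in btrfs_hash_init(). */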
+static struct crypto_shash *tfm;
+
+int __init btrfs_hash_init(void)
+{
+ tfm = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ return 0;
+}
+
+void btrfs_hash_exit(void)
+{
+ crypto_free_shash(tfm);
+}
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
+{
+ struct {
+ struct shash_desc shash;
+ char ctx[crypto_shash_descsize(tfm)];
+ } desc;
+ int err;
+
+ desc.shash.tfm = tfm;
+ desc.shash.flags = 0;
+ *(u32 *)desc.ctx = crc;
+
+ err = crypto_shash_update(&desc.shash, address, length);
+ BUG_ON(err);
+
+ return *(u32 *)desc.ctx;
+}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b9..118a2316e5d 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -19,9 +19,24 @@
#ifndef __HASH__
#define __HASH__
-#include <linux/crc32c.h>
+int __init btrfs_hash_init(void);
+
+void btrfs_hash_exit(void);
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
+
static inline u64 btrfs_name_hash(const char *name, int len)
{
- return crc32c((u32)~1, name, len);
+ return btrfs_crc32c((u32)~1, name, len);
}
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+ int len)
+{
+ return (u64) btrfs_crc32c(parent_objectid, name, len);
+}
+
#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 64f1150bb48..2be38df703c 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,7 +18,9 @@
#include "ctree.h"
#include "disk-io.h"
+#include "hash.h"
#include "transaction.h"
+#include "print-tree.h"
static int find_name_in_backref(struct btrfs_path *path, const char *name,
int name_len, struct btrfs_inode_ref **ref_ret)
@@ -49,37 +51,149 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
return 0;
}
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int mod)
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+ const char *name, int name_len,
+ struct btrfs_inode_extref **extref_ret)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_inode_extref *extref;
+ unsigned long ptr;
+ unsigned long name_ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int ref_name_len;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ /*
+ * Search all extended backrefs in this item. We're only
+ * looking through any collisions so most of the time this is
+ * just going to compare against one buffer. If all is well,
+ * we'll return success and the inode ref object.
+ */
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+ name_ptr = (unsigned long)(&extref->name);
+ ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (ref_name_len == name_len &&
+ btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+ (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
+ if (extref_ret)
+ *extref_ret = extref;
+ return 1;
+ }
+
+ cur_offset += ref_name_len + sizeof(*extref);
+ }
+ return 0;
+}
+
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow)
{
- struct btrfs_key key;
- struct btrfs_inode_ref *ref;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
int ret;
+ struct btrfs_key key;
+ struct btrfs_inode_extref *extref;
key.objectid = inode_objectid;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = ref_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
return NULL;
- if (!find_name_in_backref(path, name, name_len, &ref))
+ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
return NULL;
- return ref;
+ return extref;
+}
+
+static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid,
+ u64 *index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ int ret;
+ int del_len = name_len + sizeof(*extref);
+ unsigned long ptr;
+ unsigned long item_start;
+ u32 item_size;
+
+ key.objectid = inode_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Sanity check - did we find the right item for this name?
+ * This should always succeed so error here will make the FS
+ * readonly.
+ */
+ if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, name_len, &extref)) {
+ btrfs_std_error(root->fs_info, -ENOENT);
+ ret = -EROFS;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (index)
+ *index = btrfs_inode_extref_index(leaf, extref);
+
+ if (del_len == item_size) {
+ /*
+		 * Common case: only one ref in the item, remove the
+ * whole item.
+ */
+ ret = btrfs_del_item(trans, root, path);
+ goto out;
+ }
+
+ ptr = (unsigned long)extref;
+ item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ memmove_extent_buffer(leaf, ptr, ptr + del_len,
+ item_size - (ptr + del_len - item_start));
+
+ btrfs_truncate_item(root, path, item_size - del_len, 1);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
}
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 *index)
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index)
{
struct btrfs_path *path;
struct btrfs_key key;
@@ -90,6 +204,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
u32 item_size;
u32 sub_item_len;
int ret;
+ int search_ext_refs = 0;
int del_len = name_len + sizeof(*ref);
key.objectid = inode_objectid;
@@ -105,12 +220,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = -ENOENT;
+ search_ext_refs = 1;
goto out;
} else if (ret < 0) {
goto out;
}
if (!find_name_in_backref(path, name, name_len, &ref)) {
ret = -ENOENT;
+ search_ext_refs = 1;
goto out;
}
leaf = path->nodes[0];
@@ -128,14 +245,84 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- ret = btrfs_truncate_item(trans, root, path,
- item_size - sub_item_len, 1);
- BUG_ON(ret);
+ btrfs_truncate_item(root, path, item_size - sub_item_len, 1);
+out:
+ btrfs_free_path(path);
+
+ if (search_ext_refs) {
+ /*
+ * No refs were found, or we could not find the
+ * name in our ref array. Find and remove the extended
+ * inode ref then.
+ */
+ return btrfs_del_inode_extref(trans, root, name, name_len,
+ inode_objectid, ref_objectid, index);
+ }
+
+ return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+ struct btrfs_inode_extref *extref;
+ int ret;
+ int ins_len = name_len + sizeof(*extref);
+ unsigned long ptr;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ ins_len);
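+	/*
+	 * -EEXIST means another ref hashed to the same key offset. If the
+	 * exact name is already present we return -EEXIST as-is; otherwise
+	 * we extend the item and append this ref after the existing ones.
+	 */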
+ if (ret == -EEXIST) {
+ if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, name_len, NULL))
+ goto out;
+
+ btrfs_extend_item(root, path, ins_len);
+ ret = 0;
+ }
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_nr(path->slots[0]);
+ ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+ ptr += btrfs_item_size(leaf, item) - ins_len;
+ extref = (struct btrfs_inode_extref *)ptr;
+
+ btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+ btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+ btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+ ptr = (unsigned long)&extref->name;
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
out:
btrfs_free_path(path);
return ret;
}
+/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -166,8 +353,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
goto out;
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- ret = btrfs_extend_item(trans, root, path, ins_len);
- BUG_ON(ret);
+ btrfs_extend_item(root, path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
@@ -191,6 +377,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
+
+ if (ret == -EMLINK) {
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+ /* We ran out of space in the ref array. Need to
+ * add an extended ref. */
+ if (btrfs_super_incompat_flags(disk_super)
+ & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+ ret = btrfs_insert_inode_extref(trans, root, name,
+ name_len,
+ inode_objectid,
+ ref_objectid, index);
+ }
+
return ret;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c56eb590917..888fbe19079 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -16,11 +16,497 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+
#include "ctree.h"
#include "disk-io.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
#include "transaction.h"
-int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
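+/*
+ * Background worker that fills the free-ino cache. It walks the inode items
+ * in the commit root and records every gap between allocated inode numbers
+ * as free space in root->free_ino_ctl, dropping commit_root_sem and yielding
+ * whenever a transaction commit is waiting.
+ */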
+static int caching_kthread(void *data)
+{
+ struct btrfs_root *root = data;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ u64 last = (u64)-1;
+ int slot;
+ int ret;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Since the commit root is read-only, we can safely skip locking. */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = 2;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.offset = 0;
+ key.type = BTRFS_INODE_ITEM_KEY;
+again:
+ /* need to make sure the commit_root doesn't disappear */
+ down_read(&fs_info->commit_root_sem);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ if (btrfs_fs_closing(fs_info))
+ goto out;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+
+ if (need_resched() ||
+ btrfs_transaction_in_commit(fs_info)) {
+ leaf = path->nodes[0];
+
+ if (WARN_ON(btrfs_header_nritems(leaf) == 0))
+ break;
+
+ /*
+				 * Save the key so we can advance forward
+ * in the next search.
+ */
+ btrfs_item_key_to_cpu(leaf, &key, 0);
+ btrfs_release_path(path);
+ root->cache_progress = last;
+ up_read(&fs_info->commit_root_sem);
+ schedule_timeout(1);
+ goto again;
+ } else
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.type != BTRFS_INODE_ITEM_KEY)
+ goto next;
+
+ if (key.objectid >= root->highest_objectid)
+ break;
+
+ if (last != (u64)-1 && last + 1 != key.objectid) {
+ __btrfs_add_free_space(ctl, last + 1,
+ key.objectid - last - 1);
+ wake_up(&root->cache_wait);
+ }
+
+ last = key.objectid;
+next:
+ path->slots[0]++;
+ }
+
+ if (last < root->highest_objectid - 1) {
+ __btrfs_add_free_space(ctl, last + 1,
+ root->highest_objectid - last - 1);
+ }
+
+ spin_lock(&root->cache_lock);
+ root->cached = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->cache_lock);
+
+ root->cache_progress = (u64)-1;
+ btrfs_unpin_free_ino(root);
+out:
+ wake_up(&root->cache_wait);
+ up_read(&fs_info->commit_root_sem);
+
+ btrfs_free_path(path);
+
+ return ret;
+}
+
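+/*
+ * Kick off caching for this root: try to load the on-disk free-ino cache
+ * first and fall back to the caching kthread if that fails. The range above
+ * the current highest objectid is published right away so callers of
+ * btrfs_find_free_ino() don't have to wait for the scan to finish.
+ */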
+static void start_caching(struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct task_struct *tsk;
+ int ret;
+ u64 objectid;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return;
+
+ spin_lock(&root->cache_lock);
+ if (root->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&root->cache_lock);
+ return;
+ }
+
+ root->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&root->cache_lock);
+
+ ret = load_free_ino_cache(root->fs_info, root);
+ if (ret == 1) {
+ spin_lock(&root->cache_lock);
+ root->cached = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->cache_lock);
+ return;
+ }
+
+ /*
+ * It can be quite time-consuming to fill the cache by searching
+	 * through the extent tree, and this can keep the ino allocation path
+ * waiting. Therefore at start we quickly find out the highest
+ * inode number and we know we can use inode numbers which fall in
+ * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
+ */
+ ret = btrfs_find_free_objectid(root, &objectid);
+ if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
+ __btrfs_add_free_space(ctl, objectid,
+ BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+ }
+
+ tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
+ root->root_key.objectid);
+ if (IS_ERR(tsk)) {
+ btrfs_warn(root->fs_info, "failed to start inode caching task");
+ btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ "disabling inode map caching");
+ }
+}
+
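+/*
+ * Hand out the next free inode number. If the cache can't satisfy the
+ * request yet, start caching and wait until either some free space shows up
+ * or caching finishes; -ENOSPC is returned only once the cache is complete
+ * and really empty.
+ */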
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
+{
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return btrfs_find_free_objectid(root, objectid);
+
+again:
+ *objectid = btrfs_find_ino_for_alloc(root);
+
+ if (*objectid != 0)
+ return 0;
+
+ start_caching(root);
+
+ wait_event(root->cache_wait,
+ root->cached == BTRFS_CACHE_FINISHED ||
+ root->free_ino_ctl->free_space > 0);
+
+ if (root->cached == BTRFS_CACHE_FINISHED &&
+ root->free_ino_ctl->free_space == 0)
+ return -ENOSPC;
+ else
+ goto again;
+}
+
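+/*
+ * Give an inode number back to the cache. The number is parked in the
+ * pinned tree first and only moved to the free_ino tree at transaction
+ * commit (see btrfs_unpin_free_ino()), so it cannot be handed out again
+ * while the old inode item may still be visible in the commit root.
+ */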
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return;
+again:
+ if (root->cached == BTRFS_CACHE_FINISHED) {
+ __btrfs_add_free_space(pinned, objectid, 1);
+ } else {
+ down_write(&root->fs_info->commit_root_sem);
+ spin_lock(&root->cache_lock);
+ if (root->cached == BTRFS_CACHE_FINISHED) {
+ spin_unlock(&root->cache_lock);
+ up_write(&root->fs_info->commit_root_sem);
+ goto again;
+ }
+ spin_unlock(&root->cache_lock);
+
+ start_caching(root);
+
+ __btrfs_add_free_space(pinned, objectid, 1);
+
+ up_write(&root->fs_info->commit_root_sem);
+ }
+}
+
+/*
+ * When a transaction is committed, we'll move those inode numbers which
+ * are smaller than root->cache_progress from pinned tree to free_ino tree,
+ * and others will just be dropped, because the commit root we were
+ * searching has changed.
+ *
+ * Must be called with root->fs_info->commit_root_sem held
+ */
+void btrfs_unpin_free_ino(struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ u64 count;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return;
+
+ while (1) {
+ n = rb_first(rbroot);
+ if (!n)
+ break;
+
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ BUG_ON(info->bitmap); /* Logic error */
+
+ if (info->offset > root->cache_progress)
+ goto free;
+ else if (info->offset + info->bytes > root->cache_progress)
+ count = root->cache_progress - info->offset + 1;
+ else
+ count = info->bytes;
+
+ __btrfs_add_free_space(ctl, info->offset, count);
+free:
+ rb_erase(&info->offset_index, rbroot);
+ kfree(info);
+ }
+}
+
+#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
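+/*
+ * With 4K pages (PAGE_CACHE_SIZE is architecture dependent, 4K is just an
+ * example) each bitmap covers 32768 inode numbers and INIT_THRESHOLD comes
+ * out to 16K worth of struct btrfs_free_space entries.
+ */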
+
+/*
+ * The goal is to keep the memory used by the free_ino tree from exceeding
+ * what it would use if we stored everything in bitmaps only.
+ */
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int max_ino;
+ int max_bitmaps;
+
+ n = rb_last(&ctl->free_space_offset);
+ if (!n) {
+ ctl->extents_thresh = INIT_THRESHOLD;
+ return;
+ }
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+
+ /*
+ * Find the maximum inode number in the filesystem. Note we
+ * ignore the fact that this can be a bitmap, because we are
+ * not doing precise calculation.
+ */
+ max_ino = info->bytes - 1;
+
+ max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
+ if (max_bitmaps <= ctl->total_bitmaps) {
+ ctl->extents_thresh = 0;
+ return;
+ }
+
+ ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
+ PAGE_CACHE_SIZE / sizeof(*info);
+}
+
+/*
+ * We don't fall back to a bitmap if we are below the extents threshold
+ * or if this chunk of inode numbers is a big one.
+ */
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ if (ctl->free_extents < ctl->extents_thresh ||
+ info->bytes > INODES_PER_BITMAP / 10)
+ return false;
+
+ return true;
+}
+
+static struct btrfs_free_space_op free_ino_op = {
+ .recalc_thresholds = recalculate_thresholds,
+ .use_bitmap = use_bitmap,
+};
+
+static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+}
+
+static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ /*
+ * We always use extents for two reasons:
+ *
+ * - The pinned tree is only used during the process of caching
+ * work.
+ * - Make code simpler. See btrfs_unpin_free_ino().
+ */
+ return false;
+}
+
+static struct btrfs_free_space_op pinned_free_ino_op = {
+ .recalc_thresholds = pinned_recalc_thresholds,
+ .use_bitmap = pinned_use_bitmap,
+};
+
+void btrfs_init_free_ino_ctl(struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+ spin_lock_init(&ctl->tree_lock);
+ ctl->unit = 1;
+ ctl->start = 0;
+ ctl->private = NULL;
+ ctl->op = &free_ino_op;
+
+ /*
+	 * Initially we allow 16K of RAM to cache chunks of
+	 * inode numbers before we resort to bitmaps. This is somewhat
+	 * arbitrary, but it will be adjusted at runtime.
+ */
+ ctl->extents_thresh = INIT_THRESHOLD;
+
+ spin_lock_init(&pinned->tree_lock);
+ pinned->unit = 1;
+ pinned->start = 0;
+ pinned->private = NULL;
+ pinned->extents_thresh = 0;
+ pinned->op = &pinned_free_ino_op;
+}
+
+int btrfs_save_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_path *path;
+ struct inode *inode;
+ struct btrfs_block_rsv *rsv;
+ u64 num_bytes;
+ u64 alloc_hint = 0;
+ int ret;
+ int prealloc;
+ bool retry = false;
+
+	/* only fs tree and subvol/snap need ino cache */
+ if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
+ (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
+ return 0;
+
+ /* Don't save inode cache if we are deleting this root */
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 0;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+ num_bytes = trans->bytes_reserved;
+ /*
+	 * 1 item for inode item insertion if needed
+	 * 4 items for inode item update (in the worst case)
+	 * 1 item for slack space if we need to do truncation
+ * 1 item for free space object
+ * 3 items for pre-allocation
+ */
+ trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
+ ret = btrfs_block_rsv_add(root, trans->block_rsv,
+ trans->bytes_reserved,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (ret)
+ goto out;
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+ trans->transid, trans->bytes_reserved, 1);
+again:
+ inode = lookup_free_ino_inode(root, path);
+ if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
+ ret = PTR_ERR(inode);
+ goto out_release;
+ }
+
+ if (IS_ERR(inode)) {
+ BUG_ON(retry); /* Logic error */
+ retry = true;
+
+ ret = create_free_ino_inode(root, trans, path);
+ if (ret)
+ goto out_release;
+ goto again;
+ }
+
+ BTRFS_I(inode)->generation = 0;
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
+
+ if (i_size_read(inode) > 0) {
+ ret = btrfs_truncate_free_space_cache(root, trans, inode);
+ if (ret) {
+ if (ret != -ENOSPC)
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
+ }
+
+ spin_lock(&root->cache_lock);
+ if (root->cached != BTRFS_CACHE_FINISHED) {
+ ret = -1;
+ spin_unlock(&root->cache_lock);
+ goto out_put;
+ }
+ spin_unlock(&root->cache_lock);
+
+ spin_lock(&ctl->tree_lock);
+ prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
+ prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
+ prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
+ spin_unlock(&ctl->tree_lock);
+
+ /* Just to make sure we have enough space */
+ prealloc += 8 * PAGE_CACHE_SIZE;
+
+ ret = btrfs_delalloc_reserve_space(inode, prealloc);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
+ prealloc, prealloc, &alloc_hint);
+ if (ret) {
+ btrfs_delalloc_release_space(inode, prealloc);
+ goto out_put;
+ }
+ btrfs_free_reserved_data_space(inode, prealloc);
+
+ ret = btrfs_write_out_ino_cache(root, trans, path, inode);
+out_put:
+ iput(inode);
+out_release:
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+ trans->transid, trans->bytes_reserved, 0);
+ btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
+out:
+ trans->block_rsv = rsv;
+ trans->bytes_reserved = num_bytes;
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
{
struct btrfs_path *path;
int ret;
@@ -30,7 +516,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
int slot;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
search_key.type = -1;
@@ -38,7 +525,7 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
l = path->nodes[0];
@@ -54,15 +541,14 @@ error:
return ret;
}
-int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 dirid, u64 *objectid)
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
{
int ret;
mutex_lock(&root->objectid_mutex);
if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
- ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+ ret = btrfs_find_highest_objectid(root,
+ &root->highest_objectid);
if (ret)
goto out;
}
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
new file mode 100644
index 00000000000..ddb347bfee2
--- /dev/null
+++ b/fs/btrfs/inode-map.h
@@ -0,0 +1,13 @@
+#ifndef __BTRFS_INODE_MAP
+#define __BTRFS_INODE_MAP
+
+void btrfs_init_free_ino_ctl(struct btrfs_root *root);
+void btrfs_unpin_free_ino(struct btrfs_root *root);
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
+int btrfs_save_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans);
+
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+
+#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 512c3d1da08..3668048e16f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,27 +32,36 @@
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
+#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
-#include "compat.h"
+#include <linux/ratelimit.h>
+#include <linux/mount.h>
+#include <linux/btrfs.h>
+#include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
-#include "volumes.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
+#include "volumes.h"
#include "compression.h"
#include "locking.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
+#include "backref.h"
+#include "hash.h"
+#include "props.h"
struct btrfs_iget_args {
- u64 ino;
+ struct btrfs_key *location;
struct btrfs_root *root;
};
@@ -67,9 +76,11 @@ static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
+struct kmem_cache *btrfs_free_space_cachep;
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -82,12 +93,20 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
};
-static void btrfs_truncate(struct inode *inode);
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
+static int btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+ u64 len, u64 orig_start,
+ u64 block_start, u64 block_len,
+ u64 orig_block_len, u64 ram_bytes,
+ int type);
+
+static int btrfs_dirty_inode(struct inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
@@ -106,13 +125,13 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* the btree. The caller should have done a btrfs_drop_extents so that
* no overlapping inline items exist in the btree
*/
-static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, int extent_inserted,
struct btrfs_root *root, struct inode *inode,
u64 start, size_t size, size_t compressed_size,
+ int compress_type,
struct page **compressed_pages)
{
- struct btrfs_key key;
- struct btrfs_path *path;
struct extent_buffer *leaf;
struct page *page = NULL;
char *kaddr;
@@ -121,34 +140,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
int err = 0;
int ret;
size_t cur_size = size;
- size_t datasize;
unsigned long offset;
- int compress_type = BTRFS_COMPRESS_NONE;
- if (compressed_size && compressed_pages) {
- compress_type = root->fs_info->compress_type;
+ if (compressed_size && compressed_pages)
cur_size = compressed_size;
- }
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ inode_add_bytes(inode, size);
- path->leave_spinning = 1;
- btrfs_set_trans_block_group(trans, inode);
+ if (!extent_inserted) {
+ struct btrfs_key key;
+ size_t datasize;
- key.objectid = inode->i_ino;
- key.offset = start;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
- datasize = btrfs_file_extent_calc_inline_size(cur_size);
+ key.objectid = btrfs_ino(inode);
+ key.offset = start;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
- inode_add_bytes(inode, size);
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- datasize);
- BUG_ON(ret);
- if (ret) {
- err = ret;
- goto fail;
+ datasize = btrfs_file_extent_calc_inline_size(cur_size);
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -168,9 +182,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_CACHE_SIZE);
- kaddr = kmap_atomic(cpage, KM_USER0);
+ kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
i++;
ptr += cur_size;
@@ -182,14 +196,14 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
page = find_get_page(inode->i_mapping,
start >> PAGE_CACHE_SHIFT);
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
offset = start & (PAGE_CACHE_SIZE - 1);
write_extent_buffer(leaf, kaddr + offset, ptr, size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
page_cache_release(page);
}
btrfs_mark_buffer_dirty(leaf);
- btrfs_free_path(path);
+ btrfs_release_path(path);
/*
* we're an inline extent, so nobody can
@@ -201,11 +215,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
* could end up racing with unlink.
*/
BTRFS_I(inode)->disk_i_size = inode->i_size;
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, inode);
- return 0;
+ return ret;
fail:
- btrfs_free_path(path);
return err;
}
@@ -215,20 +228,22 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode, u64 start, u64 end,
- size_t compressed_size,
- struct page **compressed_pages)
+static noinline int cow_file_range_inline(struct btrfs_root *root,
+ struct inode *inode, u64 start,
+ u64 end, size_t compressed_size,
+ int compress_type,
+ struct page **compressed_pages)
{
+ struct btrfs_trans_handle *trans;
u64 isize = i_size_read(inode);
u64 actual_end = min(end + 1, isize);
u64 inline_len = actual_end - start;
- u64 aligned_end = (end + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
- u64 hint_byte;
+ u64 aligned_end = ALIGN(end, root->sectorsize);
u64 data_len = inline_len;
int ret;
+ struct btrfs_path *path;
+ int extent_inserted = 0;
+ u32 extent_item_size;
if (compressed_size)
data_len = compressed_size;
@@ -243,19 +258,53 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
return 1;
}
- ret = btrfs_drop_extents(trans, inode, start, aligned_end,
- &hint_byte, 1);
- BUG_ON(ret);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ if (compressed_size && compressed_pages)
+ extent_item_size = btrfs_file_extent_calc_inline_size(
+ compressed_size);
+ else
+ extent_item_size = btrfs_file_extent_calc_inline_size(
+ inline_len);
+
+ ret = __btrfs_drop_extents(trans, root, inode, path,
+ start, aligned_end, NULL,
+ 1, 1, extent_item_size, &extent_inserted);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
- ret = insert_inline_extent(trans, root, inode, start,
+ ret = insert_inline_extent(trans, path, extent_inserted,
+ root, inode, start,
inline_len, compressed_size,
- compressed_pages);
- BUG_ON(ret);
+ compress_type, compressed_pages);
+ if (ret && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ } else if (ret == -ENOSPC) {
+ ret = 1;
+ goto out;
+ }
+
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
- return 0;
+out:
+ btrfs_free_path(path);
+ btrfs_end_transaction(trans, root);
+ return ret;
}
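
cow_file_range_inline() now owns its own transaction and path and reports three outcomes: a negative errno on failure, 0 when the inline extent was created, and 1 when the data cannot be inlined (too large for the early size checks, or -ENOSPC from insert_inline_extent, which is downgraded to 1) so the caller falls back to ordinary COW. A minimal userspace sketch of that convention; try_inline and its parameters are illustrative stand-ins, not btrfs API:

#include <stdio.h>

/* Illustrative stand-in: <0 error, 0 inline extent created, 1 fall back. */
static int try_inline(long len, long limit, int enospc)
{
	if (len > limit)
		return 1;		/* too big to inline */
	if (enospc)
		return 1;		/* -ENOSPC is treated as "fall back", not failure */
	return 0;			/* inline extent created */
}

int main(void)
{
	int ret = try_inline(3000, 4096, 0);

	if (ret < 0)
		printf("error %d: abort the transaction path\n", ret);
	else if (ret == 0)
		printf("inlined: unlock pages and finish\n");
	else
		printf("fall back to ordinary COW allocation\n");
	return 0;
}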
struct async_extent {
@@ -288,6 +337,7 @@ static noinline int add_async_extent(struct async_cow *cow,
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+ BUG_ON(!async_extent); /* -ENOMEM */
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
@@ -312,7 +362,8 @@ static noinline int add_async_extent(struct async_cow *cow,
* If this code finds it can't get good compression, it puts an
* entry onto the work queue to write the uncompressed bytes. This
* makes sure that both compressed inodes and uncompressed inodes
- * are written in the same order that pdflush sent them down.
+ * are written in the same order that the flusher thread sent them
+ * down.
*/
static noinline int compress_file_range(struct inode *inode,
struct page *locked_page,
@@ -321,7 +372,6 @@ static noinline int compress_file_range(struct inode *inode,
int *num_added)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
u64 num_bytes;
u64 blocksize = root->sectorsize;
u64 actual_end;
@@ -337,6 +387,20 @@ static noinline int compress_file_range(struct inode *inode,
int i;
int will_compress;
int compress_type = root->fs_info->compress_type;
+ int redirty = 0;
+
+ /* if this is a small write inside eof, kick off a defrag */
+ if ((end - start + 1) < 16 * 1024 &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ btrfs_add_inode_defrag(NULL, inode);
+
+ /*
+ * skip compression for a small file range (<= blocksize) that
+ * isn't an inline extent, since it doesn't save disk space at all.
+ */
+ if ((end - start + 1) <= blocksize &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ goto cleanup_and_bail_uncompressed;
actual_end = min_t(u64, isize, end + 1);
again:
@@ -370,7 +434,7 @@ again:
* a compressed extent to 128k.
*/
total_compressed = min(total_compressed, max_uncompressed);
- num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
total_in = 0;
ret = 0;
@@ -382,13 +446,29 @@ again:
*/
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
(btrfs_test_opt(root, COMPRESS) ||
- (BTRFS_I(inode)->force_compress))) {
+ (BTRFS_I(inode)->force_compress) ||
+ (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+ if (!pages) {
+ /* just bail out to the uncompressed code */
+ goto cont;
+ }
if (BTRFS_I(inode)->force_compress)
compress_type = BTRFS_I(inode)->force_compress;
+ /*
+ * we need to call clear_page_dirty_for_io on each
+ * page in the range. Otherwise applications with the file
+ * mmap'd can wander in and change the page contents while
+ * we are compressing them.
+ *
+ * If the compression fails for any reason, we set the pages
+ * dirty again later on.
+ */
+ extent_range_clear_dirty_for_io(inode, start, end);
+ redirty = 1;
ret = btrfs_compress_pages(compress_type,
inode->i_mapping, start,
total_compressed, pages,
@@ -407,50 +487,46 @@ again:
* sending it down to disk
*/
if (offset) {
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memset(kaddr + offset, 0,
PAGE_CACHE_SIZE - offset);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
will_compress = 1;
}
}
+cont:
if (start == 0) {
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
/* lets try to make an inline extent */
if (ret || total_in < (actual_end - start)) {
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(trans, root, inode,
- start, end, 0, NULL);
+ ret = cow_file_range_inline(root, inode, start, end,
+ 0, 0, NULL);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(trans, root, inode,
- start, end,
- total_compressed, pages);
+ ret = cow_file_range_inline(root, inode, start, end,
+ total_compressed,
+ compress_type, pages);
}
- if (ret == 0) {
+ if (ret <= 0) {
+ unsigned long clear_flags = EXTENT_DELALLOC |
+ EXTENT_DEFRAG;
+ clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+
/*
- * inline extent creation worked, we don't need
- * to create any more async work items. Unlock
- * and free up our temp pages.
+ * inline extent creation worked or returned error,
+ * we don't need to create any more async work items.
+ * Unlock and free up our temp pages.
*/
- extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, NULL,
- EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
- EXTENT_CLEAR_DELALLOC |
- EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
-
- btrfs_end_transaction(trans, root);
+ extent_clear_unlock_delalloc(inode, start, end, NULL,
+ clear_flags, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK);
goto free_pages_out;
}
- btrfs_end_transaction(trans, root);
}
if (will_compress) {
@@ -459,15 +535,13 @@ again:
* up to a block size boundary so the allocator does sane
* things
*/
- total_compressed = (total_compressed + blocksize - 1) &
- ~(blocksize - 1);
+ total_compressed = ALIGN(total_compressed, blocksize);
/*
* one last check to make sure the compression is really a
* win, compare the page count read with the blocks on disk
*/
- total_in = (total_in + PAGE_CACHE_SIZE - 1) &
- ~(PAGE_CACHE_SIZE - 1);
+ total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
if (total_compressed >= total_in) {
will_compress = 0;
} else {
@@ -525,13 +599,15 @@ cleanup_and_bail_uncompressed:
__set_page_dirty_nobuffers(locked_page);
/* unlocked later on in the async handlers */
}
+ if (redirty)
+ extent_range_redirty_for_io(inode, start, end);
add_async_extent(async_cow, start, end - start + 1,
0, NULL, 0, BTRFS_COMPRESS_NONE);
*num_added += 1;
}
out:
- return 0;
+ return ret;
free_pages_out:
for (i = 0; i < nr_pages_ret; i++) {
@@ -554,7 +630,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
{
struct async_extent *async_extent;
u64 alloc_hint = 0;
- struct btrfs_trans_handle *trans;
struct btrfs_key ins;
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -565,7 +640,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
if (list_empty(&async_cow->extents))
return 0;
-
+again:
while (!list_empty(&async_cow->extents)) {
async_extent = list_entry(async_cow->extents.next,
struct async_extent, list);
@@ -581,7 +656,7 @@ retry:
lock_extent(io_tree, async_extent->start,
async_extent->start +
- async_extent->ram_size - 1, GFP_NOFS);
+ async_extent->ram_size - 1);
/* allocate blocks */
ret = cow_file_range(inode, async_cow->locked_page,
@@ -590,6 +665,8 @@ retry:
async_extent->ram_size - 1,
&page_started, &nr_written, 0);
+ /* JDM XXX */
+
/*
* if page_started, cow_file_range inserted an
* inline extent and took care of all the unlocking
@@ -603,26 +680,23 @@ retry:
async_extent->ram_size - 1,
btrfs_get_extent,
WB_SYNC_ALL);
+ else if (ret)
+ unlock_page(async_cow->locked_page);
kfree(async_extent);
cond_resched();
continue;
}
lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1,
- GFP_NOFS);
+ async_extent->start + async_extent->ram_size - 1);
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- ret = btrfs_reserve_extent(trans, root,
+ ret = btrfs_reserve_extent(root,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, alloc_hint,
- (u64)-1, &ins, 1);
- btrfs_end_transaction(trans, root);
-
+ 0, alloc_hint, &ins, 1, 1);
if (ret) {
int i;
+
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
page_cache_release(async_extent->pages[i]);
@@ -630,10 +704,14 @@ retry:
kfree(async_extent->pages);
async_extent->nr_pages = 0;
async_extent->pages = NULL;
- unlock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1, GFP_NOFS);
- goto retry;
+
+ if (ret == -ENOSPC) {
+ unlock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1);
+ goto retry;
+ }
+ goto out_free;
}
/*
@@ -644,22 +722,30 @@ retry:
async_extent->start +
async_extent->ram_size - 1, 0);
- em = alloc_extent_map(GFP_NOFS);
- BUG_ON(!em);
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto out_free_reserve;
+ }
em->start = async_extent->start;
em->len = async_extent->ram_size;
em->orig_start = em->start;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
em->block_start = ins.objectid;
em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
+ em->ram_bytes = async_extent->ram_size;
em->bdev = root->fs_info->fs_devices->latest_bdev;
em->compress_type = async_extent->compress_type;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -670,6 +756,9 @@ retry:
async_extent->ram_size - 1, 0);
}
+ if (ret)
+ goto out_free_reserve;
+
ret = btrfs_add_ordered_extent_compress(inode,
async_extent->start,
ins.objectid,
@@ -677,35 +766,45 @@ retry:
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
- BUG_ON(ret);
+ if (ret)
+ goto out_free_reserve;
/*
* clear dirty, set writeback and unlock the pages.
*/
- extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- async_extent->start,
+ extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
- NULL, EXTENT_CLEAR_UNLOCK_PAGE |
- EXTENT_CLEAR_UNLOCK |
- EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
-
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK);
ret = btrfs_submit_compressed_write(inode,
async_extent->start,
async_extent->ram_size,
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages);
-
- BUG_ON(ret);
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
+ if (ret)
+ goto out;
cond_resched();
}
-
- return 0;
+ ret = 0;
+out:
+ return ret;
+out_free_reserve:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+out_free:
+ extent_clear_unlock_delalloc(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
+ PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+ kfree(async_extent);
+ goto again;
}
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -760,7 +859,6 @@ static noinline int cow_file_range(struct inode *inode,
int unlock)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
@@ -772,42 +870,43 @@ static noinline int cow_file_range(struct inode *inode,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 0;
- BUG_ON(root == root->fs_info->tree_root);
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ if (btrfs_is_free_space_inode(inode)) {
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
disk_num_bytes = num_bytes;
- ret = 0;
+
+ /* if this is a small write inside eof, kick off defrag */
+ if (num_bytes < 64 * 1024 &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ btrfs_add_inode_defrag(NULL, inode);
if (start == 0) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(trans, root, inode,
- start, end, 0, NULL);
+ ret = cow_file_range_inline(root, inode, start, end, 0, 0,
+ NULL);
if (ret == 0) {
- extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, NULL,
- EXTENT_CLEAR_UNLOCK_PAGE |
- EXTENT_CLEAR_UNLOCK |
- EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_DIRTY |
- EXTENT_SET_WRITEBACK |
- EXTENT_END_WRITEBACK);
+ extent_clear_unlock_delalloc(inode, start, end, NULL,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DEFRAG, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK);
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started = 1;
- ret = 0;
goto out;
+ } else if (ret < 0) {
+ goto out_unlock;
}
}
BUG_ON(disk_num_bytes >
- btrfs_super_total_bytes(&root->fs_info->super_copy));
+ btrfs_super_total_bytes(root->fs_info->super_copy));
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
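
Several hunks above replace open-coded round-up arithmetic such as (end - start + blocksize) & ~(blocksize - 1) with ALIGN(end - start + 1, blocksize). For a power-of-two alignment the two forms are equivalent; the check below re-defines the macro in userspace with the same shape (an assumption standing in for the kernel header, under the hypothetical name ALIGN_UP):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same shape as the kernel's ALIGN() for power-of-two alignments. */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t blocksize = 4096;
	uint64_t start = 8192, end = 13000;

	/* old open-coded form vs. the new ALIGN() form */
	uint64_t old_way = (end - start + blocksize) & ~(blocksize - 1);
	uint64_t new_way = ALIGN_UP(end - start + 1, blocksize);

	assert(old_way == new_way);
	printf("%llu bytes round up to %llu\n",
	       (unsigned long long)(end - start + 1),
	       (unsigned long long)new_way);
	return 0;
}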
@@ -816,26 +915,35 @@ static noinline int cow_file_range(struct inode *inode,
unsigned long op;
cur_alloc_size = disk_num_bytes;
- ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+ ret = btrfs_reserve_extent(root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
- (u64)-1, &ins, 1);
- BUG_ON(ret);
+ &ins, 1, 1);
+ if (ret < 0)
+ goto out_unlock;
- em = alloc_extent_map(GFP_NOFS);
- BUG_ON(!em);
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto out_reserve;
+ }
em->start = start;
em->orig_start = em->start;
ram_size = ins.offset;
em->len = ins.offset;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
em->block_start = ins.objectid;
em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
+ em->ram_bytes = ram_size;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -844,17 +952,21 @@ static noinline int cow_file_range(struct inode *inode,
btrfs_drop_extent_cache(inode, start,
start + ram_size - 1, 0);
}
+ if (ret)
+ goto out_reserve;
cur_alloc_size = ins.offset;
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size, 0);
- BUG_ON(ret);
+ if (ret)
+ goto out_reserve;
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
- BUG_ON(ret);
+ if (ret)
+ goto out_reserve;
}
if (disk_num_bytes < cur_alloc_size)
@@ -867,23 +979,30 @@ static noinline int cow_file_range(struct inode *inode,
* Do set the Private2 bit so we know this page was properly
* setup for writepage
*/
- op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
- op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
- EXTENT_SET_PRIVATE2;
+ op = unlock ? PAGE_UNLOCK : 0;
+ op |= PAGE_SET_PRIVATE2;
- extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
- start, start + ram_size - 1,
- locked_page, op);
+ extent_clear_unlock_delalloc(inode, start,
+ start + ram_size - 1, locked_page,
+ EXTENT_LOCKED | EXTENT_DELALLOC,
+ op);
disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
}
out:
- ret = 0;
- btrfs_end_transaction(trans, root);
-
return ret;
+
+out_reserve:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+out_unlock:
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DELALLOC | EXTENT_DEFRAG,
+ PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+ goto out;
}
/*
@@ -898,8 +1017,10 @@ static noinline void async_cow_start(struct btrfs_work *work)
compress_file_range(async_cow->inode, async_cow->locked_page,
async_cow->start, async_cow->end, async_cow,
&num_added);
- if (num_added == 0)
+ if (num_added == 0) {
+ btrfs_add_delayed_iput(async_cow->inode);
async_cow->inode = NULL;
+ }
}
/*
@@ -917,10 +1038,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
- atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
-
- if (atomic_read(&root->fs_info->async_delalloc_pages) <
- 5 * 1042 * 1024 &&
+ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
+ 5 * 1024 * 1024 &&
waitqueue_active(&root->fs_info->async_submit_wait))
wake_up(&root->fs_info->async_submit_wait);
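
The hunk above folds atomic_sub() plus a separate atomic_read() into one atomic_sub_return(), so the threshold test sees exactly the value this CPU produced rather than whatever the counter holds a moment later (it also fixes the 5 * 1042 * 1024 typo). The same pattern in portable C11 atomics, purely as an illustration; pending_pages, should_wake and LIMIT are invented names:

#include <stdatomic.h>
#include <stdio.h>

#define LIMIT (5L * 1024 * 1024)

static atomic_long pending_pages;

/* atomic_fetch_sub() returns the value *before* the subtraction, so
 * (old - nr) is the post-subtraction value, mirroring atomic_sub_return(). */
static int should_wake(long nr)
{
	long now = atomic_fetch_sub(&pending_pages, nr) - nr;

	return now < LIMIT;
}

int main(void)
{
	atomic_init(&pending_pages, 8L * 1024 * 1024);
	printf("wake waiters: %d\n", should_wake(4L * 1024 * 1024));
	return 0;
}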
@@ -932,6 +1051,8 @@ static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_cow *async_cow;
async_cow = container_of(work, struct async_cow, work);
+ if (async_cow->inode)
+ btrfs_add_delayed_iput(async_cow->inode);
kfree(async_cow);
}
@@ -943,13 +1064,14 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
u64 cur_end;
- int limit = 10 * 1024 * 1042;
+ int limit = 10 * 1024 * 1024;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1, 0, NULL, GFP_NOFS);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
- async_cow->inode = inode;
+ BUG_ON(!async_cow); /* -ENOMEM */
+ async_cow->inode = igrab(inode);
async_cow->root = root;
async_cow->locked_page = locked_page;
async_cow->start = start;
@@ -962,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
- async_cow->work.func = async_cow_start;
- async_cow->work.ordered_func = async_cow_submit;
- async_cow->work.ordered_free = async_cow_free;
- async_cow->work.flags = 0;
+ btrfs_init_work(&async_cow->work, async_cow_start,
+ async_cow_submit, async_cow_free);
nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
- btrfs_queue_worker(&root->fs_info->delalloc_workers,
- &async_cow->work);
+ btrfs_queue_work(root->fs_info->delalloc_workers,
+ &async_cow->work);
if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
wait_event(root->fs_info->async_submit_wait,
@@ -1002,7 +1122,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
LIST_HEAD(list);
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
- bytenr + num_bytes - 1, &list);
+ bytenr + num_bytes - 1, &list, 0);
if (ret == 0 && list_empty(&list))
return 0;
@@ -1038,34 +1158,61 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 extent_offset;
u64 disk_bytenr;
u64 num_bytes;
+ u64 disk_num_bytes;
+ u64 ram_bytes;
int extent_type;
- int ret;
+ int ret, err;
int type;
int nocow;
int check_prev = 1;
- bool nolock = false;
+ bool nolock;
+ u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
- BUG_ON(!path);
- if (root == root->fs_info->tree_root) {
- nolock = true;
- trans = btrfs_join_transaction_nolock(root, 1);
- } else {
- trans = btrfs_join_transaction(root, 1);
+ if (!path) {
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ return -ENOMEM;
+ }
+
+ nolock = btrfs_is_free_space_inode(inode);
+
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root);
+ else
+ trans = btrfs_join_transaction(root);
+
+ if (IS_ERR(trans)) {
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
}
- BUG_ON(IS_ERR(trans));
+
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
cow_start = (u64)-1;
cur_offset = start;
while (1) {
- ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ ret = btrfs_lookup_file_extent(trans, root, path, ino,
cur_offset, 0);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto error;
if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
path->slots[0] - 1);
- if (found_key.objectid == inode->i_ino &&
+ if (found_key.objectid == ino &&
found_key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
@@ -1075,7 +1222,7 @@ next_slot:
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- BUG_ON(1);
+ goto error;
if (ret > 0)
break;
leaf = path->nodes[0];
@@ -1086,7 +1233,7 @@ next_slot:
num_bytes = 0;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid > inode->i_ino ||
+ if (found_key.objectid > ino ||
found_key.type > BTRFS_EXTENT_DATA_KEY ||
found_key.offset > end)
break;
@@ -1101,12 +1248,15 @@ next_slot:
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
extent_offset = btrfs_file_extent_offset(leaf, fi);
extent_end = found_key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
+ disk_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
if (extent_end <= start) {
path->slots[0]++;
goto next_slot;
@@ -1121,7 +1271,7 @@ next_slot:
goto out_check;
if (btrfs_extent_readonly(root, disk_bytenr))
goto out_check;
- if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ if (btrfs_cross_ref_exist(trans, root, ino,
found_key.offset -
extent_offset, disk_bytenr))
goto out_check;
@@ -1129,6 +1279,15 @@ next_slot:
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
/*
+ * if there are pending snapshots for this root,
+ * we fall back to the common COW path.
+ */
+ if (!nolock) {
+ err = btrfs_start_nocow_write(root);
+ if (!err)
+ goto out_check;
+ }
+ /*
* force cow if csum exists in the range.
* this ensures that csums for a given extent are
* either valid or do not exist.
@@ -1138,7 +1297,8 @@ next_slot:
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
- btrfs_file_extent_inline_len(leaf, fi);
+ btrfs_file_extent_inline_len(leaf,
+ path->slots[0], fi);
extent_end = ALIGN(extent_end, root->sectorsize);
} else {
BUG_ON(1);
@@ -1146,6 +1306,8 @@ next_slot:
out_check:
if (extent_end <= start) {
path->slots[0]++;
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto next_slot;
}
if (!nocow) {
@@ -1158,12 +1320,16 @@ out_check:
goto next_slot;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page, cow_start,
- found_key.offset - 1, page_started,
- nr_written, 1);
- BUG_ON(ret);
+ ret = cow_file_range(inode, locked_page,
+ cow_start, found_key.offset - 1,
+ page_started, nr_written, 1);
+ if (ret) {
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
+ goto error;
+ }
cow_start = (u64)-1;
}
@@ -1171,18 +1337,24 @@ out_check:
struct extent_map *em;
struct extent_map_tree *em_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
- em = alloc_extent_map(GFP_NOFS);
- BUG_ON(!em);
+ em = alloc_extent_map();
+ BUG_ON(!em); /* -ENOMEM */
em->start = cur_offset;
- em->orig_start = em->start;
+ em->orig_start = found_key.offset - extent_offset;
em->len = num_bytes;
em->block_len = num_bytes;
em->block_start = disk_bytenr;
+ em->orig_block_len = disk_num_bytes;
+ em->ram_bytes = ram_bytes;
em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ set_bit(EXTENT_FLAG_FILLING, &em->flags);
+ em->generation = -1;
while (1) {
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -1198,43 +1370,59 @@ out_check:
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
num_bytes, num_bytes, type);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
- BUG_ON(ret);
+ if (ret) {
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
+ goto error;
+ }
}
- extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
- cur_offset, cur_offset + num_bytes - 1,
- locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
- EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
- EXTENT_SET_PRIVATE2);
+ extent_clear_unlock_delalloc(inode, cur_offset,
+ cur_offset + num_bytes - 1,
+ locked_page, EXTENT_LOCKED |
+ EXTENT_DELALLOC, PAGE_UNLOCK |
+ PAGE_SET_PRIVATE2);
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
cur_offset = extent_end;
if (cur_offset > end)
break;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- if (cur_offset <= end && cow_start == (u64)-1)
+ if (cur_offset <= end && cow_start == (u64)-1) {
cow_start = cur_offset;
+ cur_offset = end;
+ }
+
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page, cow_start, end,
page_started, nr_written, 1);
- BUG_ON(ret);
+ if (ret)
+ goto error;
}
- if (nolock) {
- ret = btrfs_end_transaction_nolock(trans, root);
- BUG_ON(ret);
- } else {
- ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
- }
+error:
+ err = btrfs_end_transaction(trans, root);
+ if (!ret)
+ ret = err;
+
+ if (ret && cur_offset < end)
+ extent_clear_unlock_delalloc(inode, cur_offset, end,
+ locked_page, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_DEFRAG |
+ EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK);
btrfs_free_path(path);
- return 0;
+ return ret;
}
/*
@@ -1247,31 +1435,36 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
- else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
+ } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- else if (!btrfs_test_opt(root, COMPRESS) &&
- !(BTRFS_I(inode)->force_compress))
+ } else if (!btrfs_test_opt(root, COMPRESS) &&
+ !(BTRFS_I(inode)->force_compress) &&
+ !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
ret = cow_file_range(inode, locked_page, start, end,
page_started, nr_written, 1);
- else
+ } else {
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags);
ret = cow_file_range_async(inode, locked_page, start, end,
page_started, nr_written);
+ }
return ret;
}
-static int btrfs_split_extent_hook(struct inode *inode,
- struct extent_state *orig, u64 split)
+static void btrfs_split_extent_hook(struct inode *inode,
+ struct extent_state *orig, u64 split)
{
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
- return 0;
+ return;
- atomic_inc(&BTRFS_I(inode)->outstanding_extents);
- return 0;
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
}
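
btrfs_split_extent_hook() and btrfs_merge_extent_hook() become void and adjust outstanding_extents under BTRFS_I(inode)->lock instead of a bare atomic. The bookkeeping itself is simple: splitting one delalloc extent means one more outstanding extent, merging two means one fewer. A toy model of that accounting in plain userspace C (toy_inode, split_hook and merge_hook are illustrative, not the btrfs structures or locking):

#include <assert.h>
#include <pthread.h>

/* Toy stand-in for the per-inode state guarded by BTRFS_I(inode)->lock. */
struct toy_inode {
	pthread_mutex_t lock;
	long outstanding_extents;
};

static void split_hook(struct toy_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	inode->outstanding_extents++;	/* one extent became two */
	pthread_mutex_unlock(&inode->lock);
}

static void merge_hook(struct toy_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	inode->outstanding_extents--;	/* two extents became one */
	pthread_mutex_unlock(&inode->lock);
}

int main(void)
{
	struct toy_inode inode = { PTHREAD_MUTEX_INITIALIZER, 1 };

	split_hook(&inode);	/* 1 -> 2 */
	split_hook(&inode);	/* 2 -> 3 */
	merge_hook(&inode);	/* 3 -> 2 */
	assert(inode.outstanding_extents == 2);
	return 0;
}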
/*
@@ -1280,16 +1473,57 @@ static int btrfs_split_extent_hook(struct inode *inode,
* extents, such as when we are doing sequential writes, so we can properly
* account for the metadata space we'll need.
*/
-static int btrfs_merge_extent_hook(struct inode *inode,
- struct extent_state *new,
- struct extent_state *other)
+static void btrfs_merge_extent_hook(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other)
{
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
- return 0;
+ return;
- atomic_dec(&BTRFS_I(inode)->outstanding_extents);
- return 0;
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+ struct inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+ &root->delalloc_inodes);
+ set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ root->nr_delalloc_inodes++;
+ if (root->nr_delalloc_inodes == 1) {
+ spin_lock(&root->fs_info->delalloc_root_lock);
+ BUG_ON(!list_empty(&root->delalloc_root));
+ list_add_tail(&root->delalloc_root,
+ &root->fs_info->delalloc_roots);
+ spin_unlock(&root->fs_info->delalloc_root_lock);
+ }
+ }
+ spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ root->nr_delalloc_inodes--;
+ if (!root->nr_delalloc_inodes) {
+ spin_lock(&root->fs_info->delalloc_root_lock);
+ BUG_ON(list_empty(&root->delalloc_root));
+ list_del_init(&root->delalloc_root);
+ spin_unlock(&root->fs_info->delalloc_root_lock);
+ }
+ }
+ spin_unlock(&root->delalloc_lock);
}
/*
@@ -1297,91 +1531,99 @@ static int btrfs_merge_extent_hook(struct inode *inode,
* bytes in this file, and to maintain the list of inodes that
* have pending delalloc work to be done.
*/
-static int btrfs_set_bit_hook(struct inode *inode,
- struct extent_state *state, int *bits)
+static void btrfs_set_bit_hook(struct inode *inode,
+ struct extent_state *state, unsigned long *bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
- * but in this case, we are only testeing for the DELALLOC
+ * but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- int do_list = (root->root_key.objectid !=
- BTRFS_ROOT_TREE_OBJECTID);
+ bool do_list = !btrfs_is_free_space_inode(inode);
- if (*bits & EXTENT_FIRST_DELALLOC)
+ if (*bits & EXTENT_FIRST_DELALLOC) {
*bits &= ~EXTENT_FIRST_DELALLOC;
- else
- atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
- spin_lock(&root->fs_info->delalloc_lock);
+ __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+ root->fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
- root->fs_info->delalloc_bytes += len;
- if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
+ if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_add_delalloc_inodes(root, inode);
+ spin_unlock(&BTRFS_I(inode)->lock);
}
- return 0;
}
/*
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
-static int btrfs_clear_bit_hook(struct inode *inode,
- struct extent_state *state, int *bits)
+static void btrfs_clear_bit_hook(struct inode *inode,
+ struct extent_state *state,
+ unsigned long *bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
- * but in this case, we are only testeing for the DELALLOC
+ * but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- int do_list = (root->root_key.objectid !=
- BTRFS_ROOT_TREE_OBJECTID);
+ bool do_list = !btrfs_is_free_space_inode(inode);
- if (*bits & EXTENT_FIRST_DELALLOC)
+ if (*bits & EXTENT_FIRST_DELALLOC) {
*bits &= ~EXTENT_FIRST_DELALLOC;
- else if (!(*bits & EXTENT_DO_ACCOUNTING))
- atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+ } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
- if (*bits & EXTENT_DO_ACCOUNTING)
+ /*
+ * We don't reserve metadata space for space cache inodes so we
+ * don't need to call delalloc_release_metadata if there is an
+ * error.
+ */
+ if (*bits & EXTENT_DO_ACCOUNTING &&
+ root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len);
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && do_list)
+ && do_list && !(state->state & EXTENT_NORESERVE))
btrfs_free_reserved_data_space(inode, len);
- spin_lock(&root->fs_info->delalloc_lock);
- root->fs_info->delalloc_bytes -= len;
+ __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+ root->fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes -= len;
-
if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
- !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_del_init(&BTRFS_I(inode)->delalloc_inodes);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
+ test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_del_delalloc_inode(root, inode);
+ spin_unlock(&BTRFS_I(inode)->lock);
}
- return 0;
}
/*
* extent_io.c merge_bio_hook, this must check the chunk tree to make sure
* we don't create bios that span stripes or chunks
*/
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct btrfs_mapping_tree *map_tree;
- u64 logical = (u64)bio->bi_sector << 9;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
@@ -1389,15 +1631,15 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
- length = bio->bi_size;
- map_tree = &root->fs_info->mapping_tree;
+ length = bio->bi_iter.bi_size;
map_length = length;
- ret = btrfs_map_block(map_tree, READ, logical,
+ ret = btrfs_map_block(root->fs_info, rw, logical,
&map_length, NULL, 0);
-
+ /* Will always return 0 with map_multi == NULL */
+ BUG_ON(ret < 0);
if (map_length < length + size)
return 1;
- return ret;
+ return 0;
}
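
btrfs_merge_bio_hook() now takes the request direction, reads the bio geometry from bi_iter, and asks btrfs_map_block() how many contiguous bytes are mapped at the bio's logical address; if adding size more bytes would run past that mapping (a stripe/chunk boundary), it returns 1 to refuse the merge, otherwise 0. The decision reduces to a bounds check, sketched below with a hypothetical fixed stripe length instead of the real chunk lookup (STRIPE_LEN, map_length and refuse_merge are illustrative names):

#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN	(64 * 1024)	/* assumed stripe size, for illustration */

/* How many bytes remain inside the stripe that contains 'logical'. */
static uint64_t map_length(uint64_t logical)
{
	return STRIPE_LEN - (logical % STRIPE_LEN);
}

/* Mirror of the merge decision: 1 = do not merge, 0 = merging is fine. */
static int refuse_merge(uint64_t logical, uint64_t bio_size, uint64_t add)
{
	return map_length(logical) < bio_size + add ? 1 : 0;
}

int main(void)
{
	uint64_t logical = 60 * 1024;	/* 4 KiB left in this stripe */

	printf("append 4K: %d\n", refuse_merge(logical, 0, 4096));	/* 0 */
	printf("append 8K: %d\n", refuse_merge(logical, 0, 8192));	/* 1 */
	return 0;
}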
/*
@@ -1417,7 +1659,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
int ret = 0;
ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -1434,7 +1676,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+ int ret;
+
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+ if (ret)
+ bio_endio(bio, ret);
+ return ret;
}
/*
@@ -1448,36 +1695,54 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
int skip_sum;
+ int metadata = 0;
+ int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- if (root == root->fs_info->tree_root)
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
- else
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- BUG_ON(ret);
+ if (btrfs_is_free_space_inode(inode))
+ metadata = 2;
if (!(rw & REQ_WRITE)) {
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+ if (ret)
+ goto out;
+
if (bio_flags & EXTENT_BIO_COMPRESSED) {
- return btrfs_submit_compressed_read(inode, bio,
- mirror_num, bio_flags);
- } else if (!skip_sum)
- btrfs_lookup_bio_sums(root, inode, bio, NULL);
+ ret = btrfs_submit_compressed_read(inode, bio,
+ mirror_num,
+ bio_flags);
+ goto out;
+ } else if (!skip_sum) {
+ ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
+ if (ret)
+ goto out;
+ }
goto mapit;
- } else if (!skip_sum) {
+ } else if (async && !skip_sum) {
/* csum items have already been cloned */
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
- return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num,
bio_flags, bio_offset,
__btrfs_submit_bio_start,
__btrfs_submit_bio_done);
+ goto out;
+ } else if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+ if (ret)
+ goto out;
}
mapit:
- return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+ if (ret < 0)
+ bio_endio(bio, ret);
+ return ret;
}
/*
@@ -1490,11 +1755,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
{
struct btrfs_ordered_sum *sum;
- btrfs_set_trans_block_group(trans, inode);
-
list_for_each_entry(sum, list, list) {
+ trans->adding_csums = 1;
btrfs_csum_file_blocks(trans,
BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ trans->adding_csums = 0;
}
return 0;
}
@@ -1502,8 +1767,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state)
{
- if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
- WARN_ON(1);
+ WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
cached_state, GFP_NOFS);
}
@@ -1523,6 +1787,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct inode *inode;
u64 page_start;
u64 page_end;
+ int ret;
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
@@ -1538,7 +1803,7 @@ again:
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
/* already ordered? We're done */
if (PagePrivate2(page))
@@ -1550,12 +1815,21 @@ again:
page_end, &cached_state, GFP_NOFS);
unlock_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
goto again;
}
- BUG();
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ if (ret) {
+ mapping_set_error(page->mapping, ret);
+ end_extent_writepage(page, ret, page_start, page_end);
+ ClearPageChecked(page);
+ goto out;
+ }
+
btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
ClearPageChecked(page);
+ set_page_dirty(page);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state, GFP_NOFS);
@@ -1595,10 +1869,10 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
SetPageChecked(page);
page_cache_get(page);
- fixup->work.func = btrfs_writepage_fixup_worker;
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
- btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
- return -EAGAIN;
+ btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
+ return -EBUSY;
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -1613,13 +1887,12 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
- u64 hint;
+ int extent_inserted = 0;
int ret;
path = btrfs_alloc_path();
- BUG_ON(!path);
-
- path->leave_spinning = 1;
+ if (!path)
+ return -ENOMEM;
/*
* we may be replacing one extent in the tree with another.
@@ -1630,15 +1903,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
- ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
- &hint, 0);
- BUG_ON(ret);
+ ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
+ file_pos + num_bytes, NULL, 0,
+ 1, sizeof(*fi), &extent_inserted);
+ if (ret)
+ goto out;
- ins.objectid = inode->i_ino;
- ins.offset = file_pos;
- ins.type = BTRFS_EXTENT_DATA_KEY;
- ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
- BUG_ON(ret);
+ if (!extent_inserted) {
+ ins.objectid = btrfs_ino(inode);
+ ins.offset = file_pos;
+ ins.type = BTRFS_EXTENT_DATA_KEY;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &ins,
+ sizeof(*fi));
+ if (ret)
+ goto out;
+ }
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -1653,10 +1934,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_encryption(leaf, fi, encryption);
btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
- btrfs_unlock_up_safe(path, 1);
- btrfs_set_lock_blocking(leaf);
-
btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
inode_add_bytes(inode, num_bytes);
@@ -1665,69 +1944,756 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
- inode->i_ino, file_pos, &ins);
- BUG_ON(ret);
+ btrfs_ino(inode), file_pos, &ins);
+out:
btrfs_free_path(path);
+ return ret;
+}
+
+/* snapshot-aware defrag */
+struct sa_defrag_extent_backref {
+ struct rb_node node;
+ struct old_sa_defrag_extent *old;
+ u64 root_id;
+ u64 inum;
+ u64 file_pos;
+ u64 extent_offset;
+ u64 num_bytes;
+ u64 generation;
+};
+
+struct old_sa_defrag_extent {
+ struct list_head list;
+ struct new_sa_defrag_extent *new;
+
+ u64 extent_offset;
+ u64 bytenr;
+ u64 offset;
+ u64 len;
+ int count;
+};
+
+struct new_sa_defrag_extent {
+ struct rb_root root;
+ struct list_head head;
+ struct btrfs_path *path;
+ struct inode *inode;
+ u64 file_pos;
+ u64 len;
+ u64 bytenr;
+ u64 disk_len;
+ u8 compress_type;
+};
+
+static int backref_comp(struct sa_defrag_extent_backref *b1,
+ struct sa_defrag_extent_backref *b2)
+{
+ if (b1->root_id < b2->root_id)
+ return -1;
+ else if (b1->root_id > b2->root_id)
+ return 1;
+
+ if (b1->inum < b2->inum)
+ return -1;
+ else if (b1->inum > b2->inum)
+ return 1;
+
+ if (b1->file_pos < b2->file_pos)
+ return -1;
+ else if (b1->file_pos > b2->file_pos)
+ return 1;
+
+ /*
+ * [------------------------------] ===> (a range of space)
+ * |<--->| |<---->| =============> (fs/file tree A)
+ * |<---------------------------->| ===> (fs/file tree B)
+ *
+ * A range of space can refer to two file extents in one tree while
+ * refer to only one file extent in another tree.
+ *
+ * So we may process a disk offset more than once (two extents in A)
+ * and land on the same extent (one extent in B), then insert two
+ * identical backrefs (both referring to the extent in B).
+ */
return 0;
}
+static void backref_insert(struct rb_root *root,
+ struct sa_defrag_extent_backref *backref)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sa_defrag_extent_backref *entry;
+ int ret;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
+
+ ret = backref_comp(backref, entry);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(&backref->node, parent, p);
+ rb_insert_color(&backref->node, root);
+}
+
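
backref_comp() orders snapshot-aware-defrag backrefs lexicographically by (root_id, inum, file_pos); equal keys compare as 0, and backref_insert() sends ties to the right so duplicates (the two-extents-in-one-tree case described above) are simply kept. The same ordering exercised with qsort() in plain C instead of the kernel rb-tree; struct backref and backref_cmp here are illustrative, not the btrfs definitions:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct backref {
	uint64_t root_id;
	uint64_t inum;
	uint64_t file_pos;
};

/* Same lexicographic rule as backref_comp(): root_id, then inum, then file_pos. */
static int backref_cmp(const void *a, const void *b)
{
	const struct backref *b1 = a, *b2 = b;

	if (b1->root_id != b2->root_id)
		return b1->root_id < b2->root_id ? -1 : 1;
	if (b1->inum != b2->inum)
		return b1->inum < b2->inum ? -1 : 1;
	if (b1->file_pos != b2->file_pos)
		return b1->file_pos < b2->file_pos ? -1 : 1;
	return 0;	/* duplicates compare equal; the tree keeps both */
}

int main(void)
{
	struct backref refs[] = {
		{ 5, 260, 8192 }, { 5, 258, 0 }, { 7, 256, 0 }, { 5, 258, 0 },
	};
	size_t i, n = sizeof(refs) / sizeof(refs[0]);

	qsort(refs, n, sizeof(refs[0]), backref_cmp);
	for (i = 0; i < n; i++)
		printf("root %llu inode %llu pos %llu\n",
		       (unsigned long long)refs[i].root_id,
		       (unsigned long long)refs[i].inum,
		       (unsigned long long)refs[i].file_pos);
	return 0;
}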
/*
- * helper function for btrfs_finish_ordered_io, this
- * just reads in some of the csum leaves to prime them into ram
- * before we start the transaction. It limits the amount of btree
- * reads required while inside the transaction.
+ * Note the backref might have changed, and in this case we just return 0.
*/
+static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
+ void *ctx)
+{
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_fs_info *fs_info;
+ struct old_sa_defrag_extent *old = ctx;
+ struct new_sa_defrag_extent *new = old->new;
+ struct btrfs_path *path = new->path;
+ struct btrfs_key key;
+ struct btrfs_root *root;
+ struct sa_defrag_extent_backref *backref;
+ struct extent_buffer *leaf;
+ struct inode *inode = new->inode;
+ int slot;
+ int ret;
+ u64 extent_offset;
+ u64 num_bytes;
+
+ if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
+ inum == btrfs_ino(inode))
+ return 0;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ if (PTR_ERR(root) == -ENOENT)
+ return 0;
+ WARN_ON(1);
+ pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
+ inum, offset, root_id);
+ return PTR_ERR(root);
+ }
+
+ key.objectid = inum;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ if (offset > (u64)-1 << 32)
+ key.offset = 0;
+ else
+ key.offset = offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = 0;
+
+ while (1) {
+ cond_resched();
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ continue;
+ }
+
+ path->slots[0]++;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.objectid > inum)
+ goto out;
+
+ if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
+ continue;
+
+ /*
+ * 'offset' refers to the exact key.offset,
+ * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
+ * (key.offset - extent_offset).
+ */
+ if (key.offset != offset)
+ continue;
+
+ extent_offset = btrfs_file_extent_offset(leaf, extent);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+
+ if (extent_offset >= old->extent_offset + old->offset +
+ old->len || extent_offset + num_bytes <=
+ old->extent_offset + old->offset)
+ continue;
+ break;
+ }
+
+ backref = kmalloc(sizeof(*backref), GFP_NOFS);
+ if (!backref) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ backref->root_id = root_id;
+ backref->inum = inum;
+ backref->file_pos = offset;
+ backref->num_bytes = num_bytes;
+ backref->extent_offset = extent_offset;
+ backref->generation = btrfs_file_extent_generation(leaf, extent);
+ backref->old = old;
+ backref_insert(&new->root, backref);
+ old->count++;
+out:
+ btrfs_release_path(path);
+ WARN_ON(ret);
+ return ret;
+}
+
+static noinline bool record_extent_backrefs(struct btrfs_path *path,
+ struct new_sa_defrag_extent *new)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+ struct old_sa_defrag_extent *old, *tmp;
+ int ret;
+
+ new->path = path;
+
+ list_for_each_entry_safe(old, tmp, &new->head, list) {
+ ret = iterate_inodes_from_logical(old->bytenr +
+ old->extent_offset, fs_info,
+ path, record_one_backref,
+ old);
+ if (ret < 0 && ret != -ENOENT)
+ return false;
+
+ /* no backref to be processed for this extent */
+ if (!old->count) {
+ list_del(&old->list);
+ kfree(old);
+ }
+ }
+
+ if (list_empty(&new->head))
+ return false;
+
+ return true;
+}
+
+static int relink_is_mergable(struct extent_buffer *leaf,
+ struct btrfs_file_extent_item *fi,
+ struct new_sa_defrag_extent *new)
+{
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
+ return 0;
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
+ if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
+ return 0;
+
+ if (btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Note the backref might have changed, and in this case we just return 0.
+ */
+static noinline int relink_extent_backref(struct btrfs_path *path,
+ struct sa_defrag_extent_backref *prev,
+ struct sa_defrag_extent_backref *backref)
+{
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_file_extent_item *item;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct old_sa_defrag_extent *old = backref->old;
+ struct new_sa_defrag_extent *new = old->new;
+ struct inode *src_inode = new->inode;
+ struct inode *inode;
+ struct extent_state *cached = NULL;
+ int ret = 0;
+ u64 start;
+ u64 len;
+ u64 lock_start;
+ u64 lock_end;
+ bool merge = false;
+ int index;
+
+ if (prev && prev->root_id == backref->root_id &&
+ prev->inum == backref->inum &&
+ prev->file_pos + prev->num_bytes == backref->file_pos)
+ merge = true;
+
+ /* step 1: get root */
+ key.objectid = backref->root_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ fs_info = BTRFS_I(src_inode)->root->fs_info;
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ if (PTR_ERR(root) == -ENOENT)
+ return 0;
+ return PTR_ERR(root);
+ }
+
+ if (btrfs_root_readonly(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
+ /* step 2: get inode */
+ key.objectid = backref->inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ if (IS_ERR(inode)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ /* step 3: relink backref */
+ lock_start = backref->file_pos;
+ lock_end = backref->file_pos + backref->num_bytes - 1;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+ 0, &cached);
+
+ ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ goto out_unlock;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ key.objectid = backref->inum;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = backref->file_pos;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out_free_path;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out_free_path;
+ }
+
+ extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_generation(path->nodes[0], extent) !=
+ backref->generation)
+ goto out_free_path;
+
+ btrfs_release_path(path);
+
+ start = backref->file_pos;
+ if (backref->extent_offset < old->extent_offset + old->offset)
+ start += old->extent_offset + old->offset -
+ backref->extent_offset;
+
+ len = min(backref->extent_offset + backref->num_bytes,
+ old->extent_offset + old->offset + old->len);
+ len -= max(backref->extent_offset, old->extent_offset + old->offset);
+
+ ret = btrfs_drop_extents(trans, root, inode, start,
+ start + len, 1);
+ if (ret)
+ goto out_free_path;
+again:
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ path->leave_spinning = 1;
+ if (merge) {
+ struct btrfs_file_extent_item *fi;
+ u64 extent_len;
+ struct btrfs_key found_key;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out_free_path;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+
+ if (extent_len + found_key.offset == start &&
+ relink_is_mergable(leaf, fi, new)) {
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_len + len);
+ btrfs_mark_buffer_dirty(leaf);
+ inode_add_bytes(inode, len);
+
+ ret = 1;
+ goto out_free_path;
+ } else {
+ merge = false;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*extent));
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_free_path;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
+ btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
+ btrfs_set_file_extent_num_bytes(leaf, item, len);
+ btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
+ btrfs_set_file_extent_generation(leaf, item, trans->transid);
+ btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_compression(leaf, item, new->compress_type);
+ btrfs_set_file_extent_encryption(leaf, item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, item, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+ inode_add_bytes(inode, len);
+ btrfs_release_path(path);
+
+ ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
+ new->disk_len, 0,
+ backref->root_id, backref->inum,
+ new->file_pos, 0); /* start - extent_offset */
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_free_path;
+ }
+
+ ret = 1;
+out_free_path:
+ btrfs_release_path(path);
+ path->leave_spinning = 0;
+ btrfs_end_transaction(trans, root);
+out_unlock:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+ &cached, GFP_NOFS);
+ iput(inode);
+ return ret;
+}
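The start/len computation in step 3 above is a range intersection: both the backref (extent_offset .. extent_offset + num_bytes) and the defragged portion (old->extent_offset + old->offset .. + old->len) are byte ranges relative to the old disk extent, len is the overlap of the two, and start maps the overlap's beginning back into the backref owner's file position. A minimal standalone sketch of that min/max idiom, illustrative only and not part of this patch (hypothetical names):

#include <linux/types.h>

/*
 * Intersection of the half-open ranges [a_start, a_end) and
 * [b_start, b_end); returns 0 when they do not overlap.
 */
static inline u64 range_overlap(u64 a_start, u64 a_end,
				u64 b_start, u64 b_end)
{
	u64 lo = a_start > b_start ? a_start : b_start;	/* max of starts */
	u64 hi = a_end < b_end ? a_end : b_end;		/* min of ends */

	return hi > lo ? hi - lo : 0;
}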
+
+static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
+{
+ struct old_sa_defrag_extent *old, *tmp;
+
+ if (!new)
+ return;
+
+ list_for_each_entry_safe(old, tmp, &new->head, list) {
+ list_del(&old->list);
+ kfree(old);
+ }
+ kfree(new);
+}
+
+static void relink_file_extents(struct new_sa_defrag_extent *new)
+{
+ struct btrfs_path *path;
+ struct sa_defrag_extent_backref *backref;
+ struct sa_defrag_extent_backref *prev = NULL;
+ struct inode *inode;
+ struct btrfs_root *root;
+ struct rb_node *node;
+ int ret;
+
+ inode = new->inode;
+ root = BTRFS_I(inode)->root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return;
+
+ if (!record_extent_backrefs(path, new)) {
+ btrfs_free_path(path);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ while (1) {
+ node = rb_first(&new->root);
+ if (!node)
+ break;
+ rb_erase(node, &new->root);
+
+ backref = rb_entry(node, struct sa_defrag_extent_backref, node);
+
+ ret = relink_extent_backref(path, prev, backref);
+ WARN_ON(ret < 0);
+
+ kfree(prev);
+
+ if (ret == 1)
+ prev = backref;
+ else
+ prev = NULL;
+ cond_resched();
+ }
+ kfree(prev);
+
+ btrfs_free_path(path);
+out:
+ free_sa_defrag_extent(new);
+
+ atomic_dec(&root->fs_info->defrag_running);
+ wake_up(&root->fs_info->transaction_wait);
+}
+
+static struct new_sa_defrag_extent *
+record_old_file_extents(struct inode *inode,
+ struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct old_sa_defrag_extent *old;
+ struct new_sa_defrag_extent *new;
+ int ret;
+
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new)
+ return NULL;
+
+ new->inode = inode;
+ new->file_pos = ordered->file_offset;
+ new->len = ordered->len;
+ new->bytenr = ordered->start;
+ new->disk_len = ordered->disk_len;
+ new->compress_type = ordered->compress_type;
+ new->root = RB_ROOT;
+ INIT_LIST_HEAD(&new->head);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out_kfree;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = new->file_pos;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out_free_path;
+ if (ret > 0 && path->slots[0] > 0)
+ path->slots[0]--;
+
+ /* find out all the old extents for the file range */
+ while (1) {
+ struct btrfs_file_extent_item *extent;
+ struct extent_buffer *l;
+ int slot;
+ u64 num_bytes;
+ u64 offset;
+ u64 end;
+ u64 disk_bytenr;
+ u64 extent_offset;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out_free_path;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid != btrfs_ino(inode))
+ break;
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+ if (key.offset >= new->file_pos + new->len)
+ break;
+
+ extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
+
+ num_bytes = btrfs_file_extent_num_bytes(l, extent);
+ if (key.offset + num_bytes < new->file_pos)
+ goto next;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
+ if (!disk_bytenr)
+ goto next;
+
+ extent_offset = btrfs_file_extent_offset(l, extent);
+
+ old = kmalloc(sizeof(*old), GFP_NOFS);
+ if (!old)
+ goto out_free_path;
+
+ offset = max(new->file_pos, key.offset);
+ end = min(new->file_pos + new->len, key.offset + num_bytes);
+
+ old->bytenr = disk_bytenr;
+ old->extent_offset = extent_offset;
+ old->offset = offset - key.offset;
+ old->len = end - offset;
+ old->new = new;
+ old->count = 0;
+ list_add_tail(&old->list, &new->head);
+next:
+ path->slots[0]++;
+ cond_resched();
+ }
+
+ btrfs_free_path(path);
+ atomic_inc(&root->fs_info->defrag_running);
+
+ return new;
+
+out_free_path:
+ btrfs_free_path(path);
+out_kfree:
+ free_sa_defrag_extent(new);
+ return NULL;
+}
+
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, start);
+ ASSERT(cache);
+
+ spin_lock(&cache->lock);
+ cache->delalloc_bytes -= len;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+}
+
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
+ struct inode *inode = ordered_extent->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
+ struct new_sa_defrag_extent *new = NULL;
int compress_type = 0;
- int ret;
- bool nolock = false;
+ int ret = 0;
+ u64 logical_len = ordered_extent->len;
+ bool nolock;
+ bool truncated = false;
- ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
- end - start + 1);
- if (!ret)
- return 0;
- BUG_ON(!ordered_extent);
+ nolock = btrfs_is_free_space_inode(inode);
- nolock = (root == root->fs_info->tree_root);
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
+ truncated = true;
+ logical_len = ordered_extent->truncated_len;
+ /* Truncated the entire extent, don't bother adding */
+ if (!logical_len)
+ goto out;
+ }
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
- BUG_ON(!list_empty(&ordered_extent->list));
- ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (!ret) {
- if (nolock)
- trans = btrfs_join_transaction_nolock(root, 1);
- else
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+ btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root);
+ else
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
}
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
goto out;
}
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- 0, &cached_state, GFP_NOFS);
+ 0, &cached_state);
+
+ ret = test_range_bit(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ EXTENT_DEFRAG, 1, cached_state);
+ if (ret) {
+ u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+ if (0 && last_snapshot >= BTRFS_I(inode)->generation)
+ /* the inode is shared */
+ new = record_old_file_extents(inode, ordered_extent);
+
+ clear_extent_bit(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+ }
if (nolock)
- trans = btrfs_join_transaction_nolock(root, 1);
+ trans = btrfs_join_transaction_nolock(root);
else
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out_unlock;
+ }
+
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -1737,41 +2703,90 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
- ordered_extent->len);
- BUG_ON(ret);
+ logical_len);
} else {
BUG_ON(root == root->fs_info->tree_root);
ret = insert_reserved_file_extent(trans, inode,
ordered_extent->file_offset,
ordered_extent->start,
ordered_extent->disk_len,
- ordered_extent->len,
- ordered_extent->len,
+ logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
- unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered_extent->file_offset,
- ordered_extent->len);
- BUG_ON(ret);
+ if (!ret)
+ btrfs_release_delalloc_bytes(root,
+ ordered_extent->start,
+ ordered_extent->disk_len);
+ }
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered_extent->file_offset, ordered_extent->len,
+ trans->transid);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
}
- unlock_extent_cached(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1, &cached_state, GFP_NOFS);
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) { /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
+ }
+ ret = 0;
+out_unlock:
+ unlock_extent_cached(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
- if (nolock) {
- if (trans)
- btrfs_end_transaction_nolock(trans, root);
- } else {
+ if (root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
- if (trans)
- btrfs_end_transaction(trans, root);
+ if (trans)
+ btrfs_end_transaction(trans, root);
+
+ if (ret || truncated) {
+ u64 start, end;
+
+ if (truncated)
+ start = ordered_extent->file_offset + logical_len;
+ else
+ start = ordered_extent->file_offset;
+ end = ordered_extent->file_offset + ordered_extent->len - 1;
+ clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
+
+ /* Drop the cache for the part of the extent we didn't write. */
+ btrfs_drop_extent_cache(inode, start, end, 0);
+
+ /*
+ * If the ordered extent had an IOERR or something else went
+ * wrong we need to return the space for this ordered extent
+ * back to the allocator. We only free the extent in the
+ * truncated case if we didn't write out the extent at all.
+ */
+ if ((ret || !logical_len) &&
+ !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+ btrfs_free_reserved_extent(root, ordered_extent->start,
+ ordered_extent->disk_len, 1);
+ }
+
+
+ /*
+ * This needs to be done to make sure anybody waiting knows we are done
+ * updating everything for this ordered extent.
+ */
+ btrfs_remove_ordered_extent(inode, ordered_extent);
+
+ /* for snapshot-aware defrag */
+ if (new) {
+ if (ret) {
+ free_sa_defrag_extent(new);
+ atomic_dec(&root->fs_info->defrag_running);
+ } else {
+ relink_file_extents(new);
+ }
}
/* once for us */
@@ -1779,176 +2794,60 @@ out:
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
- return 0;
+ return ret;
}
-static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
- struct extent_state *state, int uptodate)
+static void finish_ordered_fn(struct btrfs_work *work)
{
- ClearPagePrivate2(page);
- return btrfs_finish_ordered_io(page->mapping->host, start, end);
+ struct btrfs_ordered_extent *ordered_extent;
+ ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+ btrfs_finish_ordered_io(ordered_extent);
}
-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
- struct page *page;
- u64 start;
- u64 len;
- u64 logical;
- unsigned long bio_flags;
- int last_mirror;
-};
-
-static int btrfs_io_failed_hook(struct bio *failed_bio,
- struct page *page, u64 start, u64 end,
- struct extent_state *state)
+static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state, int uptodate)
{
- struct io_failure_record *failrec = NULL;
- u64 private;
- struct extent_map *em;
struct inode *inode = page->mapping->host;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct bio *bio;
- int num_copies;
- int ret;
- int rw;
- u64 logical;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered_extent = NULL;
+ struct btrfs_workqueue *workers;
- ret = get_state_private(failure_tree, start, &private);
- if (ret) {
- failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
- if (!failrec)
- return -ENOMEM;
- failrec->start = start;
- failrec->len = end - start + 1;
- failrec->last_mirror = 0;
- failrec->bio_flags = 0;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, failrec->len);
- if (em->start > start || em->start + em->len < start) {
- free_extent_map(em);
- em = NULL;
- }
- read_unlock(&em_tree->lock);
+ trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
- if (!em || IS_ERR(em)) {
- kfree(failrec);
- return -EIO;
- }
- logical = start - em->start;
- logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags,
- em->compress_type);
- }
- failrec->logical = logical;
- free_extent_map(em);
- set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
- EXTENT_DIRTY, GFP_NOFS);
- set_state_private(failure_tree, start,
- (u64)(unsigned long)failrec);
- } else {
- failrec = (struct io_failure_record *)(unsigned long)private;
- }
- num_copies = btrfs_num_copies(
- &BTRFS_I(inode)->root->fs_info->mapping_tree,
- failrec->logical, failrec->len);
- failrec->last_mirror++;
- if (!state) {
- spin_lock(&BTRFS_I(inode)->io_tree.lock);
- state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
- failrec->start,
- EXTENT_LOCKED);
- if (state && state->start != failrec->start)
- state = NULL;
- spin_unlock(&BTRFS_I(inode)->io_tree.lock);
- }
- if (!state || failrec->last_mirror > num_copies) {
- set_state_private(failure_tree, failrec->start, 0);
- clear_extent_bits(failure_tree, failrec->start,
- failrec->start + failrec->len - 1,
- EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
- kfree(failrec);
- return -EIO;
- }
- bio = bio_alloc(GFP_NOFS, 1);
- bio->bi_private = state;
- bio->bi_end_io = failed_bio->bi_end_io;
- bio->bi_sector = failrec->logical >> 9;
- bio->bi_bdev = failed_bio->bi_bdev;
- bio->bi_size = 0;
-
- bio_add_page(bio, page, failrec->len, start - page_offset(page));
- if (failed_bio->bi_rw & REQ_WRITE)
- rw = WRITE;
- else
- rw = READ;
+ ClearPagePrivate2(page);
+ if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+ end - start + 1, uptodate))
+ return 0;
- BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
- failrec->last_mirror,
- failrec->bio_flags, 0);
- return 0;
-}
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
-/*
- * each time an IO finishes, we do a fast check in the IO failure tree
- * to see if we need to process or clean up an io_failure_record
- */
-static int btrfs_clean_io_failures(struct inode *inode, u64 start)
-{
- u64 private;
- u64 private_failure;
- struct io_failure_record *failure;
- int ret;
+ if (btrfs_is_free_space_inode(inode))
+ workers = root->fs_info->endio_freespace_worker;
+ else
+ workers = root->fs_info->endio_write_workers;
+ btrfs_queue_work(workers, &ordered_extent->work);
- private = 0;
- if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
- (u64)-1, 1, EXTENT_DIRTY, 0)) {
- ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
- start, &private_failure);
- if (ret == 0) {
- failure = (struct io_failure_record *)(unsigned long)
- private_failure;
- set_state_private(&BTRFS_I(inode)->io_failure_tree,
- failure->start, 0);
- clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
- failure->start,
- failure->start + failure->len - 1,
- EXTENT_DIRTY | EXTENT_LOCKED,
- GFP_NOFS);
- kfree(failure);
- }
- }
return 0;
}
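Taken together, btrfs_writepage_end_io_hook() and finish_ordered_fn() are an instance of the usual kernel deferral idiom: the bio completion path only queues a work item embedded in the object, and the worker recovers the object with container_of() and does the sleeping, transactional part in process context. A minimal sketch of that idiom using the stock workqueue API (btrfs wraps it in its own btrfs_workqueue; the names below are hypothetical, not part of this patch):

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_ordered_item {
	u64 file_offset;
	struct work_struct work;	/* embedded work item */
};

static void my_finish_fn(struct work_struct *work)
{
	/* recover the containing object from its embedded work_struct */
	struct my_ordered_item *item =
		container_of(work, struct my_ordered_item, work);

	/* ... heavy, possibly sleeping completion work happens here ... */
	kfree(item);
}

static void my_end_io(struct my_ordered_item *item)
{
	/* called from bio completion context: defer instead of sleeping */
	INIT_WORK(&item->work, my_finish_fn);
	queue_work(system_wq, &item->work);
}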
/*
* when reads are done, we need to check csums to verify the data is correct
- * if there's a match, we allow the bio to finish. If not, we go through
- * the io_failure_record routines to find good copies
+ * if there's a match, we allow the bio to finish. If not, the code in
+ * extent_io.c will try to find good copies for us.
*/
-static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
- struct extent_state *state)
+static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+ u64 phy_offset, struct page *page,
+ u64 start, u64 end, int mirror)
{
- size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+ size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
char *kaddr;
- u64 private = ~(u32)0;
- int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u32 csum_expected;
u32 csum = ~(u32)0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (PageChecked(page)) {
ClearPageChecked(page);
@@ -1956,7 +2855,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
}
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
- return 0;
+ goto good;
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -1965,40 +2864,27 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
return 0;
}
- if (state && state->start == start) {
- private = state->private;
- ret = 0;
- } else {
- ret = get_state_private(io_tree, start, &private);
- }
- kaddr = kmap_atomic(page, KM_USER0);
- if (ret)
- goto zeroit;
+ phy_offset >>= inode->i_sb->s_blocksize_bits;
+ csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
- csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
+ kaddr = kmap_atomic(page);
+ csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
btrfs_csum_final(csum, (char *)&csum);
- if (csum != private)
+ if (csum != csum_expected)
goto zeroit;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
good:
- /* if the io failure tree for this inode is non-empty,
- * check to see if we've recovered from a failed IO
- */
- btrfs_clean_io_failures(inode, start);
return 0;
zeroit:
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
- "private %llu\n", page->mapping->host->i_ino,
- (unsigned long long)start, csum,
- (unsigned long long)private);
- }
+ if (__ratelimit(&_rs))
+ btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_ino(page->mapping->host), start, csum, csum_expected);
memset(kaddr + offset, 1, end - start + 1);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- if (private == 0)
+ kunmap_atomic(kaddr);
+ if (csum_expected == 0)
return 0;
return -EIO;
}
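The expected checksum above is looked up by turning phy_offset, the byte offset of this block within the bio's data, into an array index: the checksums for a bio are stored back to back in io_bio->csum, one u32 per filesystem block. A tiny standalone sketch of that indexing, illustrative only (hypothetical names, not part of this patch):

#include <linux/types.h>

/* one u32 checksum per block, stored consecutively for the whole bio */
static inline u32 csum_for_offset(const u32 *csums, u64 phy_offset,
				  unsigned int blocksize_bits)
{
	return csums[phy_offset >> blocksize_bits];
}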
@@ -2008,6 +2894,8 @@ struct delayed_iput {
struct inode *inode;
};
+/* JDM: If this is fs-wide, why can't we add a pointer to
+ * btrfs_inode instead and avoid the allocation? */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -2037,7 +2925,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
if (empty)
return;
- down_read(&root->fs_info->cleanup_work_sem);
spin_lock(&fs_info->delayed_iput_lock);
list_splice_init(&fs_info->delayed_iputs, &list);
spin_unlock(&fs_info->delayed_iput_lock);
@@ -2048,123 +2935,52 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
iput(delayed->inode);
kfree(delayed);
}
- up_read(&root->fs_info->cleanup_work_sem);
}
/*
- * calculate extra metadata reservation when snapshotting a subvolume
- * contains orphan files.
+ * This is called in transaction commit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
*/
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
- u64 *bytes_to_reserve)
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
- struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
- u64 num_bytes;
- int index;
+ int ret;
- root = pending->root;
- if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ if (atomic_read(&root->orphan_inodes) ||
+ root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
- block_rsv = root->orphan_block_rsv;
-
- /* orphan block reservation for the snapshot */
- num_bytes = block_rsv->size;
-
- /*
- * after the snapshot is created, COWing tree blocks may use more
- * space than it frees. So we should make sure there is enough
- * reserved space.
- */
- index = trans->transid & 0x1;
- if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
- num_bytes += block_rsv->size -
- (block_rsv->reserved + block_rsv->freed[index]);
+ spin_lock(&root->orphan_lock);
+ if (atomic_read(&root->orphan_inodes)) {
+ spin_unlock(&root->orphan_lock);
+ return;
}
- *bytes_to_reserve += num_bytes;
-}
-
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending)
-{
- struct btrfs_root *root = pending->root;
- struct btrfs_root *snap = pending->snap;
- struct btrfs_block_rsv *block_rsv;
- u64 num_bytes;
- int index;
- int ret;
-
- if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
+ spin_unlock(&root->orphan_lock);
return;
-
- /* refill source subvolume's orphan block reservation */
- block_rsv = root->orphan_block_rsv;
- index = trans->transid & 0x1;
- if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
- num_bytes = block_rsv->size -
- (block_rsv->reserved + block_rsv->freed[index]);
- ret = btrfs_block_rsv_migrate(&pending->block_rsv,
- root->orphan_block_rsv,
- num_bytes);
- BUG_ON(ret);
}
- /* setup orphan block reservation for the snapshot */
- block_rsv = btrfs_alloc_block_rsv(snap);
- BUG_ON(!block_rsv);
-
- btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
- snap->orphan_block_rsv = block_rsv;
-
- num_bytes = root->orphan_block_rsv->size;
- ret = btrfs_block_rsv_migrate(&pending->block_rsv,
- block_rsv, num_bytes);
- BUG_ON(ret);
-
-#if 0
- /* insert orphan item for the snapshot */
- WARN_ON(!root->orphan_item_inserted);
- ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
- snap->root_key.objectid);
- BUG_ON(ret);
- snap->orphan_item_inserted = 1;
-#endif
-}
-
-enum btrfs_orphan_cleanup_state {
- ORPHAN_CLEANUP_STARTED = 1,
- ORPHAN_CLEANUP_DONE = 2,
-};
-
-/*
- * This is called in transaction commit time. If there are no orphan
- * files in the subvolume, it removes orphan item and frees block_rsv
- * structure.
- */
-void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- int ret;
-
- if (!list_empty(&root->orphan_list) ||
- root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
- return;
+ block_rsv = root->orphan_block_rsv;
+ root->orphan_block_rsv = NULL;
+ spin_unlock(&root->orphan_lock);
- if (root->orphan_item_inserted &&
+ if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
btrfs_root_refs(&root->root_item) > 0) {
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
- BUG_ON(ret);
- root->orphan_item_inserted = 0;
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ else
+ clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ &root->state);
}
- if (root->orphan_block_rsv) {
- WARN_ON(root->orphan_block_rsv->size > 0);
- btrfs_free_block_rsv(root, root->orphan_block_rsv);
- root->orphan_block_rsv = NULL;
+ if (block_rsv) {
+ WARN_ON(block_rsv->size > 0);
+ btrfs_free_block_rsv(root, block_rsv);
}
}
@@ -2184,8 +3000,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
int ret;
if (!root->orphan_block_rsv) {
- block_rsv = btrfs_alloc_block_rsv(root);
- BUG_ON(!block_rsv);
+ block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!block_rsv)
+ return -ENOMEM;
}
spin_lock(&root->orphan_lock);
@@ -2196,8 +3013,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
block_rsv = NULL;
}
- if (list_empty(&BTRFS_I(inode)->i_orphan)) {
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags)) {
#if 0
/*
* For proper ENOSPC handling, we should do orphan
@@ -2210,36 +3027,48 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
insert = 1;
#endif
insert = 1;
- } else {
- WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
+ atomic_inc(&root->orphan_inodes);
}
- if (!BTRFS_I(inode)->orphan_meta_reserved) {
- BTRFS_I(inode)->orphan_meta_reserved = 1;
+ if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
reserve = 1;
- }
spin_unlock(&root->orphan_lock);
- if (block_rsv)
- btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
-
/* grab metadata reservation from transaction handle */
if (reserve) {
ret = btrfs_orphan_reserve_metadata(trans, inode);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
}
/* insert an orphan item to track this unlinked/truncated file */
if (insert >= 1) {
- ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
- BUG_ON(ret);
+ ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
+ if (ret) {
+ atomic_dec(&root->orphan_inodes);
+ if (reserve) {
+ clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags);
+ btrfs_orphan_release_metadata(inode);
+ }
+ if (ret != -EEXIST) {
+ clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ }
+ ret = 0;
}
/* insert an orphan item to track subvolume contains orphan files */
if (insert >= 2) {
ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
- BUG_ON(ret);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
}
return 0;
}
@@ -2248,7 +3077,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
* We have done the truncate/delete so we can go ahead and remove the orphan
* item for this particular inode.
*/
-int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
+ struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int delete_item = 0;
@@ -2256,46 +3086,50 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
int ret = 0;
spin_lock(&root->orphan_lock);
- if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- list_del_init(&BTRFS_I(inode)->i_orphan);
+ if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags))
delete_item = 1;
- }
- if (BTRFS_I(inode)->orphan_meta_reserved) {
- BTRFS_I(inode)->orphan_meta_reserved = 0;
+ if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
release_rsv = 1;
- }
spin_unlock(&root->orphan_lock);
- if (trans && delete_item) {
- ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
- BUG_ON(ret);
+ if (delete_item) {
+ atomic_dec(&root->orphan_inodes);
+ if (trans)
+ ret = btrfs_del_orphan_item(trans, root,
+ btrfs_ino(inode));
}
if (release_rsv)
btrfs_orphan_release_metadata(inode);
- return 0;
+ return ret;
}
/*
* this cleans up any orphans that may be left on the list from the last use
* of this root.
*/
-void btrfs_orphan_cleanup(struct btrfs_root *root)
+int btrfs_orphan_cleanup(struct btrfs_root *root)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
struct inode *inode;
+ u64 last_objectid = 0;
int ret = 0, nr_unlink = 0, nr_truncate = 0;
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
- return;
+ return 0;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
path->reada = -1;
key.objectid = BTRFS_ORPHAN_OBJECTID;
@@ -2304,18 +3138,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- printk(KERN_ERR "Error searching slot for orphan: %d"
- "\n", ret);
- break;
- }
+ if (ret < 0)
+ goto out;
/*
* if ret == 0 means we found what we were searching for, which
- * is weird, but possible, so only screw with path if we didnt
+ * is weird, but possible, so only screw with path if we didn't
* find the key and see if we have stuff that matches
*/
if (ret > 0) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -2332,54 +3164,127 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
break;
/* release the path since we're done with it */
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
/*
* this is where we are basically btrfs_lookup, without the
* crossing root thing. we store the inode number in the
* offset of the orphan item.
*/
+
+ if (found_key.offset == last_objectid) {
+ btrfs_err(root->fs_info,
+ "Error removing orphan entry, stopping orphan cleanup");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ last_objectid = found_key.offset;
+
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
- BUG_ON(IS_ERR(inode));
+ ret = PTR_ERR_OR_ZERO(inode);
+ if (ret && ret != -ESTALE)
+ goto out;
- /*
- * add this inode to the orphan list so btrfs_orphan_del does
- * the proper thing when we hit it
- */
- spin_lock(&root->orphan_lock);
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
- spin_unlock(&root->orphan_lock);
+ if (ret == -ESTALE && root == root->fs_info->tree_root) {
+ struct btrfs_root *dead_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int is_dead_root = 0;
+ /*
+ * this is an orphan in the tree root. Currently these
+ * could come from 2 sources:
+ * a) a snapshot deletion in progress
+ * b) a free space cache inode
+ * We need to distinguish those two, as the snapshot
+ * orphan must not get deleted.
+ * find_dead_roots already ran before us, so if this
+ * is a snapshot deletion, we should find the root
+ * in the dead_roots list
+ */
+ spin_lock(&fs_info->trans_lock);
+ list_for_each_entry(dead_root, &fs_info->dead_roots,
+ root_list) {
+ if (dead_root->root_key.objectid ==
+ found_key.objectid) {
+ is_dead_root = 1;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (is_dead_root) {
+ /* prevent this orphan from being found again */
+ key.offset = found_key.objectid - 1;
+ continue;
+ }
+ }
/*
- * if this is a bad inode, means we actually succeeded in
- * removing the inode, but not the orphan record, which means
- * we need to manually delete the orphan since iput will just
- * do a destroy_inode
+ * Inode is already gone but the orphan item is still there,
+ * kill the orphan item.
*/
- if (is_bad_inode(inode)) {
- trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
- btrfs_orphan_del(trans, inode);
+ if (ret == -ESTALE) {
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ btrfs_debug(root->fs_info, "auto deleting %Lu",
+ found_key.objectid);
+ ret = btrfs_del_orphan_item(trans, root,
+ found_key.objectid);
btrfs_end_transaction(trans, root);
- iput(inode);
+ if (ret)
+ goto out;
continue;
}
+ /*
+ * add this inode to the orphan list so btrfs_orphan_del does
+ * the proper thing when we hit it
+ */
+ set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
+ atomic_inc(&root->orphan_inodes);
+
/* if we have links, this was a truncate, lets do that */
if (inode->i_nlink) {
+ if (WARN_ON(!S_ISREG(inode->i_mode))) {
+ iput(inode);
+ continue;
+ }
nr_truncate++;
- btrfs_truncate(inode);
+
+ /* 1 for the orphan item deletion. */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ iput(inode);
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret) {
+ iput(inode);
+ goto out;
+ }
+
+ ret = btrfs_truncate(inode);
+ if (ret)
+ btrfs_orphan_del(NULL, inode);
} else {
nr_unlink++;
}
/* this will do delete_inode and everything for us */
iput(inode);
+ if (ret)
+ goto out;
}
- btrfs_free_path(path);
+ /* release the path since we're done with it */
+ btrfs_release_path(path);
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
@@ -2387,16 +3292,24 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
btrfs_block_rsv_release(root, root->orphan_block_rsv,
(u64)-1);
- if (root->orphan_block_rsv || root->orphan_item_inserted) {
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- btrfs_end_transaction(trans, root);
+ if (root->orphan_block_rsv ||
+ test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
+ trans = btrfs_join_transaction(root);
+ if (!IS_ERR(trans))
+ btrfs_end_transaction(trans, root);
}
if (nr_unlink)
- printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+ btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
if (nr_truncate)
- printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+ btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
+
+out:
+ if (ret)
+ btrfs_crit(root->fs_info,
+ "could not do orphan cleanup %d", ret);
+ btrfs_free_path(path);
+ return ret;
}
/*
@@ -2406,13 +3319,24 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
* slot is the slot the inode is in, objectid is the objectid of the inode
*/
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
- int slot, u64 objectid)
+ int slot, u64 objectid,
+ int *first_xattr_slot)
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
+ static u64 xattr_access = 0;
+ static u64 xattr_default = 0;
int scanned = 0;
+ if (!xattr_access) {
+ xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+ strlen(POSIX_ACL_XATTR_ACCESS));
+ xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+ strlen(POSIX_ACL_XATTR_DEFAULT));
+ }
+
slot++;
+ *first_xattr_slot = -1;
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -2421,8 +3345,13 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
return 0;
/* we found an xattr, assume we've got an acl */
- if (found_key.type == BTRFS_XATTR_ITEM_KEY)
- return 1;
+ if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (*first_xattr_slot == -1)
+ *first_xattr_slot = slot;
+ if (found_key.offset == xattr_access ||
+ found_key.offset == xattr_default)
+ return 1;
+ }
/*
* we found a key greater than an xattr key, there can't
@@ -2447,6 +3376,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
* something larger than an xattr. We have to assume the inode
* has acls
*/
+ if (*first_xattr_slot == -1)
+ *first_xattr_slot = slot;
return 1;
}
@@ -2461,13 +3392,21 @@ static void btrfs_read_locked_inode(struct inode *inode)
struct btrfs_timespec *tspec;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key location;
+ unsigned long ptr;
int maybe_acls;
- u64 alloc_group_block;
u32 rdev;
int ret;
+ bool filled = false;
+ int first_xattr_slot;
+
+ ret = btrfs_fill_inode(inode, &rdev);
+ if (!ret)
+ filled = true;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ goto make_bad;
+
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -2475,13 +3414,16 @@ static void btrfs_read_locked_inode(struct inode *inode)
goto make_bad;
leaf = path->nodes[0];
+
+ if (filled)
+ goto cache_index;
+
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
-
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
- inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
- inode->i_uid = btrfs_inode_uid(leaf, inode_item);
- inode->i_gid = btrfs_inode_gid(leaf, inode_item);
+ set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
+ i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
+ i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
tspec = btrfs_inode_atime(inode_item);
@@ -2498,7 +3440,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
- BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+ BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+ /*
+ * If we were modified in the current generation and evicted from memory
+ * and then re-read we need to do a full sync since we don't have any
+ * idea about which extents were modified before we were evicted from
+ * cache.
+ */
+ if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ inode->i_version = btrfs_inode_sequence(leaf, inode_item);
inode->i_generation = BTRFS_I(inode)->generation;
inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2506,21 +3460,50 @@ static void btrfs_read_locked_inode(struct inode *inode)
BTRFS_I(inode)->index_cnt = (u64)-1;
BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
- alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+cache_index:
+ path->slots[0]++;
+ if (inode->i_nlink != 1 ||
+ path->slots[0] >= btrfs_header_nritems(leaf))
+ goto cache_acl;
+
+ btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
+ if (location.objectid != btrfs_ino(inode))
+ goto cache_acl;
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ if (location.type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ptr;
+ BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+ } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+ extref = (struct btrfs_inode_extref *)ptr;
+ BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
+ extref);
+ }
+cache_acl:
/*
* try to precache a NULL acl entry for files that don't have
* any xattrs or acls
*/
- maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
+ maybe_acls = acls_after_inode_item(leaf, path->slots[0],
+ btrfs_ino(inode), &first_xattr_slot);
+ if (first_xattr_slot != -1) {
+ path->slots[0] = first_xattr_slot;
+ ret = btrfs_load_inode_props(inode, path);
+ if (ret)
+ btrfs_err(root->fs_info,
+ "error loading props for ino %llu (root %llu): %d",
+ btrfs_ino(inode),
+ root->root_key.objectid, ret);
+ }
+ btrfs_free_path(path);
+
if (!maybe_acls)
cache_no_acl(inode);
- BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
- alloc_group_block, 0);
- btrfs_free_path(path);
- inode_item = NULL;
-
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
@@ -2563,40 +3546,47 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
- btrfs_set_inode_uid(leaf, item, inode->i_uid);
- btrfs_set_inode_gid(leaf, item, inode->i_gid);
- btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_inode_mode(leaf, item, inode->i_mode);
- btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+ btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+ btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
+ &token);
+ btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+ btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
- inode->i_atime.tv_nsec);
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_nsec, &token);
- btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
- inode->i_mtime.tv_nsec);
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_nsec, &token);
- btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_sec);
- btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
- inode->i_ctime.tv_nsec);
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_nsec, &token);
- btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
- btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
- btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
- btrfs_set_inode_transid(leaf, item, trans->transid);
- btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
- btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
- btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+ btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+ &token);
+ btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
+ &token);
+ btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+ btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+ btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+ btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}
/*
* copy everything in the in-memory inode into the btree.
*/
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode)
{
struct btrfs_inode_item *inode_item;
@@ -2605,20 +3595,21 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
int ret;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
path->leave_spinning = 1;
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(inode)->location, 1);
+ ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
+ 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
goto failed;
}
- btrfs_unlock_up_safe(path, 1);
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_inode_item);
+ struct btrfs_inode_item);
fill_inode_item(trans, leaf, inode_item, inode);
btrfs_mark_buffer_dirty(leaf);
@@ -2629,16 +3620,55 @@ failed:
return ret;
}
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ int ret;
+
+ /*
+ * If the inode is a free space inode, we can deadlock during commit
+ * if we put it into the delayed code.
+ *
+ * The data relocation inode should also be directly updated
+ * without delay
+ */
+ if (!btrfs_is_free_space_inode(inode)
+ && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ btrfs_update_root_times(trans, root);
+
+ ret = btrfs_delayed_update_inode(trans, root, inode);
+ if (!ret)
+ btrfs_set_inode_last_trans(trans, inode);
+ return ret;
+ }
+
+ return btrfs_update_inode_item(trans, root, inode);
+}
+
+noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ int ret;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret == -ENOSPC)
+ return btrfs_update_inode_item(trans, root, inode);
+ return ret;
+}
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
* also drops the back refs in the inode to the directory
*/
-int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *dir, struct inode *inode,
- const char *name, int name_len)
+static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len)
{
struct btrfs_path *path;
int ret = 0;
@@ -2646,6 +3676,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di;
struct btrfs_key key;
u64 index;
+ u64 ino = btrfs_ino(inode);
+ u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path();
if (!path) {
@@ -2654,7 +3686,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
path->leave_spinning = 1;
- di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -2669,266 +3701,154 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto err;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- ret = btrfs_del_inode_ref(trans, root, name, name_len,
- inode->i_ino,
- dir->i_ino, &index);
- if (ret) {
- printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
- "inode %lu parent %lu\n", name_len, name,
- inode->i_ino, dir->i_ino);
- goto err;
+ /*
+ * If we don't have the dir index cached, we have to find it by
+ * looking up the inode ref; and since that lookup hands us the
+ * inode ref anyway, we may as well remove it directly instead of
+ * doing a delayed deletion.
+ *
+ * But if we do have the dir index cached, there is no need to
+ * search for the inode ref. Since the inode ref sits close to the
+ * inode item, it is better to delay its deletion and do it when
+ * we update the inode item.
+ */
+ if (BTRFS_I(inode)->dir_index) {
+ ret = btrfs_delayed_delete_inode_ref(inode);
+ if (!ret) {
+ index = BTRFS_I(inode)->dir_index;
+ goto skip_backref;
+ }
}
- di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
- index, name, name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
+ dir_ino, &index);
+ if (ret) {
+ btrfs_info(root->fs_info,
+ "failed to delete reference to %.*s, inode %llu parent %llu",
+ name_len, name, ino, dir_ino);
+ btrfs_abort_transaction(trans, root, ret);
goto err;
}
- if (!di) {
- ret = -ENOENT;
+skip_backref:
+ ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto err;
}
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
- btrfs_release_path(root, path);
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
- inode, dir->i_ino);
- BUG_ON(ret != 0 && ret != -ENOENT);
+ inode, dir_ino);
+ if (ret != 0 && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto err;
+ }
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
dir, index);
if (ret == -ENOENT)
ret = 0;
+ else if (ret)
+ btrfs_abort_transaction(trans, root, ret);
err:
btrfs_free_path(path);
if (ret)
goto out;
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode_inc_iversion(inode);
+ inode_inc_iversion(dir);
inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- btrfs_update_inode(trans, root, dir);
- btrfs_drop_nlink(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
}
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
- struct btrfs_path *path)
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len)
{
- struct extent_buffer *eb;
- int level;
- u64 refs = 1;
-
- for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
- int ret;
-
- if (!path->nodes[level])
- break;
- eb = path->nodes[level];
- if (!btrfs_block_can_be_shared(root, eb))
- continue;
- ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
- &refs, NULL);
- if (refs > 1)
- return 1;
+ int ret;
+ ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ if (!ret) {
+ drop_nlink(inode);
+ ret = btrfs_update_inode(trans, root, inode);
}
- return 0;
+ return ret;
}
/*
* helper to start transaction for unlink and rmdir.
*
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
*/
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
- struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_path *path;
- struct btrfs_inode_ref *ref;
- struct btrfs_dir_item *di;
- struct inode *inode = dentry->d_inode;
- u64 index;
- int check_link = 1;
- int err = -ENOSPC;
int ret;
- trans = btrfs_start_transaction(root, 10);
+ /*
+ * 1 for the possible orphan item
+ * 1 for the dir item
+ * 1 for the dir index
+ * 1 for the inode ref
+ * 1 for the inode
+ */
+ trans = btrfs_start_transaction(root, 5);
if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
return trans;
- if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
- return ERR_PTR(-ENOSPC);
-
- /* check if there is someone else holds reference */
- if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
- return ERR_PTR(-ENOSPC);
+ if (PTR_ERR(trans) == -ENOSPC) {
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
- if (atomic_read(&inode->i_count) > 2)
- return ERR_PTR(-ENOSPC);
-
- if (xchg(&root->fs_info->enospc_unlink, 1))
- return ERR_PTR(-ENOSPC);
-
- path = btrfs_alloc_path();
- if (!path) {
- root->fs_info->enospc_unlink = 0;
- return ERR_PTR(-ENOMEM);
- }
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- root->fs_info->enospc_unlink = 0;
- return trans;
- }
-
- path->skip_locking = 1;
- path->search_commit_root = 1;
-
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(dir)->location, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret == 0) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- check_link = 0;
- }
- btrfs_release_path(root, path);
-
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(inode)->location, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret == 0) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- check_link = 0;
- }
- btrfs_release_path(root, path);
-
- if (ret == 0 && S_ISREG(inode->i_mode)) {
- ret = btrfs_lookup_file_extent(trans, root, path,
- inode->i_ino, (u64)-1, 0);
- if (ret < 0) {
- err = ret;
- goto out;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return trans;
+ ret = btrfs_cond_migrate_bytes(root->fs_info,
+ &root->fs_info->trans_block_rsv,
+ num_bytes, 5);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ERR_PTR(ret);
}
- BUG_ON(ret == 0);
- if (check_path_shared(root, path))
- goto out;
- btrfs_release_path(root, path);
- }
-
- if (!check_link) {
- err = 0;
- goto out;
- }
-
- di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
- dentry->d_name.name, dentry->d_name.len, 0);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto out;
- }
- if (di) {
- if (check_path_shared(root, path))
- goto out;
- } else {
- err = 0;
- goto out;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->bytes_reserved = num_bytes;
}
- btrfs_release_path(root, path);
-
- ref = btrfs_lookup_inode_ref(trans, root, path,
- dentry->d_name.name, dentry->d_name.len,
- inode->i_ino, dir->i_ino, 0);
- if (IS_ERR(ref)) {
- err = PTR_ERR(ref);
- goto out;
- }
- BUG_ON(!ref);
- if (check_path_shared(root, path))
- goto out;
- index = btrfs_inode_ref_index(path->nodes[0], ref);
- btrfs_release_path(root, path);
-
- di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
- dentry->d_name.name, dentry->d_name.len, 0);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto out;
- }
- BUG_ON(ret == -ENOENT);
- if (check_path_shared(root, path))
- goto out;
-
- err = 0;
-out:
- btrfs_free_path(path);
- if (err) {
- btrfs_end_transaction(trans, root);
- root->fs_info->enospc_unlink = 0;
- return ERR_PTR(err);
- }
-
- trans->block_rsv = &root->fs_info->global_block_rsv;
return trans;
}
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- if (trans->block_rsv == &root->fs_info->global_block_rsv) {
- BUG_ON(!root->fs_info->enospc_unlink);
- root->fs_info->enospc_unlink = 0;
- }
- btrfs_end_transaction_throttle(trans, root);
-}
-
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
struct inode *inode = dentry->d_inode;
int ret;
- unsigned long nr = 0;
- trans = __unlink_start_trans(dir, dentry);
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
-
btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
- BUG_ON(ret);
+ if (ret)
+ goto out;
if (inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, inode);
- BUG_ON(ret);
+ if (ret)
+ goto out;
}
- nr = trans->blocks_used;
- __unlink_end_trans(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+out:
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
return ret;
}
@@ -2943,55 +3863,73 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_key key;
u64 index;
int ret;
+ u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
- BUG_ON(!di || IS_ERR(di));
+ if (IS_ERR_OR_NULL(di)) {
+ if (!di)
+ ret = -ENOENT;
+ else
+ ret = PTR_ERR(di);
+ goto out;
+ }
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- BUG_ON(ret);
- btrfs_release_path(root, path);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
objectid, root->root_key.objectid,
- dir->i_ino, &index, name, name_len);
+ dir_ino, &index, name, name_len);
if (ret < 0) {
- BUG_ON(ret != -ENOENT);
- di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+ if (ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ di = btrfs_search_dir_index_item(root, path, dir_ino,
name, name_len);
- BUG_ON(!di || IS_ERR(di));
+ if (IS_ERR_OR_NULL(di)) {
+ if (!di)
+ ret = -ENOENT;
+ else
+ ret = PTR_ERR(di);
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
index = key.offset;
}
+ btrfs_release_path(path);
- di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
- index, name, name_len, -1);
- BUG_ON(!di || IS_ERR(di));
-
- leaf = path->nodes[0];
- btrfs_dir_item_key_to_cpu(leaf, di, &key);
- WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
- BUG_ON(ret);
- btrfs_release_path(root, path);
+ ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode_inc_iversion(dir);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- ret = btrfs_update_inode(trans, root, dir);
- BUG_ON(ret);
-
+ ret = btrfs_update_inode_fallback(trans, root, dir);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
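
The BUG_ON(!di || IS_ERR(di)) assertions in btrfs_unlink_subvol() become explicit error mapping: a NULL dir item is turned into -ENOENT, and an error-encoded pointer is decoded with PTR_ERR(). A simplified userspace re-implementation of the kernel's pointer-error convention (the real macros live in include/linux/err.h), shown only to illustrate what IS_ERR_OR_NULL() distinguishes:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095
#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static inline void *ERR_PTR(long error)          { return (void *)error; }
static inline long PTR_ERR(const void *ptr)      { return (long)ptr; }
static inline int IS_ERR(const void *ptr)        { return IS_ERR_VALUE((unsigned long)ptr); }
static inline int IS_ERR_OR_NULL(const void *ptr){ return !ptr || IS_ERR(ptr); }

int main(void)
{
        void *di = ERR_PTR(-EIO);       /* pretend the lookup failed with -EIO */
        long ret = 0;

        if (IS_ERR_OR_NULL(di))
                ret = !di ? -ENOENT : PTR_ERR(di);

        printf("ret = %ld\n", ret);     /* prints -5 (-EIO) */
        return 0;
}
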
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -3000,19 +3938,17 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
int err = 0;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
- unsigned long nr = 0;
- if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+ return -EPERM;
- trans = __unlink_start_trans(dir, dentry);
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
-
- if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
err = btrfs_unlink_subvol(trans, root, dir,
BTRFS_I(inode)->location.objectid,
dentry->d_name.name,
@@ -3030,185 +3966,12 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (!err)
btrfs_i_size_write(inode, 0);
out:
- nr = trans->blocks_used;
- __unlink_end_trans(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
return err;
}
-#if 0
-/*
- * when truncating bytes in a file, it is possible to avoid reading
- * the leaves that contain only checksum items. This can be the
- * majority of the IO required to delete a large file, but it must
- * be done carefully.
- *
- * The keys in the level just above the leaves are checked to make sure
- * the lowest key in a given leaf is a csum key, and starts at an offset
- * after the new size.
- *
- * Then the key for the next leaf is checked to make sure it also has
- * a checksum item for the same file. If it does, we know our target leaf
- * contains only checksum items, and it can be safely freed without reading
- * it.
- *
- * This is just an optimization targeted at large files. It may do
- * nothing. It will return 0 unless things went badly.
- */
-static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct inode *inode, u64 new_size)
-{
- struct btrfs_key key;
- int ret;
- int nritems;
- struct btrfs_key found_key;
- struct btrfs_key other_key;
- struct btrfs_leaf_ref *ref;
- u64 leaf_gen;
- u64 leaf_start;
-
- path->lowest_level = 1;
- key.objectid = inode->i_ino;
- key.type = BTRFS_CSUM_ITEM_KEY;
- key.offset = new_size;
-again:
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto out;
-
- if (path->nodes[1] == NULL) {
- ret = 0;
- goto out;
- }
- ret = 0;
- btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
- nritems = btrfs_header_nritems(path->nodes[1]);
-
- if (!nritems)
- goto out;
-
- if (path->slots[1] >= nritems)
- goto next_node;
-
- /* did we find a key greater than anything we want to delete? */
- if (found_key.objectid > inode->i_ino ||
- (found_key.objectid == inode->i_ino && found_key.type > key.type))
- goto out;
-
- /* we check the next key in the node to make sure the leaf contains
- * only checksum items. This comparison doesn't work if our
- * leaf is the last one in the node
- */
- if (path->slots[1] + 1 >= nritems) {
-next_node:
- /* search forward from the last key in the node, this
- * will bring us into the next node in the tree
- */
- btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
-
- /* unlikely, but we inc below, so check to be safe */
- if (found_key.offset == (u64)-1)
- goto out;
-
- /* search_forward needs a path with locks held, do the
- * search again for the original key. It is possible
- * this will race with a balance and return a path that
- * we could modify, but this drop is just an optimization
- * and is allowed to miss some leaves.
- */
- btrfs_release_path(root, path);
- found_key.offset++;
-
- /* setup a max key for search_forward */
- other_key.offset = (u64)-1;
- other_key.type = key.type;
- other_key.objectid = key.objectid;
-
- path->keep_locks = 1;
- ret = btrfs_search_forward(root, &found_key, &other_key,
- path, 0, 0);
- path->keep_locks = 0;
- if (ret || found_key.objectid != key.objectid ||
- found_key.type != key.type) {
- ret = 0;
- goto out;
- }
-
- key.offset = found_key.offset;
- btrfs_release_path(root, path);
- cond_resched();
- goto again;
- }
-
- /* we know there's one more slot after us in the tree,
- * read that key so we can verify it is also a checksum item
- */
- btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
-
- if (found_key.objectid < inode->i_ino)
- goto next_key;
-
- if (found_key.type != key.type || found_key.offset < new_size)
- goto next_key;
-
- /*
- * if the key for the next leaf isn't a csum key from this objectid,
- * we can't be sure there aren't good items inside this leaf.
- * Bail out
- */
- if (other_key.objectid != inode->i_ino || other_key.type != key.type)
- goto out;
-
- leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
- leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
- /*
- * it is safe to delete this leaf, it contains only
- * csum items from this inode at an offset >= new_size
- */
- ret = btrfs_del_leaf(trans, root, path, leaf_start);
- BUG_ON(ret);
-
- if (root->ref_cows && leaf_gen < trans->transid) {
- ref = btrfs_alloc_leaf_ref(root, 0);
- if (ref) {
- ref->root_gen = root->root_key.offset;
- ref->bytenr = leaf_start;
- ref->owner = 0;
- ref->generation = leaf_gen;
- ref->nritems = 0;
-
- btrfs_sort_leaf_ref(ref);
-
- ret = btrfs_add_leaf_ref(root, ref, 0);
- WARN_ON(ret);
- btrfs_free_leaf_ref(root, ref);
- } else {
- WARN_ON(1);
- }
- }
-next_key:
- btrfs_release_path(root, path);
-
- if (other_key.objectid == inode->i_ino &&
- other_key.type == key.type && other_key.offset > key.offset) {
- key.offset = other_key.offset;
- cond_resched();
- goto again;
- }
- ret = 0;
-out:
- /* fixup any changes we've made to the path */
- path->lowest_level = 0;
- path->keep_locks = 0;
- btrfs_release_path(root, path);
- return ret;
-}
-
-#endif
-
/*
* this can truncate away extent items, csum items and directory items.
* It starts at a high offset and removes keys until it can't find
@@ -3234,27 +3997,44 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 extent_num_bytes = 0;
u64 extent_offset = 0;
u64 item_end = 0;
- u64 mask = root->sectorsize - 1;
+ u64 last_size = (u64)-1;
u32 found_type = (u8)-1;
int found_extent;
int del_item;
int pending_del_nr = 0;
int pending_del_slot = 0;
int extent_type = -1;
- int encoding;
int ret;
int err = 0;
+ u64 ino = btrfs_ino(inode);
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
- if (root->ref_cows || root == root->fs_info->tree_root)
- btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
-
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
path->reada = -1;
- key.objectid = inode->i_ino;
+ /*
+ * We want to drop from the next block forward in case this new size is
+ * not block aligned since we will be keeping the last block of the
+ * extent just the way it is.
+ */
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)
+ btrfs_drop_extent_cache(inode, ALIGN(new_size,
+ root->sectorsize), (u64)-1, 0);
+
+ /*
+ * This function is also used to drop the items in the log tree before
+ * we relog the inode, so if root != BTRFS_I(inode)->root, it means
+ * it is used to drop the logged items. So we shouldn't kill the delayed
+ * items.
+ */
+ if (min_type == 0 && root == BTRFS_I(inode)->root)
+ btrfs_kill_delayed_inode_items(inode);
+
+ key.objectid = ino;
key.offset = (u64)-1;
key.type = (u8)-1;
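
The open-coded sector masking is replaced by ALIGN(): the extent cache is now dropped from ALIGN(new_size, sectorsize), i.e. from the next block boundary forward as the new comment explains, and the same macro later replaces the explicit "+ sectorsize - 1 ... & ~" round-up when an extent is shrunk. A small runnable illustration of the difference between masking down and ALIGN() rounding up (the macro below mirrors the kernel's definition for power-of-two alignments):

#include <stdio.h>

/* same arithmetic as the kernel's ALIGN() for power-of-two alignments */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

int main(void)
{
        unsigned long long sectorsize = 4096;
        unsigned long long new_size = 10000;

        /* the old open-coded form rounded down to the containing block ... */
        printf("round down: %llu\n", new_size & ~(sectorsize - 1));   /* 8192 */
        /* ... while ALIGN() rounds up to the next block boundary */
        printf("ALIGN:      %llu\n", ALIGN(new_size, sectorsize));    /* 12288 */
        return 0;
}
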
@@ -3280,9 +4060,8 @@ search_again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
found_type = btrfs_key_type(&found_key);
- encoding = 0;
- if (found_key.objectid != inode->i_ino)
+ if (found_key.objectid != ino)
break;
if (found_type < min_type)
@@ -3293,16 +4072,12 @@ search_again:
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
- encoding = btrfs_file_extent_compression(leaf, fi);
- encoding |= btrfs_file_extent_encryption(leaf, fi);
- encoding |= btrfs_file_extent_other_encoding(leaf, fi);
-
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
item_end +=
btrfs_file_extent_num_bytes(leaf, fi);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
item_end += btrfs_file_extent_inline_len(leaf,
- fi);
+ path->slots[0], fi);
}
item_end--;
}
@@ -3321,21 +4096,27 @@ search_again:
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
+ if (del_item)
+ last_size = found_key.offset;
+ else
+ last_size = new_size;
+
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (!del_item && !encoding) {
+ if (!del_item) {
u64 orig_num_bytes =
btrfs_file_extent_num_bytes(leaf, fi);
- extent_num_bytes = new_size -
- found_key.offset + root->sectorsize - 1;
- extent_num_bytes = extent_num_bytes &
- ~((u64)root->sectorsize - 1);
+ extent_num_bytes = ALIGN(new_size -
+ found_key.offset,
+ root->sectorsize);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_num_bytes);
num_dec = (orig_num_bytes -
extent_num_bytes);
- if (root->ref_cows && extent_start != 0)
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state) &&
+ extent_start != 0)
inode_sub_bytes(inode, num_dec);
btrfs_mark_buffer_dirty(leaf);
} else {
@@ -3349,7 +4130,8 @@ search_again:
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
if (extent_start != 0) {
found_extent = 1;
- if (root->ref_cows)
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
inode_sub_bytes(inode, num_dec);
}
}
@@ -3364,16 +4146,20 @@ search_again:
btrfs_file_extent_other_encoding(leaf, fi) == 0) {
u32 size = new_size - found_key.offset;
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
inode_sub_bytes(inode, item_end + 1 -
new_size);
- }
+
+ /*
+ * update the ram bytes to properly reflect
+ * the new size of our item
+ */
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
size =
btrfs_file_extent_calc_inline_size(size);
- ret = btrfs_truncate_item(trans, root, path,
- size, 1);
- BUG_ON(ret);
- } else if (root->ref_cows) {
+ btrfs_truncate_item(root, path, size, 1);
+ } else if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state)) {
inode_sub_bytes(inode, item_end + 1 -
found_key.offset);
}
@@ -3395,13 +4181,14 @@ delete:
} else {
break;
}
- if (found_extent && (root->ref_cows ||
- root == root->fs_info->tree_root)) {
+ if (found_extent &&
+ (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)) {
btrfs_set_path_blocking(path);
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
- inode->i_ino, extent_offset);
+ ino, extent_offset, 0);
BUG_ON(ret);
}
@@ -3410,18 +4197,18 @@ delete:
if (path->slots[0] == 0 ||
path->slots[0] != pending_del_slot) {
- if (root->ref_cows) {
- err = -EAGAIN;
- goto out;
- }
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
pending_del_nr);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto error;
+ }
pending_del_nr = 0;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto search_again;
} else {
path->slots[0]--;
@@ -3431,19 +4218,31 @@ out:
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
}
+error:
+ if (last_size != (u64)-1)
+ btrfs_ordered_update_i_size(inode, last_size, NULL);
btrfs_free_path(path);
return err;
}
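
btrfs_truncate_inode_items() now tracks last_size, the lowest offset it actually kept: the key offset of a deleted extent item, or new_size when an extent was only trimmed, and publishes it through btrfs_ordered_update_i_size() even on the error path so disk_i_size never points past removed items. A toy sketch of how last_size settles while walking extent items from the end of the file backwards (values are made up, not btrfs structures):

#include <stdio.h>

int main(void)
{
        unsigned long long new_size = 6000;
        /* extent item start offsets, visited from the end of the file back */
        unsigned long long offsets[] = { 16384, 12288, 8192, 4096 };
        unsigned long long last_size = (unsigned long long)-1;

        for (int i = 0; i < 4; i++) {
                int del_item = offsets[i] >= new_size;  /* wholly past new_size? */

                if (del_item)
                        last_size = offsets[i];         /* offset of the removed item */
                else
                        last_size = new_size;           /* extent only trimmed */
                if (!del_item)
                        break;
        }
        printf("last_size = %llu\n", last_size);        /* 6000 */
        return 0;
}
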
/*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range relative to the
+ * offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset, cow it and zero the
+ * part we want to zero. This is used with truncate and hole punching.
*/
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int front)
{
- struct inode *inode = mapping->host;
+ struct address_space *mapping = inode->i_mapping;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
@@ -3453,21 +4252,23 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
pgoff_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
struct page *page;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
int ret = 0;
u64 page_start;
u64 page_end;
- if ((offset & (blocksize - 1)) == 0)
+ if ((offset & (blocksize - 1)) == 0 &&
+ (!len || ((len & (blocksize - 1)) == 0)))
goto out;
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret)
goto out;
- ret = -ENOMEM;
again:
- page = grab_cache_page(mapping, index);
+ page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ ret = -ENOMEM;
goto out;
}
@@ -3489,8 +4290,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@ -3505,7 +4305,8 @@ again:
}
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3516,10 +4317,14 @@ again:
goto out_unlock;
}
- ret = 0;
if (offset != PAGE_CACHE_SIZE) {
+ if (!len)
+ len = PAGE_CACHE_SIZE - offset;
kaddr = kmap(page);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+ if (front)
+ memset(kaddr, 0, offset);
+ else
+ memset(kaddr + offset, 0, len);
flush_dcache_page(page);
kunmap(page);
}
@@ -3537,35 +4342,93 @@ out:
return ret;
}
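
btrfs_truncate_page() now takes a length and a front flag, so one helper can zero the tail of a page from the given offset (truncate) or everything before it (hole punching). A userspace sketch of just the zeroing decision, without the extent locking or delalloc accounting:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static void zero_partial_page(char *page, size_t offset, size_t len, int front)
{
        if (offset == PAGE_SIZE)
                return;                         /* nothing to zero in this page */
        if (!len)
                len = PAGE_SIZE - offset;       /* zero to the end of the page */
        if (front)
                memset(page, 0, offset);        /* zero [0, offset) */
        else
                memset(page + offset, 0, len);  /* zero [offset, offset + len) */
}

int main(void)
{
        static char page[PAGE_SIZE];

        memset(page, 0xff, sizeof(page));
        zero_partial_page(page, 1000, 0, 0);    /* tail zeroing, as truncate does */
        printf("byte 999 = %u, byte 1000 = %u\n",
               (unsigned char)page[999], (unsigned char)page[1000]);  /* 255, 0 */
        return 0;
}
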
-int btrfs_cont_expand(struct inode *inode, loff_t size)
+static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+ u64 offset, u64 len)
{
struct btrfs_trans_handle *trans;
+ int ret;
+
+ /*
+ * Still need to make sure the inode looks like it's been updated so
+ * that any holes get logged if we fsync.
+ */
+ if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
+ BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_sub_trans = root->log_transid;
+ BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+ return 0;
+ }
+
+ /*
+ * 1 - for the one we're dropping
+ * 1 - for the one we're adding
+ * 1 - for updating the inode.
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+ 0, 0, len, 0, len, 0, 0, 0);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ else
+ btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+
+/*
+ * This function puts in dummy file extents for the area we're creating a hole
+ * for. So if we are truncating this file to a larger size we need to insert
+ * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
+ * the range between oldsize and size
+ */
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
+{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- u64 mask = root->sectorsize - 1;
- u64 hole_start = (inode->i_size + mask) & ~mask;
- u64 block_end = (size + mask) & ~mask;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 hole_start = ALIGN(oldsize, root->sectorsize);
+ u64 block_end = ALIGN(size, root->sectorsize);
u64 last_byte;
u64 cur_offset;
u64 hole_size;
int err = 0;
+ /*
+ * If our size started in the middle of a page we need to zero out the
+ * rest of the page before we expand the i_size, otherwise we could
+ * expose stale data.
+ */
+ err = btrfs_truncate_page(inode, oldsize, 0, 0);
+ if (err)
+ return err;
+
if (size <= hole_start)
return 0;
while (1) {
struct btrfs_ordered_extent *ordered;
- btrfs_wait_ordered_range(inode, hole_start,
- block_end - hole_start);
+
lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
- &cached_state, GFP_NOFS);
- ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+ &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, hole_start,
+ block_end - hole_start);
if (!ordered)
break;
unlock_extent_cached(io_tree, hole_start, block_end - 1,
&cached_state, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
@@ -3573,124 +4436,168 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
block_end - cur_offset, 0);
- BUG_ON(IS_ERR(em) || !em);
+ if (IS_ERR(em)) {
+ err = PTR_ERR(em);
+ em = NULL;
+ break;
+ }
last_byte = min(extent_map_end(em), block_end);
- last_byte = (last_byte + mask) & ~mask;
+ last_byte = ALIGN(last_byte , root->sectorsize);
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
- u64 hint_byte = 0;
+ struct extent_map *hole_em;
hole_size = last_byte - cur_offset;
- trans = btrfs_start_transaction(root, 2);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ err = maybe_insert_hole(root, inode, cur_offset,
+ hole_size);
+ if (err)
break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + hole_size - 1, 0);
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ goto next;
}
- btrfs_set_trans_block_group(trans, inode);
+ hole_em->start = cur_offset;
+ hole_em->len = hole_size;
+ hole_em->orig_start = cur_offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
+ hole_em->ram_bytes = hole_size;
+ hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = root->fs_info->generation;
- err = btrfs_drop_extents(trans, inode, cur_offset,
- cur_offset + hole_size,
- &hint_byte, 1);
- BUG_ON(err);
-
- err = btrfs_insert_file_extent(trans, root,
- inode->i_ino, cur_offset, 0,
- 0, hole_size, 0, hole_size,
- 0, 0, 0);
- BUG_ON(err);
-
- btrfs_drop_extent_cache(inode, hole_start,
- last_byte - 1, 0);
-
- btrfs_end_transaction(trans, root);
+ while (1) {
+ write_lock(&em_tree->lock);
+ err = add_extent_mapping(em_tree, hole_em, 1);
+ write_unlock(&em_tree->lock);
+ if (err != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset +
+ hole_size - 1, 0);
+ }
+ free_extent_map(hole_em);
}
+next:
free_extent_map(em);
em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
-
free_extent_map(em);
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
GFP_NOFS);
return err;
}
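
After writing the hole, btrfs_cont_expand() caches a hole extent_map; when add_extent_mapping() returns -EEXIST because a stale mapping still overlaps the range, the cached range is dropped and the insert retried. A generic sketch of that insert-on-EEXIST, evict-and-retry loop (stub functions, not the btrfs API):

#include <stdio.h>
#include <errno.h>

static int stale = 1;   /* pretend one overlapping mapping is still cached */

static int add_mapping(void)
{
        if (stale)
                return -EEXIST;         /* an old mapping overlaps the range */
        return 0;
}

static void drop_cached_range(void)
{
        stale = 0;                      /* evict whatever overlapped */
}

int main(void)
{
        int err;

        while (1) {
                err = add_mapping();
                if (err != -EEXIST)
                        break;          /* success, or a real error */
                drop_cached_range();    /* make room and try again */
        }
        printf("err = %d\n", err);      /* 0 once the stale mapping is gone */
        return 0;
}
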
-static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- unsigned long nr;
+ loff_t oldsize = i_size_read(inode);
+ loff_t newsize = attr->ia_size;
+ int mask = attr->ia_valid;
int ret;
- if (attr->ia_size == inode->i_size)
- return 0;
-
- if (attr->ia_size > inode->i_size) {
- unsigned long limit;
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (attr->ia_size > inode->i_sb->s_maxbytes)
- return -EFBIG;
- if (limit != RLIM_INFINITY && attr->ia_size > limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
+ /*
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+ * special case where we need to update the times despite not having
+ * these flags set. For all other operations the VFS set these flags
+ * explicitly if it wants a timestamp update.
+ */
+ if (newsize != oldsize) {
+ inode_inc_iversion(inode);
+ if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+ inode->i_ctime = inode->i_mtime =
+ current_fs_time(inode->i_sb);
}
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (newsize > oldsize) {
+ truncate_pagecache(inode, newsize);
+ ret = btrfs_cont_expand(inode, oldsize, newsize);
+ if (ret)
+ return ret;
- btrfs_set_trans_block_group(trans, inode);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- ret = btrfs_orphan_add(trans, inode);
- BUG_ON(ret);
+ i_size_write(inode, newsize);
+ btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
+ } else {
- nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ /*
+ * We're truncating a file that used to have good data down to
+ * zero. Make sure it gets into the ordered flush list so that
+ * any new writes get down to disk quickly.
+ */
+ if (newsize == 0)
+ set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags);
- if (attr->ia_size > inode->i_size) {
- ret = btrfs_cont_expand(inode, attr->ia_size);
- if (ret) {
- btrfs_truncate(inode);
+ /*
+ * 1 for the orphan item we're going to add
+ * 1 for the orphan item deletion.
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * We need to do this in case we fail at _any_ point during the
+ * actual truncate. Once we do the truncate_setsize we could
+ * invalidate pages which forces any outstanding ordered io to
+ * be instantly completed which will give us extents that need
+ * to be truncated. If we fail to get an orphan inode down we
+ * could have left over extents that were never meant to live,
+ * so we need to guarantee from this point on that everything
+ * will be consistent.
+ */
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
return ret;
- }
- i_size_write(inode, attr->ia_size);
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ /* we don't support swapfiles, so vmtruncate shouldn't fail */
+ truncate_setsize(inode, newsize);
- trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = root->orphan_block_rsv;
- BUG_ON(!trans->block_rsv);
+ /* Disable non-locked read DIO to avoid an endless truncate */
+ btrfs_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ btrfs_inode_resume_unlocked_dio(inode);
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
- if (inode->i_nlink > 0) {
- ret = btrfs_orphan_del(trans, inode);
- BUG_ON(ret);
+ ret = btrfs_truncate(inode);
+ if (ret && inode->i_nlink) {
+ int err;
+
+ /*
+ * failed to truncate, disk_i_size is only adjusted down
+ * as we remove extents, so it should represent the true
+ * size of the inode, so reset the in memory size and
+ * delete our orphan entry.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_orphan_del(NULL, inode);
+ return ret;
+ }
+ i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+ err = btrfs_orphan_del(trans, inode);
+ if (err)
+ btrfs_abort_transaction(trans, root, err);
+ btrfs_end_transaction(trans, root);
}
- nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
- return 0;
}
- /*
- * We're truncating a file that used to have good data down to
- * zero. Make sure it gets into the ordered flush list so that
- * any new writes get down to disk quickly.
- */
- if (attr->ia_size == 0)
- BTRFS_I(inode)->ordered_data_close = 1;
-
- /* we don't support swapfiles, so vmtruncate shouldn't fail */
- ret = vmtruncate(inode, attr->ia_size);
- BUG_ON(ret);
-
- return 0;
+ return ret;
}
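
btrfs_setsize() only bumps ctime/mtime itself when the size changed and the VFS did not pass ATTR_CTIME/ATTR_MTIME, which is exactly the plain truncate() case; every other caller sets those bits explicitly. A tiny sketch of that mask check (the flag values below are stand-ins, not the real ATTR_* constants):

#include <stdio.h>

/* illustrative stand-ins for the kernel's ATTR_* bits */
#define ATTR_CTIME 0x1
#define ATTR_MTIME 0x2

static int must_update_times(int mask, long long oldsize, long long newsize)
{
        /* plain truncate(): size changed but the VFS did not request a
         * timestamp update, so the filesystem has to do it itself */
        return newsize != oldsize && !(mask & (ATTR_CTIME | ATTR_MTIME));
}

int main(void)
{
        printf("truncate():            %d\n", must_update_times(0, 100, 50));   /* 1 */
        printf("setattr w/ ATTR_CTIME: %d\n",
               must_update_times(ATTR_CTIME | ATTR_MTIME, 100, 50));            /* 0 */
        return 0;
}
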
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -3707,32 +4614,97 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setattr_size(inode, attr);
+ err = btrfs_setsize(inode, attr);
if (err)
return err;
}
if (attr->ia_valid) {
setattr_copy(inode, attr);
- mark_inode_dirty(inode);
+ inode_inc_iversion(inode);
+ err = btrfs_dirty_inode(inode);
- if (attr->ia_valid & ATTR_MODE)
- err = btrfs_acl_chmod(inode);
+ if (!err && attr->ia_valid & ATTR_MODE)
+ err = posix_acl_chmod(inode, inode->i_mode);
}
return err;
}
+/*
+ * While truncating the inode pages during eviction, we get the VFS calling
+ * btrfs_invalidatepage() against each page of the inode. This is slow because
+ * the calls to btrfs_invalidatepage() result in a huge amount of calls to
+ * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
+ * extent_state structures over and over, wasting lots of time.
+ *
+ * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
+ * those expensive operations on a per page basis and do only the ordered io
+ * finishing, while we release here the extent_map and extent_state structures,
+ * without the excessive merging and splitting.
+ */
+static void evict_inode_truncate_pages(struct inode *inode)
+{
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
+ struct rb_node *node;
+
+ ASSERT(inode->i_state & I_FREEING);
+ truncate_inode_pages_final(&inode->i_data);
+
+ write_lock(&map_tree->lock);
+ while (!RB_EMPTY_ROOT(&map_tree->map)) {
+ struct extent_map *em;
+
+ node = rb_first(&map_tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ remove_extent_mapping(map_tree, em);
+ free_extent_map(em);
+ }
+ write_unlock(&map_tree->lock);
+
+ spin_lock(&io_tree->lock);
+ while (!RB_EMPTY_ROOT(&io_tree->state)) {
+ struct extent_state *state;
+ struct extent_state *cached_state = NULL;
+
+ node = rb_first(&io_tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ atomic_inc(&state->refs);
+ spin_unlock(&io_tree->lock);
+
+ lock_extent_bits(io_tree, state->start, state->end,
+ 0, &cached_state);
+ clear_extent_bit(io_tree, state->start, state->end,
+ EXTENT_LOCKED | EXTENT_DIRTY |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 1,
+ &cached_state, GFP_NOFS);
+ free_extent_state(state);
+
+ spin_lock(&io_tree->lock);
+ }
+ spin_unlock(&io_tree->lock);
+}
+
void btrfs_evict_inode(struct inode *inode)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
- unsigned long nr;
+ struct btrfs_block_rsv *rsv, *global_rsv;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
int ret;
- truncate_inode_pages(&inode->i_data, 0);
- if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
- root == root->fs_info->tree_root))
+ trace_btrfs_inode_evict(inode);
+
+ evict_inode_truncate_pages(inode);
+
+ if (inode->i_nlink &&
+ ((btrfs_root_refs(&root->root_item) != 0 &&
+ root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+ btrfs_is_free_space_inode(inode)))
goto no_delete;
if (is_bad_inode(inode)) {
@@ -3743,53 +4715,103 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (root->fs_info->log_root_recovering) {
- BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+ BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags));
goto no_delete;
}
if (inode->i_nlink > 0) {
- BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+ BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
+ root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
goto no_delete;
}
+ ret = btrfs_commit_inode_delayed_inode(inode);
+ if (ret) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+ rsv->size = min_size;
+ rsv->failfast = 1;
+ global_rsv = &root->fs_info->global_block_rsv;
+
btrfs_i_size_write(inode, 0);
+ /*
+ * This is a bit simpler than btrfs_truncate since we've already
+ * reserved our space for our orphan item in the unlink, so we just
+ * need to reserve some slack space in case we add bytes and update
+ * inode item when doing the truncate.
+ */
while (1) {
- trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = root->orphan_block_rsv;
+ ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ BTRFS_RESERVE_FLUSH_LIMIT);
+
+ /*
+ * Try and steal from the global reserve since we will
+ * likely not use this space anyway, we want to try as
+ * hard as possible to get this to work.
+ */
+ if (ret)
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
- ret = btrfs_block_rsv_check(trans, root,
- root->orphan_block_rsv, 0, 5);
if (ret) {
- BUG_ON(ret != -EAGAIN);
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
- continue;
+ btrfs_warn(root->fs_info,
+ "Could not get space for a delete, will truncate on mount %d",
+ ret);
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
}
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
+ }
+
+ trans->block_rsv = rsv;
+
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
- if (ret != -EAGAIN)
+ if (ret != -ENOSPC)
break;
- nr = trans->blocks_used;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
btrfs_end_transaction(trans, root);
trans = NULL;
- btrfs_btree_balance_dirty(root, nr);
-
+ btrfs_btree_balance_dirty(root);
}
+ btrfs_free_block_rsv(root, rsv);
+
+ /*
+ * Errors here aren't a big deal, it just means we leave orphan items
+ * in the tree. They will be cleaned up on the next mount.
+ */
if (ret == 0) {
- ret = btrfs_orphan_del(trans, inode);
- BUG_ON(ret);
+ trans->block_rsv = root->orphan_block_rsv;
+ btrfs_orphan_del(trans, inode);
+ } else {
+ btrfs_orphan_del(NULL, inode);
}
- nr = trans->blocks_used;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ if (!(root == root->fs_info->tree_root ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
+ btrfs_return_ino(root, btrfs_ino(inode));
+
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
no_delete:
- end_writeback(inode);
+ btrfs_remove_delayed_node(inode);
+ clear_inode(inode);
return;
}
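
Each pass of the eviction truncate needs a small metadata reservation: btrfs_block_rsv_refill() is tried first and, if it fails, space is migrated from the global reserve before giving up and leaving the orphan item for the next mount to clean up. A simplified model of that two-tier fallback using plain integers instead of block_rsv structures:

#include <stdio.h>
#include <errno.h>

static long long local_rsv  = 0;        /* per-truncate reserve */
static long long global_rsv = 1 << 20;  /* pretend the global reserve holds 1 MiB */

static int refill_local(long long min_size)
{
        (void)min_size;
        return -ENOSPC;                 /* pretend the flush path found no space */
}

static int migrate_from_global(long long min_size)
{
        if (global_rsv < min_size)
                return -ENOSPC;
        global_rsv -= min_size;
        local_rsv  += min_size;
        return 0;
}

int main(void)
{
        long long min_size = 64 * 1024;
        int ret = refill_local(min_size);

        if (ret)                        /* fall back to stealing from global */
                ret = migrate_from_global(min_size);

        printf("ret = %d, local = %lld, global = %lld\n",
               ret, local_rsv, global_rsv);
        return 0;
}
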
@@ -3808,14 +4830,15 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
int ret = 0;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
- di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
+ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
namelen, 0);
if (IS_ERR(di))
ret = PTR_ERR(di);
- if (!di || IS_ERR(di))
+ if (IS_ERR_OR_NULL(di))
goto out_err;
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
@@ -3852,9 +4875,9 @@ static int fixup_tree_root_location(struct btrfs_root *root,
}
err = -ENOENT;
- ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
- BTRFS_I(dir)->root->root_key.objectid,
- location->objectid);
+ ret = btrfs_find_item(root->fs_info->tree_root, path,
+ BTRFS_I(dir)->root->root_key.objectid,
+ location->objectid, BTRFS_ROOT_REF_KEY, NULL);
if (ret) {
if (ret < 0)
err = ret;
@@ -3863,7 +4886,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
- if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+ if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
goto out;
@@ -3873,7 +4896,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
if (ret)
goto out;
- btrfs_release_path(root->fs_info->tree_root, path);
+ btrfs_release_path(path);
new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
if (IS_ERR(new_root)) {
@@ -3881,11 +4904,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
goto out;
}
- if (btrfs_root_refs(&new_root->root_item) == 0) {
- err = -ENOENT;
- goto out;
- }
-
*sub_root = new_root;
location->objectid = btrfs_root_dirid(&new_root->root_item);
location->type = BTRFS_INODE_ITEM_KEY;
@@ -3902,33 +4920,33 @@ static void inode_tree_add(struct inode *inode)
struct btrfs_inode *entry;
struct rb_node **p;
struct rb_node *parent;
-again:
- p = &root->inode_tree.rb_node;
- parent = NULL;
+ struct rb_node *new = &BTRFS_I(inode)->rb_node;
+ u64 ino = btrfs_ino(inode);
if (inode_unhashed(inode))
return;
-
+ parent = NULL;
spin_lock(&root->inode_lock);
+ p = &root->inode_tree.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct btrfs_inode, rb_node);
- if (inode->i_ino < entry->vfs_inode.i_ino)
+ if (ino < btrfs_ino(&entry->vfs_inode))
p = &parent->rb_left;
- else if (inode->i_ino > entry->vfs_inode.i_ino)
+ else if (ino > btrfs_ino(&entry->vfs_inode))
p = &parent->rb_right;
else {
WARN_ON(!(entry->vfs_inode.i_state &
(I_WILL_FREE | I_FREEING)));
- rb_erase(parent, &root->inode_tree);
+ rb_replace_node(parent, new, &root->inode_tree);
RB_CLEAR_NODE(parent);
spin_unlock(&root->inode_lock);
- goto again;
+ return;
}
}
- rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
- rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+ rb_link_node(new, parent, p);
+ rb_insert_color(new, &root->inode_tree);
spin_unlock(&root->inode_lock);
}
@@ -3945,14 +4963,7 @@ static void inode_tree_del(struct inode *inode)
}
spin_unlock(&root->inode_lock);
- /*
- * Free space cache has inodes in the tree root, but the tree root has a
- * root_refs of 0, so this could end up dropping the tree root as a
- * snapshot, so we need the extra !root->fs_info->tree_root check to
- * make sure we don't drop it.
- */
- if (empty && btrfs_root_refs(&root->root_item) == 0 &&
- root != root->fs_info->tree_root) {
+ if (empty && btrfs_root_refs(&root->root_item) == 0) {
synchronize_srcu(&root->fs_info->subvol_srcu);
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -3962,7 +4973,7 @@ static void inode_tree_del(struct inode *inode)
}
}
-int btrfs_invalidate_inodes(struct btrfs_root *root)
+void btrfs_invalidate_inodes(struct btrfs_root *root)
{
struct rb_node *node;
struct rb_node *prev;
@@ -3970,7 +4981,8 @@ int btrfs_invalidate_inodes(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
again:
@@ -3980,9 +4992,9 @@ again:
prev = node;
entry = rb_entry(node, struct btrfs_inode, rb_node);
- if (objectid < entry->vfs_inode.i_ino)
+ if (objectid < btrfs_ino(&entry->vfs_inode))
node = node->rb_left;
- else if (objectid > entry->vfs_inode.i_ino)
+ else if (objectid > btrfs_ino(&entry->vfs_inode))
node = node->rb_right;
else
break;
@@ -3990,7 +5002,7 @@ again:
if (!node) {
while (prev) {
entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= entry->vfs_inode.i_ino) {
+ if (objectid <= btrfs_ino(&entry->vfs_inode)) {
node = prev;
break;
}
@@ -3999,7 +5011,7 @@ again:
}
while (node) {
entry = rb_entry(node, struct btrfs_inode, rb_node);
- objectid = entry->vfs_inode.i_ino + 1;
+ objectid = btrfs_ino(&entry->vfs_inode) + 1;
inode = igrab(&entry->vfs_inode);
if (inode) {
spin_unlock(&root->inode_lock);
@@ -4022,35 +5034,37 @@ again:
node = rb_next(node);
}
spin_unlock(&root->inode_lock);
- return 0;
}
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->ino;
+ inode->i_ino = args->location->objectid;
+ memcpy(&BTRFS_I(inode)->location, args->location,
+ sizeof(*args->location));
BTRFS_I(inode)->root = args->root;
- btrfs_set_inode_space_info(args->root, inode);
return 0;
}
static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->ino == inode->i_ino &&
+ return args->location->objectid == BTRFS_I(inode)->location.objectid &&
args->root == BTRFS_I(inode)->root;
}
static struct inode *btrfs_iget_locked(struct super_block *s,
- u64 objectid,
+ struct btrfs_key *location,
struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
- args.ino = objectid;
+ unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+
+ args.location = location;
args.root = root;
- inode = iget5_locked(s, objectid, btrfs_find_actor,
+ inode = iget5_locked(s, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
@@ -4064,19 +5078,22 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
{
struct inode *inode;
- inode = btrfs_iget_locked(s, location->objectid, root);
+ inode = btrfs_iget_locked(s, location, root);
if (!inode)
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
- BTRFS_I(inode)->root = root;
- memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
btrfs_read_locked_inode(inode);
-
- inode_tree_add(inode);
- unlock_new_inode(inode);
- if (new)
- *new = 1;
+ if (!is_bad_inode(inode)) {
+ inode_tree_add(inode);
+ unlock_new_inode(inode);
+ if (new)
+ *new = 1;
+ } else {
+ unlock_new_inode(inode);
+ iput(inode);
+ inode = ERR_PTR(-ESTALE);
+ }
}
return inode;
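
Because inode numbers are only unique within a subvolume, the inode cache is now keyed on a hash of the objectid and the root, and btrfs_find_actor() compares both before declaring a hit. A rough userspace sketch of keying a cache on the (root, objectid) pair; the hash mix below is illustrative, not btrfs_inode_hash():

#include <stdio.h>
#include <stdint.h>

struct root { int dummy; };

struct cache_key {
        const struct root *root;
        uint64_t objectid;
};

static unsigned long hash_key(const struct cache_key *k)
{
        /* any mix of the two values works for bucketing; equality below is
         * what actually decides a match */
        return (unsigned long)k->objectid ^ (unsigned long)(uintptr_t)k->root;
}

static int keys_match(const struct cache_key *a, const struct cache_key *b)
{
        return a->objectid == b->objectid && a->root == b->root;
}

int main(void)
{
        struct root subvol_a, subvol_b;
        struct cache_key ka = { &subvol_a, 256 };
        struct cache_key kb = { &subvol_b, 256 };   /* same ino, other subvolume */

        printf("hashes: %lu %lu\n", hash_key(&ka), hash_key(&kb));
        printf("match:  %d\n", keys_match(&ka, &kb));   /* 0: different inodes */
        return 0;
}
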
@@ -4093,10 +5110,10 @@ static struct inode *new_simple_dir(struct super_block *s,
BTRFS_I(inode)->root = root;
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
- BTRFS_I(inode)->dummy_inode = 1;
+ set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
- inode->i_op = &simple_dir_inode_operations;
+ inode->i_op = &btrfs_dir_ro_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4111,18 +5128,17 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
struct btrfs_root *sub_root = root;
struct btrfs_key location;
int index;
- int ret;
+ int ret = 0;
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
ret = btrfs_inode_by_name(dir, dentry, &location);
-
if (ret < 0)
return ERR_PTR(ret);
if (location.objectid == 0)
- return NULL;
+ return ERR_PTR(-ENOENT);
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(dir->i_sb, &location, root, NULL);
@@ -4147,8 +5163,12 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (!IS_ERR(inode) && root != sub_root) {
down_read(&root->fs_info->cleanup_work_sem);
if (!(inode->i_sb->s_flags & MS_RDONLY))
- btrfs_orphan_cleanup(sub_root);
+ ret = btrfs_orphan_cleanup(sub_root);
up_read(&root->fs_info->cleanup_work_sem);
+ if (ret) {
+ iput(inode);
+ inode = ERR_PTR(ret);
+ }
}
return inode;
@@ -4157,49 +5177,61 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
static int btrfs_dentry_delete(const struct dentry *dentry)
{
struct btrfs_root *root;
+ struct inode *inode = dentry->d_inode;
- if (!dentry->d_inode && !IS_ROOT(dentry))
- dentry = dentry->d_parent;
+ if (!inode && !IS_ROOT(dentry))
+ inode = dentry->d_parent->d_inode;
- if (dentry->d_inode) {
- root = BTRFS_I(dentry->d_inode)->root;
+ if (inode) {
+ root = BTRFS_I(inode)->root;
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
+
+ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return 1;
}
return 0;
}
+static void btrfs_dentry_release(struct dentry *dentry)
+{
+ kfree(dentry->d_fsdata);
+}
+
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct inode *inode;
inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) == -ENOENT)
+ inode = NULL;
+ else
+ return ERR_CAST(inode);
+ }
- return d_splice_alias(inode, dentry);
+ return d_materialise_unique(dentry, inode);
}
-static unsigned char btrfs_filetype_table[] = {
+unsigned char btrfs_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int btrfs_real_readdir(struct file *filp, void *dirent,
- filldir_t filldir)
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_item *item;
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
+ struct list_head ins_list;
+ struct list_head del_list;
int ret;
- u32 nritems;
struct extent_buffer *leaf;
int slot;
- int advance;
unsigned char d_type;
int over = 0;
u32 di_cur;
@@ -4209,71 +5241,63 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
char tmp_name[32];
char *name_ptr;
int name_len;
+ int is_curr = 0; /* ctx->pos points to the current index? */
/* FIXME, use a real flag for deciding about the key type */
if (root->fs_info->tree_root == root)
key_type = BTRFS_DIR_ITEM_KEY;
- /* special case for "." */
- if (filp->f_pos == 0) {
- over = filldir(dirent, ".", 1,
- 1, inode->i_ino,
- DT_DIR);
- if (over)
- return 0;
- filp->f_pos = 1;
- }
- /* special case for .., just use the back ref */
- if (filp->f_pos == 1) {
- u64 pino = parent_ino(filp->f_path.dentry);
- over = filldir(dirent, "..", 2,
- 2, pino, DT_DIR);
- if (over)
- return 0;
- filp->f_pos = 2;
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
path = btrfs_alloc_path();
- path->reada = 2;
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 1;
+
+ if (key_type == BTRFS_DIR_INDEX_KEY) {
+ INIT_LIST_HEAD(&ins_list);
+ INIT_LIST_HEAD(&del_list);
+ btrfs_get_delayed_items(inode, &ins_list, &del_list);
+ }
btrfs_set_key_type(&key, key_type);
- key.offset = filp->f_pos;
- key.objectid = inode->i_ino;
+ key.offset = ctx->pos;
+ key.objectid = btrfs_ino(inode);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto err;
- advance = 0;
while (1) {
leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
slot = path->slots[0];
- if (advance || slot >= nritems) {
- if (slot >= nritems - 1) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- } else {
- slot++;
- path->slots[0]++;
- }
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto err;
+ else if (ret > 0)
+ break;
+ continue;
}
- advance = 1;
- item = btrfs_item_nr(leaf, slot);
+ item = btrfs_item_nr(slot);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid)
break;
if (btrfs_key_type(&found_key) != key_type)
break;
- if (found_key.offset < filp->f_pos)
- continue;
+ if (found_key.offset < ctx->pos)
+ goto next;
+ if (key_type == BTRFS_DIR_INDEX_KEY &&
+ btrfs_should_delete_dir_index(&del_list,
+ found_key.offset))
+ goto next;
- filp->f_pos = found_key.offset;
+ ctx->pos = found_key.offset;
+ is_curr = 1;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
di_cur = 0;
@@ -4282,6 +5306,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
while (di_cur < di_total) {
struct btrfs_key location;
+ if (verify_dir_item(root, leaf, di))
+ break;
+
name_len = btrfs_dir_name_len(leaf, di);
if (name_len <= sizeof(tmp_name)) {
name_ptr = tmp_name;
@@ -4298,17 +5325,23 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
btrfs_dir_item_key_to_cpu(leaf, di, &location);
+
/* is this a reference to our own snapshot? If so
- * skip it
+ * skip it.
+ *
+ * In contrast to old kernels, we insert the snapshot's
+ * dir item and dir index after it has been created, so
+ * we won't find a reference to our own snapshot. We
+ * still keep the following code for backward
+ * compatibility.
*/
if (location.type == BTRFS_ROOT_ITEM_KEY &&
location.objectid == root->root_key.objectid) {
over = 0;
goto skip;
}
- over = filldir(dirent, name_ptr, name_len,
- found_key.offset, location.objectid,
- d_type);
+ over = !dir_emit(ctx, name_ptr, name_len,
+ location.objectid, d_type);
skip:
if (name_ptr != tmp_name)
@@ -4321,20 +5354,49 @@ skip:
di_cur += di_len;
di = (struct btrfs_dir_item *)((char *)di + di_len);
}
+next:
+ path->slots[0]++;
+ }
+
+ if (key_type == BTRFS_DIR_INDEX_KEY) {
+ if (is_curr)
+ ctx->pos++;
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+ if (ret)
+ goto nopos;
}
/* Reached end of directory/root. Bump pos past the last item. */
- if (key_type == BTRFS_DIR_INDEX_KEY)
- /*
- * 32-bit glibc will use getdents64, but then strtol -
- * so the last number we can serve is this.
- */
- filp->f_pos = 0x7fffffff;
- else
- filp->f_pos++;
+ ctx->pos++;
+
+ /*
+ * Stop new entries from being returned after we return the last
+ * entry.
+ *
+ * New directory entries are assigned a strictly increasing
+ * offset. This means that new entries created during readdir
+ * are *guaranteed* to be seen in the future by that readdir.
+ * This has broken buggy programs which operate on names as
+ * they're returned by readdir. Until we re-use freed offsets
+ * we have this hack to stop new entries from being returned
+ * under the assumption that they'll never reach this huge
+ * offset.
+ *
+ * This is being careful not to overflow 32bit loff_t unless the
+ * last entry requires it because doing so has broken 32bit apps
+ * in the past.
+ */
+ if (key_type == BTRFS_DIR_INDEX_KEY) {
+ if (ctx->pos >= INT_MAX)
+ ctx->pos = LLONG_MAX;
+ else
+ ctx->pos = INT_MAX;
+ }
nopos:
ret = 0;
err:
+ if (key_type == BTRFS_DIR_INDEX_KEY)
+ btrfs_put_delayed_items(&ins_list, &del_list);
btrfs_free_path(path);
return ret;
}
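
Once the last real entry has been emitted, the directory position is parked at a value no future entry can reach, taking care not to overflow 32-bit consumers unless the last entry already required it. The clamp from the hunk above, as a small standalone function:

#include <stdio.h>
#include <limits.h>

/* After the last entry, park the position at a value no future entry can
 * reach, without exceeding INT_MAX unless the last entry already did. */
static long long park_readdir_pos(long long pos)
{
        return pos >= INT_MAX ? LLONG_MAX : INT_MAX;
}

int main(void)
{
        printf("%lld\n", park_readdir_pos(42));              /* 2147483647 */
        printf("%lld\n", park_readdir_pos(3000000000LL));    /* 9223372036854775807 */
        return 0;
}
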
@@ -4346,24 +5408,20 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
int ret = 0;
bool nolock = false;
- if (BTRFS_I(inode)->dummy_inode)
+ if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
return 0;
- smp_mb();
- nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
+ if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
+ nolock = true;
if (wbc->sync_mode == WB_SYNC_ALL) {
if (nolock)
- trans = btrfs_join_transaction_nolock(root, 1);
+ trans = btrfs_join_transaction_nolock(root);
else
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, inode);
- if (nolock)
- ret = btrfs_end_transaction_nolock(trans, root);
- else
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
}
return ret;
}
@@ -4374,44 +5432,57 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
* FIXME, needs more benchmarking...there are no reasons other than performance
* to keep or drop this code.
*/
-void btrfs_dirty_inode(struct inode *inode)
+static int btrfs_dirty_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
int ret;
- if (BTRFS_I(inode)->dummy_inode)
- return;
+ if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
+ return 0;
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, inode);
if (ret && ret == -ENOSPC) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans, root);
trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- if (printk_ratelimit()) {
- printk(KERN_ERR "btrfs: fail to "
- "dirty inode %lu error %ld\n",
- inode->i_ino, PTR_ERR(trans));
- }
- return;
- }
- btrfs_set_trans_block_group(trans, inode);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- if (printk_ratelimit()) {
- printk(KERN_ERR "btrfs: fail to "
- "dirty inode %lu error %d\n",
- inode->i_ino, ret);
- }
- }
}
btrfs_end_transaction(trans, root);
+ if (BTRFS_I(inode)->delayed_node)
+ btrfs_balance_delayed_items(root);
+
+ return ret;
+}
+
+/*
+ * This is a copy of file_update_time. We need this so we can return error on
+ * ENOSPC for updating the inode in the case of file write and mmap writes.
+ */
+static int btrfs_update_time(struct inode *inode, struct timespec *now,
+ int flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ if (flags & S_VERSION)
+ inode_inc_iversion(inode);
+ if (flags & S_CTIME)
+ inode->i_ctime = *now;
+ if (flags & S_MTIME)
+ inode->i_mtime = *now;
+ if (flags & S_ATIME)
+ inode->i_atime = *now;
+ return btrfs_dirty_inode(inode);
}
/*
@@ -4427,7 +5498,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
struct extent_buffer *leaf;
int ret;
- key.objectid = inode->i_ino;
+ key.objectid = btrfs_ino(inode);
btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
key.offset = (u64)-1;
@@ -4459,7 +5530,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid != inode->i_ino ||
+ if (found_key.objectid != btrfs_ino(inode) ||
btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
BTRFS_I(inode)->index_cnt = 2;
goto out;
@@ -4480,9 +5551,12 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
int ret = 0;
if (BTRFS_I(dir)->index_cnt == (u64)-1) {
- ret = btrfs_set_inode_index_count(dir);
- if (ret)
- return ret;
+ ret = btrfs_inode_delayed_dir_index_count(dir);
+ if (ret) {
+ ret = btrfs_set_inode_index_count(dir);
+ if (ret)
+ return ret;
+ }
}
*index = BTRFS_I(dir)->index_cnt;
@@ -4496,7 +5570,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct inode *dir,
const char *name, int name_len,
u64 ref_objectid, u64 objectid,
- u64 alloc_hint, int mode, u64 *index)
+ umode_t mode, u64 *index)
{
struct inode *inode;
struct btrfs_inode_item *inode_item;
@@ -4505,23 +5579,37 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
+ int nitems = name ? 2 : 1;
unsigned long ptr;
int ret;
- int owner;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return ERR_PTR(-ENOMEM);
inode = new_inode(root->fs_info->sb);
- if (!inode)
+ if (!inode) {
+ btrfs_free_path(path);
return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * we have to initialize this early, so we can reclaim the inode
+ * number if we fail afterwards in this function.
+ */
+ inode->i_ino = objectid;
+
+ if (dir && name) {
+ trace_btrfs_inode_request(dir);
- if (dir) {
ret = btrfs_set_inode_index(dir, index);
if (ret) {
+ btrfs_free_path(path);
iput(inode);
return ERR_PTR(ret);
}
+ } else if (dir) {
+ *index = 0;
}
/*
* index_cnt is ignored for everything but a dir,
@@ -4529,48 +5617,61 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* number
*/
BTRFS_I(inode)->index_cnt = 2;
+ BTRFS_I(inode)->dir_index = *index;
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
- btrfs_set_inode_space_info(root, inode);
- if (mode & S_IFDIR)
- owner = 0;
- else
- owner = 1;
- BTRFS_I(inode)->block_group =
- btrfs_find_block_group(root, 0, alloc_hint, owner);
+ /*
+ * We could have gotten an inode number from somebody who was fsynced
+ * and then removed in this same transaction, so let's just set full
+ * sync since it will be a full sync anyway and this will blow away the
+ * old info in the log.
+ */
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
key[0].objectid = objectid;
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
key[0].offset = 0;
- key[1].objectid = objectid;
- btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
- key[1].offset = ref_objectid;
-
sizes[0] = sizeof(struct btrfs_inode_item);
- sizes[1] = name_len + sizeof(*ref);
+
+ if (name) {
+ /*
+ * Start new inodes with an inode_ref. This is slightly more
+ * efficient for small numbers of hard links since they will
+ * be packed into one item. Extended refs will kick in if we
+ * add more hard links than can fit in the ref item.
+ */
+ key[1].objectid = objectid;
+ btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+ key[1].offset = ref_objectid;
+
+ sizes[1] = name_len + sizeof(*ref);
+ }
path->leave_spinning = 1;
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
+ ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
if (ret != 0)
goto fail;
inode_init_owner(inode, dir, mode);
- inode->i_ino = objectid;
inode_set_bytes(inode, 0);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
+ memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
+ sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
- ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
- struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
- btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
- ptr = (unsigned long)(ref + 1);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ if (name) {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ }
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
@@ -4582,18 +5683,31 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_inherit_iflags(inode, dir);
- if ((mode & S_IFREG)) {
+ if (S_ISREG(mode)) {
if (btrfs_test_opt(root, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
if (btrfs_test_opt(root, NODATACOW))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
}
- insert_inode_hash(inode);
+ btrfs_insert_inode_hash(inode);
inode_tree_add(inode);
+
+ trace_btrfs_inode_new(inode);
+ btrfs_set_inode_last_trans(trans, inode);
+
+ btrfs_update_root_times(trans, root);
+
+ ret = btrfs_inode_inherit_props(trans, inode, dir);
+ if (ret)
+ btrfs_err(root->fs_info,
+ "error inheriting props for ino %llu (root %llu): %d",
+ btrfs_ino(inode), root->root_key.objectid, ret);
+
return inode;
fail:
- if (dir)
+ if (dir && name)
BTRFS_I(dir)->index_cnt--;
btrfs_free_path(path);
iput(inode);
@@ -4618,36 +5732,63 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
int ret = 0;
struct btrfs_key key;
struct btrfs_root *root = BTRFS_I(parent_inode)->root;
+ u64 ino = btrfs_ino(inode);
+ u64 parent_ino = btrfs_ino(parent_inode);
- if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
} else {
- key.objectid = inode->i_ino;
+ key.objectid = ino;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
}
- if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
key.objectid, root->root_key.objectid,
- parent_inode->i_ino,
- index, name, name_len);
+ parent_ino, index, name, name_len);
} else if (add_backref) {
- ret = btrfs_insert_inode_ref(trans, root,
- name, name_len, inode->i_ino,
- parent_inode->i_ino, index);
+ ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
+ parent_ino, index);
}
- if (ret == 0) {
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode->i_ino, &key,
- btrfs_inode_type(inode), index);
- BUG_ON(ret);
+ /* Nothing to clean up yet */
+ if (ret)
+ return ret;
+
+ ret = btrfs_insert_dir_item(trans, root, name, name_len,
+ parent_inode, &key,
+ btrfs_inode_type(inode), index);
+ if (ret == -EEXIST || ret == -EOVERFLOW)
+ goto fail_dir_item;
+ else if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ name_len * 2);
+ inode_inc_iversion(parent_inode);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, parent_inode);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+
+fail_dir_item:
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ u64 local_index;
+ int err;
+ err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+ key.objectid, root->root_key.objectid,
+ parent_ino, &local_index, name, name_len);
- btrfs_i_size_write(parent_inode, parent_inode->i_size +
- name_len * 2);
- parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
- ret = btrfs_update_inode(trans, root, parent_inode);
+ } else if (add_backref) {
+ u64 local_index;
+ int err;
+
+ err = btrfs_del_inode_ref(trans, root, name, name_len,
+ ino, parent_ino, &local_index);
}
return ret;
}
@@ -4659,17 +5800,13 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
int err = btrfs_add_link(trans, dir, inode,
dentry->d_name.name, dentry->d_name.len,
backref, index);
- if (!err) {
- d_instantiate(dentry, inode);
- return 0;
- }
if (err > 0)
err = -EEXIST;
return err;
}
static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -4677,16 +5814,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
int err;
int drop_inode = 0;
u64 objectid;
- unsigned long nr = 0;
u64 index = 0;
if (!new_valid_dev(rdev))
return -EINVAL;
- err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
- if (err)
- return err;
-
/*
* 2 for inode item and ref
* 2 for dir items
@@ -4696,14 +5828,17 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_unlock;
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, dir->i_ino, objectid,
- BTRFS_I(dir)->block_group, mode, &index);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
+ dentry->d_name.len, btrfs_ino(dir), objectid,
+ mode, &index);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
goto out_unlock;
+ }
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err) {
@@ -4711,21 +5846,26 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- btrfs_set_trans_block_group(trans, inode);
+ /*
+ * If the active LSM wants to access the inode during
+ * d_instantiate it needs these. Smack checks to see
+ * if the filesystem supports xattrs by looking at the
+ * ops vector.
+ */
+
+ inode->i_op = &btrfs_special_inode_operations;
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
else {
- inode->i_op = &btrfs_special_inode_operations;
init_special_inode(inode, inode->i_mode, rdev);
btrfs_update_inode(trans, root, inode);
+ d_instantiate(dentry, inode);
}
- btrfs_update_inode_block_group(trans, inode);
- btrfs_update_inode_block_group(trans, dir);
out_unlock:
- nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4734,20 +5874,16 @@ out_unlock:
}
static int btrfs_create(struct inode *dir, struct dentry *dentry,
- int mode, struct nameidata *nd)
+ umode_t mode, bool excl)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
- int drop_inode = 0;
+ int drop_inode_on_err = 0;
int err;
- unsigned long nr = 0;
u64 objectid;
u64 index = 0;
- err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
- if (err)
- return err;
/*
* 2 for inode item and ref
* 2 for dir items
@@ -4757,42 +5893,53 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_unlock;
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, dir->i_ino, objectid,
- BTRFS_I(dir)->block_group, mode, &index);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
+ dentry->d_name.len, btrfs_ino(dir), objectid,
+ mode, &index);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
goto out_unlock;
+ }
+ drop_inode_on_err = 1;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err) {
- drop_inode = 1;
+ if (err)
goto out_unlock;
- }
- btrfs_set_trans_block_group(trans, inode);
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ goto out_unlock;
+
+ /*
+ * If the active LSM wants to access the inode during
+ * d_instantiate it needs these. Smack checks to see
+ * if the filesystem supports xattrs by looking at the
+ * ops vector.
+ */
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
- drop_inode = 1;
- else {
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
- }
- btrfs_update_inode_block_group(trans, inode);
- btrfs_update_inode_block_group(trans, dir);
+ goto out_unlock;
+
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ d_instantiate(dentry, inode);
+
out_unlock:
- nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
- if (drop_inode) {
+ btrfs_end_transaction(trans, root);
+ if (err && drop_inode_on_err) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -4803,16 +5950,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = old_dentry->d_inode;
u64 index;
- unsigned long nr = 0;
int err;
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
if (root->objectid != BTRFS_I(inode)->root->objectid)
- return -EPERM;
+ return -EXDEV;
- btrfs_inc_nlink(inode);
- inode->i_ctime = CURRENT_TIME;
+ if (inode->i_nlink >= BTRFS_LINK_MAX)
+ return -EMLINK;
err = btrfs_set_inode_index(dir, &index);
if (err)
@@ -4829,34 +5975,48 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
- btrfs_set_trans_block_group(trans, dir);
+ /* There are several dir indexes for this inode, clear the cache. */
+ BTRFS_I(inode)->dir_index = 0ULL;
+ inc_nlink(inode);
+ inode_inc_iversion(inode);
+ inode->i_ctime = CURRENT_TIME;
ihold(inode);
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
if (err) {
drop_inode = 1;
} else {
- struct dentry *parent = dget_parent(dentry);
- btrfs_update_inode_block_group(trans, dir);
+ struct dentry *parent = dentry->d_parent;
err = btrfs_update_inode(trans, root, inode);
- BUG_ON(err);
+ if (err)
+ goto fail;
+ if (inode->i_nlink == 1) {
+ /*
+ * If new hard link count is 1, it's a file created
+ * with open(2) O_TMPFILE flag.
+ */
+ err = btrfs_orphan_del(trans, inode);
+ if (err)
+ goto fail;
+ }
+ d_instantiate(dentry, inode);
btrfs_log_new_name(trans, inode, NULL, parent);
- dput(parent);
}
- nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
-static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -4865,11 +6025,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
int drop_on_err = 0;
u64 objectid = 0;
u64 index = 0;
- unsigned long nr = 1;
-
- err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
- if (err)
- return err;
/*
* 2 items for inode and ref
@@ -4879,12 +6034,14 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_fail;
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, dir->i_ino, objectid,
- BTRFS_I(dir)->block_group, S_IFDIR | mode,
- &index);
+ dentry->d_name.len, btrfs_ino(dir), objectid,
+ S_IFDIR | mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fail;
@@ -4898,7 +6055,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- btrfs_set_trans_block_group(trans, inode);
btrfs_i_size_write(inode, 0);
err = btrfs_update_inode(trans, root, inode);
@@ -4912,15 +6068,13 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
d_instantiate(dentry, inode);
drop_on_err = 0;
- btrfs_update_inode_block_group(trans, inode);
- btrfs_update_inode_block_group(trans, dir);
out_fail:
- nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
if (drop_on_err)
iput(inode);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -4944,7 +6098,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
em->block_start += start_diff;
em->block_len -= start_diff;
}
- return add_extent_mapping(em_tree, em);
+ return add_extent_mapping(em_tree, em, 0);
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -4964,8 +6118,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
inline_size = btrfs_file_extent_inline_item_len(leaf,
- btrfs_item_nr(leaf, path->slots[0]));
+ btrfs_item_nr(path->slots[0]));
tmp = kmalloc(inline_size, GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
ptr = btrfs_file_extent_inline_start(item);
read_extent_buffer(leaf, tmp, ptr, inline_size);
@@ -4973,16 +6129,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
- if (ret) {
- char *kaddr = kmap_atomic(page, KM_USER0);
- unsigned long copy_size = min_t(u64,
- PAGE_CACHE_SIZE - pg_offset,
- max_size - extent_offset);
- memset(kaddr + pg_offset, 0, copy_size);
- kunmap_atomic(kaddr, KM_USER0);
- }
kfree(tmp);
- return 0;
+ return ret;
}
/*
@@ -5000,10 +6148,9 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
{
int ret;
int err = 0;
- u64 bytenr;
u64 extent_start = 0;
u64 extent_end = 0;
- u64 objectid = inode->i_ino;
+ u64 objectid = btrfs_ino(inode);
u32 found_type;
struct btrfs_path *path = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5014,7 +6161,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_trans_handle *trans = NULL;
- int compress_type;
+ const bool new_inline = !page || create;
again:
read_lock(&em_tree->lock);
@@ -5031,7 +6178,7 @@ again:
else
goto out;
}
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
@@ -5044,7 +6191,15 @@ again:
if (!path) {
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /*
+ * Chances are we'll be called again, so go ahead and do
+ * readahead
+ */
+ path->reada = 1;
}
ret = btrfs_lookup_file_extent(trans, root, path,
@@ -5068,23 +6223,28 @@ again:
found_type = btrfs_key_type(&found_key);
if (found_key.objectid != objectid ||
found_type != BTRFS_EXTENT_DATA_KEY) {
- goto not_found;
+ /*
+ * If we back up past the first extent we want to move forward
+ * and see if there is an extent in front of us, otherwise we'll
+ * say there is a hole for our whole search range which can
+ * cause problems.
+ */
+ extent_end = start;
+ goto next;
}
found_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- compress_type = btrfs_file_extent_compression(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
- size = btrfs_file_extent_inline_len(leaf, item);
- extent_end = (extent_start + size + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
+ size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
+ extent_end = ALIGN(extent_start + size, root->sectorsize);
}
-
+next:
if (start >= extent_end) {
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
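
The ALIGN() conversion in this hunk is a pure rewrite of the old mask arithmetic; a minimal userspace check of the equivalence, assuming a power-of-two sector size (4096 here):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Round x up to the next multiple of a; a must be a power of two.
     * This mirrors the open-coded "(x + a - 1) & ~(a - 1)" being replaced. */
    static uint64_t align_up(uint64_t x, uint64_t a)
    {
        return (x + a - 1) & ~(a - 1);
    }

    int main(void)
    {
        uint64_t sectorsize = 4096;     /* assumed sector size */
        uint64_t extent_start = 8192;
        uint64_t size = 100;            /* inline extent length */

        uint64_t old_form = (extent_start + size + sectorsize - 1) &
                            ~((uint64_t)sectorsize - 1);
        uint64_t new_form = align_up(extent_start + size, sectorsize);

        assert(old_form == new_form);   /* both give 12288 */
        printf("extent_end = %llu\n", (unsigned long long)new_form);
        return 0;
    }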
@@ -5104,34 +6264,15 @@ again:
if (start + len <= found_key.offset)
goto not_found;
em->start = start;
+ em->orig_start = start;
em->len = found_key.offset - start;
goto not_found_em;
}
+ btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
+
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- em->start = extent_start;
- em->len = extent_end - extent_start;
- em->orig_start = extent_start -
- btrfs_file_extent_offset(leaf, item);
- bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
- if (bytenr == 0) {
- em->block_start = EXTENT_MAP_HOLE;
- goto insert;
- }
- if (compress_type != BTRFS_COMPRESS_NONE) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- em->block_start = bytenr;
- em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
- item);
- } else {
- bytenr += btrfs_file_extent_offset(leaf, item);
- em->block_start = bytenr;
- em->block_len = em->len;
- if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
- }
goto insert;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
@@ -5140,25 +6281,17 @@ again:
size_t extent_offset;
size_t copy_size;
- em->block_start = EXTENT_MAP_INLINE;
- if (!page || create) {
- em->start = extent_start;
- em->len = extent_end - extent_start;
+ if (new_inline)
goto out;
- }
- size = btrfs_file_extent_inline_len(leaf, item);
+ size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_offset = page_offset(page) + pg_offset - extent_start;
copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
size - extent_offset);
em->start = extent_start + extent_offset;
- em->len = (copy_size + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
- em->orig_start = EXTENT_MAP_INLINE;
- if (compress_type) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
+ em->len = ALIGN(copy_size, root->sectorsize);
+ em->orig_block_len = em->len;
+ em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
@@ -5166,7 +6299,10 @@ again:
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -5180,13 +6316,15 @@ again:
}
flush_dcache_page(page);
} else if (create && PageUptodate(page)) {
- WARN_ON(1);
+ BUG();
if (!trans) {
kunmap(page);
free_extent_map(em);
em = NULL;
- btrfs_release_path(root, path);
- trans = btrfs_join_transaction(root, 1);
+
+ btrfs_release_path(path);
+ trans = btrfs_join_transaction(root);
+
if (IS_ERR(trans))
return ERR_CAST(trans);
goto again;
@@ -5198,33 +6336,28 @@ again:
btrfs_mark_buffer_dirty(leaf);
}
set_extent_uptodate(io_tree, em->start,
- extent_map_end(em) - 1, GFP_NOFS);
+ extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
- } else {
- printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
- WARN_ON(1);
}
not_found:
em->start = start;
+ em->orig_start = start;
em->len = len;
not_found_em:
em->block_start = EXTENT_MAP_HOLE;
set_bit(EXTENT_FLAG_VACANCY, &em->flags);
insert:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
- printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
- "[%llu %llu]\n", (unsigned long long)em->start,
- (unsigned long long)em->len,
- (unsigned long long)start,
- (unsigned long long)len);
+ btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
+ em->start, em->len, start, len);
err = -EIO;
goto out;
}
err = 0;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -5265,6 +6398,9 @@ insert:
}
write_unlock(&em_tree->lock);
out:
+
+ trace_btrfs_get_extent(root, em);
+
if (path)
btrfs_free_path(path);
if (trans) {
@@ -5276,6 +6412,7 @@ out:
free_extent_map(em);
return ERR_PTR(err);
}
+ BUG_ON(!em); /* Error is always set */
return em;
}
@@ -5296,10 +6433,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
return em;
if (em) {
/*
- * if our em maps to a hole, there might
- * actually be delalloc bytes behind it
+ * if our em maps to
+ * - a hole or
+ * - a pre-alloc extent,
+ * there might actually be delalloc bytes behind it.
*/
- if (em->block_start != EXTENT_MAP_HOLE)
+ if (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
return em;
else
hole_em = em;
@@ -5334,14 +6474,14 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
/* adjust the range_start to make sure it doesn't
* go backwards from the start they passed in
*/
- range_start = max(start,range_start);
+ range_start = max(start, range_start);
found = found_end - range_start;
if (found > 0) {
u64 hole_start = start;
u64 hole_len = len;
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
@@ -5381,6 +6521,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
em->start = range_start;
em->len = found;
@@ -5405,61 +6547,32 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
u64 start, u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
struct extent_map *em;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct btrfs_key ins;
u64 alloc_hint;
int ret;
- btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
-
- trans = btrfs_join_transaction(root, 0);
- if (IS_ERR(trans))
- return ERR_CAST(trans);
-
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
alloc_hint = get_extent_allocation_hint(inode, start, len);
- ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
- alloc_hint, (u64)-1, &ins, 1);
- if (ret) {
- em = ERR_PTR(ret);
- goto out;
- }
-
- em = alloc_extent_map(GFP_NOFS);
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- em->start = start;
- em->orig_start = em->start;
- em->len = ins.offset;
-
- em->block_start = ins.objectid;
- em->block_len = ins.offset;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
+ alloc_hint, &ins, 1, 1);
+ if (ret)
+ return ERR_PTR(ret);
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST)
- break;
- btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+ em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+ ins.offset, ins.offset, ins.offset, 0);
+ if (IS_ERR(em)) {
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+ return em;
}
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
- em = ERR_PTR(ret);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+ free_extent_map(em);
+ return ERR_PTR(ret);
}
-out:
- btrfs_end_transaction(trans, root);
+
return em;
}
@@ -5467,13 +6580,16 @@ out:
* returns 1 when the nocow is safe, < 1 on error, 0 if the
* block must be cow'd
*/
-static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
- struct inode *inode, u64 offset, u64 len)
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_path *path;
int ret;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 disk_bytenr;
@@ -5482,12 +6598,13 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
u64 num_bytes;
int slot;
int found_type;
+ bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
offset, 0);
if (ret < 0)
goto out;
@@ -5504,7 +6621,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != inode->i_ino ||
+ if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
/* not our file or wrong item type, must cow */
goto out;
@@ -5522,25 +6639,66 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
/* not a regular extent, must cow */
goto out;
}
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_end < offset + len) {
- /* extent doesn't include our full range, must cow */
+ if (extent_end <= offset)
+ goto out;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (disk_bytenr == 0)
+ goto out;
+
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
goto out;
+
+ backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (orig_start) {
+ *orig_start = key.offset - backref_offset;
+ *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
}
if (btrfs_extent_readonly(root, disk_bytenr))
goto out;
+ num_bytes = min(offset + *len, extent_end) - offset;
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 range_end;
+
+ range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+ ret = test_range_bit(io_tree, offset, range_end,
+ EXTENT_DELALLOC, 0, NULL);
+ if (ret) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
+ btrfs_release_path(path);
+
/*
* look for other files referencing this extent, if we
* find any we must cow
*/
- if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
- key.offset - backref_offset, disk_bytenr))
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
+ key.offset - backref_offset, disk_bytenr);
+ btrfs_end_transaction(trans, root);
+ if (ret) {
+ ret = 0;
goto out;
+ }
/*
* adjust disk_bytenr and num_bytes to cover just the bytes
@@ -5550,31 +6708,229 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
*/
disk_bytenr += backref_offset;
disk_bytenr += offset - key.offset;
- num_bytes = min(offset + len, extent_end) - offset;
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out;
/*
* all of the above have passed, it is safe to overwrite this extent
* without cow
*/
+ *len = num_bytes;
ret = 1;
out:
btrfs_free_path(path);
return ret;
}
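
To see what the trimming in can_nocow_extent() does with concrete numbers, here is a standalone sketch of the num_bytes/range_end arithmetic; all values are illustrative only.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
    static uint64_t round_up_u64(uint64_t x, uint64_t a)
    {
        return (x + a - 1) & ~(a - 1);
    }

    int main(void)
    {
        uint64_t sectorsize = 4096;
        uint64_t offset = 16384;       /* start of the DIO write */
        uint64_t len = 65536;          /* requested length */
        uint64_t extent_end = 32768;   /* end of the existing extent */

        /* Trim the nocow candidate to the part covered by this extent. */
        uint64_t num_bytes = min_u64(offset + len, extent_end) - offset;

        /* Range checked for delalloc when the extent is preallocated. */
        uint64_t range_end = round_up_u64(offset + num_bytes, sectorsize) - 1;

        printf("num_bytes=%llu range=[%llu,%llu]\n",
               (unsigned long long)num_bytes,          /* 16384 */
               (unsigned long long)offset,
               (unsigned long long)range_end);         /* [16384,32767] */
        return 0;
    }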
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
+{
+ struct radix_tree_root *root = &inode->i_mapping->page_tree;
+ int found = false;
+ void **pagep = NULL;
+ struct page *page = NULL;
+ int start_idx;
+ int end_idx;
+
+ start_idx = start >> PAGE_CACHE_SHIFT;
+
+ /*
+ * end is the last byte in the last page. end == start is legal
+ */
+ end_idx = end >> PAGE_CACHE_SHIFT;
+
+ rcu_read_lock();
+
+ /* Most of the code in this while loop is lifted from
+ * find_get_page. It's been modified to begin searching from a
+ * page and return just the first page found in that range. If the
+ * found idx is less than or equal to the end idx then we know that
+ * a page exists. If no pages are found or if those pages are
+ * outside of the range then we're fine (yay!) */
+ while (page == NULL &&
+ radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
+ page = radix_tree_deref_slot(pagep);
+ if (unlikely(!page))
+ break;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ page = NULL;
+ continue;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so return it without
+ * attempting to raise page count.
+ */
+ page = NULL;
+ break; /* TODO: Is this relevant for this use case? */
+ }
+
+ if (!page_cache_get_speculative(page)) {
+ page = NULL;
+ continue;
+ }
+
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != *pagep)) {
+ page_cache_release(page);
+ page = NULL;
+ }
+ }
+
+ if (page) {
+ if (page->index <= end_idx)
+ found = true;
+ page_cache_release(page);
+ }
+
+ rcu_read_unlock();
+ return found;
+}
+
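
The index math used by btrfs_page_exists_in_range() is easy to check in isolation; a tiny sketch with an assumed 4 KiB page size:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assuming 4 KiB pages */

    int main(void)
    {
        /* "end is the last byte in the last page" -- both bounds inclusive. */
        uint64_t start = 4096, end = 8191;

        uint64_t start_idx = start >> PAGE_SHIFT;   /* 1 */
        uint64_t end_idx   = end   >> PAGE_SHIFT;   /* 1 */

        /* A page found by the radix-tree walk only counts if its index is
         * <= end_idx; anything past that is outside the queried range. */
        printf("indices [%llu, %llu]\n",
               (unsigned long long)start_idx, (unsigned long long)end_idx);
        return 0;
    }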
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+ struct extent_state **cached_state, int writing)
+{
+ struct btrfs_ordered_extent *ordered;
+ int ret = 0;
+
+ while (1) {
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, cached_state);
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure there are no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either; we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered &&
+ (!writing ||
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)))
+ break;
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state, GFP_NOFS);
+
+ if (ordered) {
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /* Screw you mmap */
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ lockstart,
+ lockend);
+ if (ret)
+ break;
+
+ /*
+ * If we found a page that couldn't be invalidated just
+ * fall back to buffered.
+ */
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ lockstart >> PAGE_CACHE_SHIFT,
+ lockend >> PAGE_CACHE_SHIFT);
+ if (ret)
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+ u64 len, u64 orig_start,
+ u64 block_start, u64 block_len,
+ u64 orig_block_len, u64 ram_bytes,
+ int type)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ em = alloc_extent_map();
+ if (!em)
+ return ERR_PTR(-ENOMEM);
+
+ em->start = start;
+ em->orig_start = orig_start;
+ em->mod_start = start;
+ em->mod_len = len;
+ em->len = len;
+ em->block_len = block_len;
+ em->block_start = block_start;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->orig_block_len = orig_block_len;
+ em->ram_bytes = ram_bytes;
+ em->generation = -1;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ if (type == BTRFS_ORDERED_PREALLOC)
+ set_bit(EXTENT_FLAG_FILLING, &em->flags);
+
+ do {
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ } while (ret == -EEXIST);
+
+ if (ret) {
+ free_extent_map(em);
+ return ERR_PTR(ret);
+ }
+
+ return em;
+}
+
+
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
u64 start = iblock << inode->i_blkbits;
+ u64 lockstart, lockend;
u64 len = bh_result->b_size;
- struct btrfs_trans_handle *trans;
+ int unlock_bits = EXTENT_LOCKED;
+ int ret = 0;
+
+ if (create)
+ unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+ else
+ len = min_t(u64, len, root->sectorsize);
+
+ lockstart = start;
+ lockend = start + len - 1;
+
+ /*
+ * If this errors out it's because we couldn't invalidate pagecache for
+ * this range and we need to fallback to buffered.
+ */
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+ return -ENOTBLK;
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
/*
* Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5593,17 +6949,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
- return -ENOTBLK;
+ ret = -ENOTBLK;
+ goto unlock_err;
}
/* Just a good old fashioned hole, return */
if (!create && (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
free_extent_map(em);
- /* DIO will do one hole at a time, so just unlock a sector */
- unlock_extent(&BTRFS_I(inode)->io_tree, start,
- start + root->sectorsize - 1, GFP_NOFS);
- return 0;
+ goto unlock_err;
}
/*
@@ -5616,8 +6970,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
*
*/
if (!create) {
- len = em->len - (start - em->start);
- goto map;
+ len = min(len, em->len - (start - em->start));
+ lockstart = start + len;
+ goto unlock;
}
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5625,7 +6980,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
em->block_start != EXTENT_MAP_HOLE)) {
int type;
int ret;
- u64 block_start;
+ u64 block_start, orig_start, orig_block_len, ram_bytes;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
type = BTRFS_ORDERED_PREALLOC;
@@ -5634,28 +6989,29 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
len = min(len, em->len - (start - em->start));
block_start = em->block_start + (start - em->start);
- /*
- * we're not going to log anything, but we do need
- * to make sure the current transaction stays open
- * while we look for nocow cross refs
- */
- trans = btrfs_join_transaction(root, 0);
- if (IS_ERR(trans))
- goto must_cow;
+ if (can_nocow_extent(inode, start, &len, &orig_start,
+ &orig_block_len, &ram_bytes) == 1) {
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ em = create_pinned_em(inode, start, len,
+ orig_start,
+ block_start, len,
+ orig_block_len,
+ ram_bytes, type);
+ if (IS_ERR(em))
+ goto unlock_err;
+ }
- if (can_nocow_odirect(trans, inode, start, len) == 1) {
ret = btrfs_add_ordered_extent_dio(inode, start,
block_start, len, len, type);
- btrfs_end_transaction(trans, root);
if (ret) {
free_extent_map(em);
- return ret;
+ goto unlock_err;
}
goto unlock;
}
- btrfs_end_transaction(trans, root);
}
-must_cow:
+
/*
* this will cow the extent, reset the len in case we changed
* it above
@@ -5663,56 +7019,74 @@ must_cow:
len = bh_result->b_size;
free_extent_map(em);
em = btrfs_new_extent_direct(inode, start, len);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
len = min(len, em->len - (start - em->start));
unlock:
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
- 0, NULL, GFP_NOFS);
-map:
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
bh_result->b_bdev = em->bdev;
set_buffer_mapped(bh_result);
- if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
+ if (create) {
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
- free_extent_map(em);
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
- return 0;
-}
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
-struct btrfs_dio_private {
- struct inode *inode;
- u64 logical_offset;
- u64 disk_bytenr;
- u64 bytes;
- u32 *csums;
- void *private;
+ ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockstart + len - 1, EXTENT_DELALLOC, NULL,
+ &cached_state, GFP_NOFS);
+ BUG_ON(ret);
+ }
- /* number of bios pending for this dio */
- atomic_t pending_bios;
+ /*
+ * In the case of write we need to clear and unlock the entire range,
+ * in the case of read we need to unlock only the end area that we
+ * aren't using if there is any left over space.
+ */
+ if (lockstart < lockend) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, unlock_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ } else {
+ free_extent_state(cached_state);
+ }
- /* IO errors */
- int errors;
+ free_extent_map(em);
- struct bio *orig_bio;
-};
+ return 0;
+
+unlock_err:
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+ return ret;
+}
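
The bh_result->b_blocknr computation above is plain offset arithmetic between the requested file range and the extent map; a standalone sketch with made-up values (4 KiB blocks assumed):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int blkbits = 12;          /* assumed 4 KiB block size */
        uint64_t iblock = 10;               /* logical block asked for */
        uint64_t em_start = 8192;           /* extent map: file offset ... */
        uint64_t em_block_start = 1048576;  /* ... maps to this disk byte  */

        uint64_t start = iblock << blkbits;                   /* 40960 */
        uint64_t disk = em_block_start + (start - em_start);  /* 1081344 */
        uint64_t b_blocknr = disk >> blkbits;                 /* 264 */

        printf("b_blocknr=%llu\n", (unsigned long long)b_blocknr);
        return 0;
    }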
static void btrfs_endio_direct_read(struct bio *bio, int err)
{
struct btrfs_dio_private *dip = bio->bi_private;
- struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct bio_vec *bvec;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct bio *dio_bio;
+ u32 *csums = (u32 *)dip->csum;
u64 start;
- u32 *private = dip->csums;
+ int i;
start = dip->logical_offset;
- do {
+ bio_for_each_segment_all(bvec, bio, i) {
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
struct page *page = bvec->bv_page;
char *kaddr;
@@ -5720,35 +7094,36 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
unsigned long flags;
local_irq_save(flags);
- kaddr = kmap_atomic(page, KM_IRQ0);
- csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+ kaddr = kmap_atomic(page);
+ csum = btrfs_csum_data(kaddr + bvec->bv_offset,
csum, bvec->bv_len);
btrfs_csum_final(csum, (char *)&csum);
- kunmap_atomic(kaddr, KM_IRQ0);
+ kunmap_atomic(kaddr);
local_irq_restore(flags);
flush_dcache_page(bvec->bv_page);
- if (csum != *private) {
- printk(KERN_ERR "btrfs csum failed ino %lu off"
- " %llu csum %u private %u\n",
- inode->i_ino, (unsigned long long)start,
- csum, *private);
+ if (csum != csums[i]) {
+ btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_ino(inode), start, csum,
+ csums[i]);
err = -EIO;
}
}
start += bvec->bv_len;
- private++;
- bvec++;
- } while (bvec <= bvec_end);
+ }
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
- dip->logical_offset + dip->bytes - 1, GFP_NOFS);
- bio->bi_private = dip->private;
+ dip->logical_offset + dip->bytes - 1);
+ dio_bio = dip->dio_bio;
- kfree(dip->csums);
kfree(dip);
- dio_end_io(bio, err);
+
+ /* If we had a csum failure make sure to clear the uptodate flag */
+ if (err)
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ dio_end_io(dio_bio, err);
+ bio_put(bio);
}
static void btrfs_endio_direct_write(struct bio *bio, int err)
@@ -5756,11 +7131,10 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
struct btrfs_ordered_extent *ordered = NULL;
- struct extent_state *cached_state = NULL;
u64 ordered_offset = dip->logical_offset;
u64 ordered_bytes = dip->bytes;
+ struct bio *dio_bio;
int ret;
if (err)
@@ -5768,72 +7142,13 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
- ordered_bytes);
+ ordered_bytes, !err);
if (!ret)
goto out_test;
- BUG_ON(!ordered);
-
- trans = btrfs_join_transaction(root, 1);
- if (IS_ERR(trans)) {
- err = -ENOMEM;
- goto out;
- }
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = btrfs_ordered_update_i_size(inode, 0, ordered);
- if (!ret)
- ret = btrfs_update_inode(trans, root, inode);
- err = ret;
- goto out;
- }
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
- ordered->file_offset + ordered->len - 1, 0,
- &cached_state, GFP_NOFS);
-
- if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
- ret = btrfs_mark_extent_written(trans, inode,
- ordered->file_offset,
- ordered->file_offset +
- ordered->len);
- if (ret) {
- err = ret;
- goto out_unlock;
- }
- } else {
- ret = insert_reserved_file_extent(trans, inode,
- ordered->file_offset,
- ordered->start,
- ordered->disk_len,
- ordered->len,
- ordered->len,
- 0, 0, 0,
- BTRFS_FILE_EXTENT_REG);
- unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered->file_offset, ordered->len);
- if (ret) {
- err = ret;
- WARN_ON(1);
- goto out_unlock;
- }
- }
-
- add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
- btrfs_ordered_update_i_size(inode, 0, ordered);
- btrfs_update_inode(trans, root, inode);
-out_unlock:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
- ordered->file_offset + ordered->len - 1,
- &cached_state, GFP_NOFS);
-out:
- btrfs_delalloc_release_metadata(inode, ordered->len);
- btrfs_end_transaction(trans, root);
- ordered_offset = ordered->file_offset + ordered->len;
- btrfs_put_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
-
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(root->fs_info->endio_write_workers,
+ &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -5842,14 +7157,19 @@ out_test:
if (ordered_offset < dip->logical_offset + dip->bytes) {
ordered_bytes = dip->logical_offset + dip->bytes -
ordered_offset;
+ ordered = NULL;
goto again;
}
out_done:
- bio->bi_private = dip->private;
+ dio_bio = dip->dio_bio;
- kfree(dip->csums);
kfree(dip);
- dio_end_io(bio, err);
+
+ /* If we had an error make sure to clear the uptodate flag */
+ if (err)
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ dio_end_io(dio_bio, err);
+ bio_put(bio);
}
static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
@@ -5859,7 +7179,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -5868,27 +7188,28 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
struct btrfs_dio_private *dip = bio->bi_private;
if (err) {
- printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
- "sector %#Lx len %u err no %d\n",
- dip->inode->i_ino, bio->bi_rw,
- (unsigned long long)bio->bi_sector, bio->bi_size, err);
+ btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
+ "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+ btrfs_ino(dip->inode), bio->bi_rw,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, err);
dip->errors = 1;
/*
* before the atomic variable goes to zero, we must make sure
* dip->errors is perceived to be set.
*/
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
}
/* if there are more bios still pending for this dio, just exit */
if (!atomic_dec_and_test(&dip->pending_bios))
goto out;
- if (dip->errors)
+ if (dip->errors) {
bio_io_error(dip->orig_bio);
- else {
- set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+ } else {
+ set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
bio_endio(dip->orig_bio, 0);
}
out:
@@ -5904,29 +7225,51 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int rw, u64 file_offset, int skip_sum,
- u32 *csums)
+ int async_submit)
{
+ struct btrfs_dio_private *dip = bio->bi_private;
int write = rw & REQ_WRITE;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
+ if (async_submit)
+ async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
bio_get(bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- if (ret)
- goto err;
- if (write && !skip_sum) {
+ if (!write) {
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ if (ret)
+ goto err;
+ }
+
+ if (skip_sum)
+ goto map;
+
+ if (write && async_submit) {
ret = btrfs_wq_submit_bio(root->fs_info,
inode, rw, bio, 0, 0,
file_offset,
__btrfs_submit_bio_start_direct_io,
__btrfs_submit_bio_done);
goto err;
- } else if (!skip_sum)
- btrfs_lookup_bio_sums_dio(root, inode, bio,
- file_offset, csums);
+ } else if (write) {
+ /*
+ * If we aren't doing async submit, calculate the csum of the
+ * bio now.
+ */
+ ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
+ if (ret)
+ goto err;
+ } else if (!skip_sum) {
+ ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
+ file_offset);
+ if (ret)
+ goto err;
+ }
- ret = btrfs_map_bio(root, rw, bio, 0, 1);
+map:
+ ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
err:
bio_put(bio);
return ret;
@@ -5937,17 +7280,36 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
{
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
struct bio *bio;
struct bio *orig_bio = dip->orig_bio;
struct bio_vec *bvec = orig_bio->bi_io_vec;
- u64 start_sector = orig_bio->bi_sector;
+ u64 start_sector = orig_bio->bi_iter.bi_sector;
u64 file_offset = dip->logical_offset;
u64 submit_len = 0;
u64 map_length;
int nr_pages = 0;
- u32 *csums = dip->csums;
int ret = 0;
+ int async_submit = 0;
+
+ map_length = orig_bio->bi_iter.bi_size;
+ ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
+ &map_length, NULL, 0);
+ if (ret) {
+ bio_put(orig_bio);
+ return -EIO;
+ }
+
+ if (map_length >= orig_bio->bi_iter.bi_size) {
+ bio = orig_bio;
+ goto submit;
+ }
+
+ /* async crcs make it difficult to collect full stripe writes. */
+ if (btrfs_get_alloc_profile(root, 1) &
+ (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+ async_submit = 0;
+ else
+ async_submit = 1;
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
if (!bio)
@@ -5956,14 +7318,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio->bi_end_io = btrfs_end_dio_bio;
atomic_inc(&dip->pending_bios);
- map_length = orig_bio->bi_size;
- ret = btrfs_map_block(map_tree, READ, start_sector << 9,
- &map_length, NULL, 0);
- if (ret) {
- bio_put(bio);
- return -EIO;
- }
-
while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
if (unlikely(map_length < submit_len + bvec->bv_len ||
bio_add_page(bio, bvec->bv_page, bvec->bv_len,
@@ -5977,15 +7331,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
atomic_inc(&dip->pending_bios);
ret = __btrfs_submit_dio_bio(bio, inode, rw,
file_offset, skip_sum,
- csums);
+ async_submit);
if (ret) {
bio_put(bio);
atomic_dec(&dip->pending_bios);
goto out_err;
}
- if (!skip_sum)
- csums = csums + nr_pages;
start_sector += submit_len >> 9;
file_offset += submit_len;
@@ -5999,8 +7351,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
- map_length = orig_bio->bi_size;
- ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+ map_length = orig_bio->bi_iter.bi_size;
+ ret = btrfs_map_block(root->fs_info, rw,
+ start_sector << 9,
&map_length, NULL, 0);
if (ret) {
bio_put(bio);
@@ -6008,13 +7361,14 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
}
} else {
submit_len += bvec->bv_len;
- nr_pages ++;
+ nr_pages++;
bvec++;
}
}
+submit:
ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
- csums);
+ async_submit);
if (!ret)
return 0;
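
The splitting logic in btrfs_submit_direct_hook() only ever packs as much of the original bio as one mapped stripe can take; below is a simplified standalone sketch of the length bookkeeping. A fixed map_length and one page per bio_vec are assumptions here; the real code re-maps each sub-bio.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Illustrative numbers: a 256 KiB direct I/O over 64 KiB stripes. */
        uint64_t bio_size = 256 * 1024;
        uint64_t map_length = 64 * 1024;   /* from the block mapping */
        uint64_t bvec_len = 4096;          /* one page per bio_vec */

        if (map_length >= bio_size) {
            printf("submit the original bio unchanged\n");
            return 0;
        }

        /* Otherwise pack bio_vecs until the next one would cross the stripe. */
        uint64_t submit_len = 0, nbios = 0, done = 0;
        while (done < bio_size) {
            if (submit_len + bvec_len > map_length) {
                nbios++;            /* submit what we have, start a new bio */
                submit_len = 0;
            }
            submit_len += bvec_len;
            done += bvec_len;
        }
        if (submit_len)
            nbios++;
        printf("split into %llu bios\n", (unsigned long long)nbios);  /* 4 */
        return 0;
    }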
@@ -6025,7 +7379,7 @@ out_err:
* before the atomic variable goes to zero, we must
* make sure dip->errors is perceived to be set.
*/
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
if (atomic_dec_and_test(&dip->pending_bios))
bio_io_error(dip->orig_bio);
@@ -6033,58 +7387,64 @@ out_err:
return 0;
}
-static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
- loff_t file_offset)
+static void btrfs_submit_direct(int rw, struct bio *dio_bio,
+ struct inode *inode, loff_t file_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_private *dip;
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct bio *io_bio;
int skip_sum;
+ int sum_len;
int write = rw & REQ_WRITE;
int ret = 0;
+ u16 csum_size;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- dip = kmalloc(sizeof(*dip), GFP_NOFS);
- if (!dip) {
+ io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
+ if (!io_bio) {
ret = -ENOMEM;
goto free_ordered;
}
- dip->csums = NULL;
- if (!skip_sum) {
- dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
- if (!dip->csums) {
- kfree(dip);
- ret = -ENOMEM;
- goto free_ordered;
- }
+ if (!skip_sum && !write) {
+ csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ sum_len = dio_bio->bi_iter.bi_size >>
+ inode->i_sb->s_blocksize_bits;
+ sum_len *= csum_size;
+ } else {
+ sum_len = 0;
}
- dip->private = bio->bi_private;
+ dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+ if (!dip) {
+ ret = -ENOMEM;
+ goto free_io_bio;
+ }
+
+ dip->private = dio_bio->bi_private;
dip->inode = inode;
dip->logical_offset = file_offset;
-
- dip->bytes = 0;
- do {
- dip->bytes += bvec->bv_len;
- bvec++;
- } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
-
- dip->disk_bytenr = (u64)bio->bi_sector << 9;
- bio->bi_private = dip;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
+ io_bio->bi_private = dip;
dip->errors = 0;
- dip->orig_bio = bio;
+ dip->orig_bio = io_bio;
+ dip->dio_bio = dio_bio;
atomic_set(&dip->pending_bios, 0);
if (write)
- bio->bi_end_io = btrfs_endio_direct_write;
+ io_bio->bi_end_io = btrfs_endio_direct_write;
else
- bio->bi_end_io = btrfs_endio_direct_read;
+ io_bio->bi_end_io = btrfs_endio_direct_read;
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
+
+free_io_bio:
+ bio_put(io_bio);
+
free_ordered:
/*
* If this is a write, we need to clean up the reserved space and kill
@@ -6096,134 +7456,127 @@ free_ordered:
if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len);
+ ordered->disk_len, 1);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
- bio_endio(bio, ret);
+ bio_endio(dio_bio, ret);
}
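
The checksum-area sizing for the read path above is a one-line computation; with assumed values (4 KiB blocks, 4-byte crc32c) it comes out as follows:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Illustrative values: 1 MiB read, 4 KiB blocks, 4-byte csums
         * (crc32c is the assumed checksum here). */
        uint64_t bio_size = 1024 * 1024;
        unsigned int blocksize_bits = 12;
        unsigned int csum_size = 4;

        uint64_t sum_len = (bio_size >> blocksize_bits) * csum_size;

        printf("checksum area: %llu bytes for %llu blocks\n",
               (unsigned long long)sum_len,                     /* 1024 */
               (unsigned long long)(bio_size >> blocksize_bits));
        return 0;
    }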
static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ const struct iov_iter *iter, loff_t offset)
{
int seg;
- size_t size;
- unsigned long addr;
+ int i;
unsigned blocksize_mask = root->sectorsize - 1;
ssize_t retval = -EINVAL;
- loff_t end = offset;
if (offset & blocksize_mask)
goto out;
- /* Check the memory alignment. Blocks cannot straddle pages */
- for (seg = 0; seg < nr_segs; seg++) {
- addr = (unsigned long)iov[seg].iov_base;
- size = iov[seg].iov_len;
- end += size;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ goto out;
+
+ /* If this is a write we don't need to check anymore */
+ if (rw & WRITE)
+ return 0;
+ /*
+ * Check to make sure we don't have duplicate iov_base's in this
+ * iovec; if so, return EINVAL, otherwise we'll get csum errors
+ * when reading back.
+ */
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+ goto out;
+ }
}
retval = 0;
out:
return retval;
}
+
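
check_direct_IO() boils down to two alignment tests plus, for reads, a duplicate-buffer scan; here is a self-contained sketch of the same validation, with a simplified return convention and an assumed 4 KiB sector size:

    #include <stdint.h>
    #include <stdio.h>

    struct iovec_lite { void *iov_base; uint64_t iov_len; };

    /* Returns 0 if the request looks usable for direct I/O, -1 otherwise. */
    static int check_dio(uint64_t offset, const struct iovec_lite *iov,
                         int nr_segs, uint64_t sectorsize, int writing)
    {
        uint64_t mask = sectorsize - 1;

        if (offset & mask)
            return -1;
        for (int i = 0; i < nr_segs; i++)
            if (((uintptr_t)iov[i].iov_base & mask) || (iov[i].iov_len & mask))
                return -1;
        if (writing)
            return 0;
        /* Reads additionally reject duplicate buffers: the same user page
         * filled twice would later fail the checksum comparison. */
        for (int i = 0; i < nr_segs; i++)
            for (int j = i + 1; j < nr_segs; j++)
                if (iov[i].iov_base == iov[j].iov_base)
                    return -1;
        return 0;
    }

    int main(void)
    {
        static char buf[8192] __attribute__((aligned(4096)));
        struct iovec_lite ok[2]  = { { buf, 4096 }, { buf + 4096, 4096 } };
        struct iovec_lite dup[2] = { { buf, 4096 }, { buf, 4096 } };

        printf("ok read:  %d\n", check_dio(0, ok, 2, 4096, 0));   /* 0  */
        printf("dup read: %d\n", check_dio(0, dup, 2, 4096, 0));  /* -1 */
        return 0;
    }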
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- u64 lockstart, lockend;
+ size_t count = 0;
+ int flags = 0;
+ bool wakeup = true;
+ bool relock = false;
ssize_t ret;
- int writing = rw & WRITE;
- int write_bits = 0;
- size_t count = iov_length(iov, nr_segs);
- if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
- offset, nr_segs)) {
+ if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
return 0;
- }
- lockstart = offset;
- lockend = offset + count - 1;
+ atomic_inc(&inode->i_dio_count);
+ smp_mb__after_atomic();
- if (writing) {
- ret = btrfs_delalloc_reserve_space(inode, count);
- if (ret)
- goto out;
- }
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so
+ * we need to flush the dirty pages again to make absolutely sure
+ * that any outstanding dirty pages are on disk.
+ */
+ count = iov_iter_count(iter);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_fdatawrite_range(inode->i_mapping, offset, count);
- while (1) {
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state, GFP_NOFS);
+ if (rw & WRITE) {
/*
- * We're concerned with the entire range that we're going to be
- * doing DIO to, so we need to make sure theres no ordered
- * extents in this range.
+ * If the write DIO is beyond the EOF, we need to update
+ * the isize, but it is protected by i_mutex. So we can
+ * not unlock the i_mutex in this case.
*/
- ordered = btrfs_lookup_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
- if (!ordered)
- break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state, GFP_NOFS);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- cond_resched();
- }
-
- /*
- * we don't use btrfs_set_extent_delalloc because we don't want
- * the dirty or uptodate bits
- */
- if (writing) {
- write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
- ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_DELALLOC, 0, NULL, &cached_state,
- GFP_NOFS);
- if (ret) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_LOCKED | write_bits,
- 1, 0, &cached_state, GFP_NOFS);
- goto out;
+ if (offset + count <= inode->i_size) {
+ mutex_unlock(&inode->i_mutex);
+ relock = true;
}
+ ret = btrfs_delalloc_reserve_space(inode, count);
+ if (ret)
+ goto out;
+ } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags))) {
+ inode_dio_done(inode);
+ flags = DIO_LOCKING | DIO_SKIP_HOLES;
+ wakeup = false;
}
- free_extent_state(cached_state);
- cached_state = NULL;
-
ret = __blockdev_direct_IO(rw, iocb, inode,
- BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, 0);
-
- if (ret < 0 && ret != -EIOCBQUEUED) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
- offset + iov_length(iov, nr_segs) - 1,
- EXTENT_LOCKED | write_bits, 1, 0,
- &cached_state, GFP_NOFS);
- } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
- /*
- * We're falling back to buffered, unlock the section we didn't
- * do IO on.
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
- offset + iov_length(iov, nr_segs) - 1,
- EXTENT_LOCKED | write_bits, 1, 0,
- &cached_state, GFP_NOFS);
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iter, offset, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, flags);
+ if (rw & WRITE) {
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ btrfs_delalloc_release_space(inode, count);
+ else if (ret >= 0 && (size_t)ret < count)
+ btrfs_delalloc_release_space(inode,
+ count - (size_t)ret);
+ else
+ btrfs_delalloc_release_metadata(inode, 0);
}
out:
- free_extent_state(cached_state);
+ if (wakeup)
+ inode_dio_done(inode);
+ if (relock)
+ mutex_lock(&inode->i_mutex);
+
return ret;
}
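
The write-side reservation handling after __blockdev_direct_IO() has three outcomes; a small sketch of that bookkeeping follows. The "fully written or queued" case is simplified here — the real code releases only metadata in that branch — and the EIOCBQUEUED value is an assumption whose exact number does not matter for the logic.

    #include <stdint.h>
    #include <stdio.h>

    #define EIOCBQUEUED 529   /* assumed value; only its identity matters */

    /* How much of the up-front reservation is handed back once the DIO
     * submission returns, following the three cases in the hunk above. */
    static long long space_to_release(long long ret, long long count)
    {
        if (ret < 0 && ret != -EIOCBQUEUED)
            return count;          /* nothing was written, give it all back */
        if (ret >= 0 && ret < count)
            return count - ret;    /* partial write, keep only what was used */
        return 0;                  /* fully written or queued asynchronously */
    }

    int main(void)
    {
        long long count = 1 << 20;     /* reserved 1 MiB before submission */

        printf("error:   release %lld\n", space_to_release(-5, count));
        printf("partial: release %lld\n", space_to_release(256 * 1024, count));
        printf("full:    release %lld\n", space_to_release(count, count));
        return 0;
    }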
+#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
+
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
+ int ret;
+
+ ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+ if (ret)
+ return ret;
+
return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
}
@@ -6231,7 +7584,7 @@ int btrfs_readpage(struct file *file, struct page *page)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btrfs_get_extent);
+ return extent_read_full_page(tree, page, btrfs_get_extent, 0);
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6248,8 +7601,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
}
-int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
struct extent_io_tree *tree;
@@ -6290,14 +7643,16 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
}
-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
+ struct inode *inode = page->mapping->host;
struct extent_io_tree *tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-
+ int inode_evicting = inode->i_state & I_FREEING;
/*
* we have the page locked, so new writeback can't start,
@@ -6308,41 +7663,65 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
wait_on_page_writeback(page);
- tree = &BTRFS_I(page->mapping->host)->io_tree;
+ tree = &BTRFS_I(inode)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
- ordered = btrfs_lookup_ordered_extent(page->mapping->host,
- page_offset(page));
+
+ if (!inode_evicting)
+ lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
/*
* IO on this page will never be started, so we need
* to account for any ordered extents now
*/
- clear_extent_bit(tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
- &cached_state, GFP_NOFS);
+ if (!inode_evicting)
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 0, &cached_state,
+ GFP_NOFS);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
*/
if (TestClearPagePrivate2(page)) {
- btrfs_finish_ordered_io(page->mapping->host,
- page_start, page_end);
+ struct btrfs_ordered_inode_tree *tree;
+ u64 new_len;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+
+ spin_lock_irq(&tree->lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ new_len = page_start - ordered->file_offset;
+ if (new_len < ordered->truncated_len)
+ ordered->truncated_len = new_len;
+ spin_unlock_irq(&tree->lock);
+
+ if (btrfs_dec_test_ordered_pending(inode, &ordered,
+ page_start,
+ PAGE_CACHE_SIZE, 1))
+ btrfs_finish_ordered_io(ordered);
}
btrfs_put_ordered_extent(ordered);
- cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ if (!inode_evicting) {
+ cached_state = NULL;
+ lock_extent_bits(tree, page_start, page_end, 0,
+ &cached_state);
+ }
+ }
+
+ if (!inode_evicting) {
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_LOCKED | EXTENT_DIRTY |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 1,
+ &cached_state, GFP_NOFS);
+
+ __btrfs_releasepage(page, GFP_NOFS);
}
- clear_extent_bit(tree, page_start, page_end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
- __btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
if (PagePrivate(page)) {
@@ -6370,7 +7749,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = fdentry(vma->vm_file)->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
@@ -6379,16 +7758,24 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unsigned long zero_start;
loff_t size;
int ret;
+ int reserved = 0;
u64 page_start;
u64 page_end;
+ sb_start_pagefault(inode->i_sb);
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ if (!ret) {
+ ret = file_update_time(vma->vm_file);
+ reserved = 1;
+ }
if (ret) {
if (ret == -ENOMEM)
ret = VM_FAULT_OOM;
else /* -ENOSPC, -EIO, etc */
ret = VM_FAULT_SIGBUS;
- goto out;
+ if (reserved)
+ goto out;
+ goto out_noreserve;
}
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
@@ -6405,8 +7792,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
- GFP_NOFS);
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
set_page_extent_mapped(page);
/*
@@ -6431,7 +7817,8 @@ again:
* prepare_pages in the normal write path.
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6462,42 +7849,94 @@ again:
BTRFS_I(inode)->last_trans = root->fs_info->generation;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
out_unlock:
- if (!ret)
+ if (!ret) {
+ sb_end_pagefault(inode->i_sb);
return VM_FAULT_LOCKED;
+ }
unlock_page(page);
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out:
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+out_noreserve:
+ sb_end_pagefault(inode->i_sb);
return ret;
}
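
The reworked exit paths in btrfs_page_mkwrite() above unwind only what was actually acquired: the out label drops the page reservation, while out_noreserve only ends the pagefault section. As an illustrative aside (not part of this patch), here is a minimal standalone C sketch of that goto-unwind pattern, with hypothetical resource names in place of the kernel APIs:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the pagefault guard and the space
 * reservation; only what was actually acquired is undone. */
static int acquire_guard(void)      { return 0; }   /* 0 on success */
static void release_guard(void)     { puts("guard released"); }
static int reserve_space(size_t n)  { return n ? 0 : -1; }
static void release_space(size_t n) { printf("released %zu bytes\n", n); }
static int do_work(void)            { return 0; }

static int handle_fault(size_t len)
{
	int ret;

	ret = acquire_guard();
	if (ret)
		return ret;          /* nothing acquired yet */

	ret = reserve_space(len);
	if (ret)
		goto out_noreserve;  /* guard held, reservation not taken */

	ret = do_work();
	if (ret)
		goto out;            /* both held: unwind both */

	/* success: keep the reservation, drop only the guard */
	release_guard();
	return 0;

out:
	release_space(len);
out_noreserve:
	release_guard();
	return ret;
}

int main(void)
{
	return handle_fault(4096) ? EXIT_FAILURE : EXIT_SUCCESS;
}
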
-static void btrfs_truncate(struct inode *inode)
+static int btrfs_truncate(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret;
+ struct btrfs_block_rsv *rsv;
+ int ret = 0;
+ int err = 0;
struct btrfs_trans_handle *trans;
- unsigned long nr;
u64 mask = root->sectorsize - 1;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- if (!S_ISREG(inode->i_mode)) {
- WARN_ON(1);
- return;
- }
-
- ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+ (u64)-1);
if (ret)
- return;
+ return ret;
- btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ /*
+	 * Yes, ladies and gentlemen, this is indeed ugly. The fact is we have
+ * 3 things going on here
+ *
+ * 1) We need to reserve space for our orphan item and the space to
+ * delete our orphan item. Lord knows we don't want to have a dangling
+ * orphan item because we didn't reserve space to remove it.
+ *
+ * 2) We need to reserve space to update our inode.
+ *
+ * 3) We need to have something to cache all the space that is going to
+	 * be freed up by the truncate operation, but also have some slack
+ * space reserved in case it uses space during the truncate (thank you
+ * very much snapshotting).
+ *
+	 * And we need these to all be separate. The fact is we can use a lot of
+ * space doing the truncate, and we have no earthly idea how much space
+	 * we will use, so we need the truncate reservation to be separate so it
+ * doesn't end up using space reserved for updating the inode or
+ * removing the orphan item. We also need to be able to stop the
+ * transaction and start a new one, which means we need to be able to
+	 * update the inode several times, and we have no way of knowing how
+	 * many times that will be, so we can't just reserve 1 item for the
+	 * entirety of the operation, so that has to be done separately as well.
+ * Then there is the orphan item, which does indeed need to be held on
+ * to for the whole operation, and we need nobody to touch this reserved
+ * space except the orphan code.
+ *
+ * So that leaves us with
+ *
+ * 1) root->orphan_block_rsv - for the orphan deletion.
+ * 2) rsv - for the truncate reservation, which we will steal from the
+ * transaction reservation.
+	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
+ * updating the inode.
+ */
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv)
+ return -ENOMEM;
+ rsv->size = min_size;
+ rsv->failfast = 1;
+
+ /*
+ * 1 for the truncate slack space
+ * 1 for updating the inode.
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out;
+ }
- trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = root->orphan_block_rsv;
+ /* Migrate the slack space for the truncate to our reserve */
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ min_size);
+ BUG_ON(ret);
/*
* setattr is responsible for setting the ordered_data_close flag,
@@ -6516,54 +7955,76 @@ static void btrfs_truncate(struct inode *inode)
* using truncate to replace the contents of the file will
* end up with a zero length file after a crash.
*/
- if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+ if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags))
btrfs_add_ordered_operation(trans, root, inode);
- while (1) {
- if (!trans) {
- trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = root->orphan_block_rsv;
- }
-
- ret = btrfs_block_rsv_check(trans, root,
- root->orphan_block_rsv, 0, 5);
- if (ret) {
- BUG_ON(ret != -EAGAIN);
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
- trans = NULL;
- continue;
- }
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ */
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ trans->block_rsv = rsv;
+ while (1) {
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
- if (ret != -EAGAIN)
+ if (ret != -ENOSPC) {
+ err = ret;
break;
+ }
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ break;
+ }
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- trans = NULL;
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
+
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = err = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ rsv, min_size);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
}
if (ret == 0 && inode->i_nlink > 0) {
+ trans->block_rsv = root->orphan_block_rsv;
ret = btrfs_orphan_del(trans, inode);
- BUG_ON(ret);
+ if (ret)
+ err = ret;
}
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ if (trans) {
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && !err)
+ err = ret;
- nr = trans->blocks_used;
- ret = btrfs_end_transaction_throttle(trans, root);
- BUG_ON(ret);
- btrfs_btree_balance_dirty(root, nr);
+ ret = btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+ }
+
+out:
+ btrfs_free_block_rsv(root, rsv);
+
+ if (ret && !err)
+ err = ret;
+
+ return err;
}
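
The long comment above describes what the loop after it does: keep a small dedicated reserve for the truncate and top it back up from the shared transaction reserve (via btrfs_block_rsv_migrate) each time the transaction is restarted, so inode updates and orphan handling never eat the truncate's slack. Below is a toy userspace model of that refill pattern, offered purely as illustration; all numbers and names are invented and no kernel semantics are implied.

#include <stdio.h>

/* Toy model of the refill loop: a dedicated reserve is topped up to
 * min_size from the shared pool each time the "transaction" restarts. */
struct rsv { long size; };

static void migrate(struct rsv *from, struct rsv *to, long amount)
{
	from->size -= amount;
	to->size += amount;
}

int main(void)
{
	struct rsv shared = { .size = 64 };   /* stands in for trans_block_rsv */
	struct rsv trunc  = { .size = 0 };    /* stands in for the temp rsv    */
	const long min_size = 8;
	long remaining = 29;                  /* "items" left to truncate      */

	migrate(&shared, &trunc, min_size);   /* initial slack for round one   */

	while (remaining > 0) {
		long step = remaining < trunc.size ? remaining : trunc.size;

		remaining  -= step;           /* drop up to one reserve's worth */
		trunc.size -= step;

		if (remaining == 0)
			break;

		/* "end the transaction", then top the reserve back up to
		 * min_size for the next round */
		migrate(&shared, &trunc, min_size - trunc.size);
		printf("restart: %ld items left, reserve back to %ld\n",
		       remaining, trunc.size);
	}

	printf("done, shared pool left: %ld\n", shared.size);
	return 0;
}
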
/*
@@ -6571,40 +8032,35 @@ static void btrfs_truncate(struct inode *inode)
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- u64 new_dirid, u64 alloc_hint)
+ struct btrfs_root *parent_root,
+ u64 new_dirid)
{
struct inode *inode;
int err;
u64 index = 0;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
- new_dirid, alloc_hint, S_IFDIR | 0700, &index);
+ inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
+ new_dirid, new_dirid,
+ S_IFDIR | (~current_umask() & S_IRWXUGO),
+ &index);
if (IS_ERR(inode))
return PTR_ERR(inode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
btrfs_i_size_write(inode, 0);
+ err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
+ if (err)
+ btrfs_err(new_root->fs_info,
+ "error inheriting subvolume %llu properties: %d",
+ new_root->root_key.objectid, err);
+
err = btrfs_update_inode(trans, new_root, inode);
- BUG_ON(err);
iput(inode);
- return 0;
-}
-
-/* helper function for file defrag and space balancing. This
- * forces readahead on a given range of bytes in an inode
- */
-unsigned long btrfs_force_ra(struct address_space *mapping,
- struct file_ra_state *ra, struct file *file,
- pgoff_t offset, pgoff_t last_index)
-{
- pgoff_t req_size = last_index - offset + 1;
-
- page_cache_sync_readahead(mapping, ra, file, offset, req_size);
- return offset + req_size;
+ return err;
}
struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -6617,35 +8073,38 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
return NULL;
ei->root = NULL;
- ei->space_info = NULL;
ei->generation = 0;
- ei->sequence = 0;
ei->last_trans = 0;
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
- ei->reserved_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
+ ei->dir_index = 0;
ei->last_unlink_trans = 0;
+ ei->last_log_commit = 0;
- spin_lock_init(&ei->accounting_lock);
- atomic_set(&ei->outstanding_extents, 0);
+ spin_lock_init(&ei->lock);
+ ei->outstanding_extents = 0;
ei->reserved_extents = 0;
- ei->ordered_data_close = 0;
- ei->orphan_meta_reserved = 0;
- ei->dummy_inode = 0;
+ ei->runtime_flags = 0;
ei->force_compress = BTRFS_COMPRESS_NONE;
+ ei->delayed_node = NULL;
+
inode = &ei->vfs_inode;
- extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
- extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
- extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+ extent_map_tree_init(&ei->extent_tree);
+ extent_io_tree_init(&ei->io_tree, &inode->i_data);
+ extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+ ei->io_tree.track_uptodate = 1;
+ ei->io_failure_tree.track_uptodate = 1;
+ atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
+ mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
- INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->ordered_operations);
RB_CLEAR_NODE(&ei->rb_node);
@@ -6653,10 +8112,17 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
return inode;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode)
+{
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+#endif
+
static void btrfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -6665,10 +8131,12 @@ void btrfs_destroy_inode(struct inode *inode)
struct btrfs_ordered_extent *ordered;
struct btrfs_root *root = BTRFS_I(inode)->root;
- WARN_ON(!list_empty(&inode->i_dentry));
+ WARN_ON(!hlist_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
- WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+ WARN_ON(BTRFS_I(inode)->outstanding_extents);
WARN_ON(BTRFS_I(inode)->reserved_extents);
+ WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+ WARN_ON(BTRFS_I(inode)->csum_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -6684,43 +8152,25 @@ void btrfs_destroy_inode(struct inode *inode)
*/
smp_mb();
if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
}
- if (root == root->fs_info->tree_root) {
- struct btrfs_block_group_cache *block_group;
-
- block_group = btrfs_lookup_block_group(root->fs_info,
- BTRFS_I(inode)->block_group);
- if (block_group && block_group->inode == inode) {
- spin_lock(&block_group->lock);
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- btrfs_put_block_group(block_group);
- } else if (block_group) {
- btrfs_put_block_group(block_group);
- }
+ if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags)) {
+ btrfs_info(root->fs_info, "inode %llu still on the orphan list",
+ btrfs_ino(inode));
+ atomic_dec(&root->orphan_inodes);
}
- spin_lock(&root->orphan_lock);
- if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
- inode->i_ino);
- list_del_init(&BTRFS_I(inode)->i_orphan);
- }
- spin_unlock(&root->orphan_lock);
-
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
if (!ordered)
break;
else {
- printk(KERN_ERR "btrfs found ordered "
- "extent %llu %llu on inode cleanup\n",
- (unsigned long long)ordered->file_offset,
- (unsigned long long)ordered->len);
+ btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
+ ordered->file_offset, ordered->len);
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -6736,8 +8186,11 @@ int btrfs_drop_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- if (btrfs_root_refs(&root->root_item) == 0 &&
- root != root->fs_info->tree_root)
+ if (root == NULL)
+ return 1;
+
+	/* the snap/subvol tree is being deleted */
+ if (btrfs_root_refs(&root->root_item) == 0)
return 1;
else
return generic_drop_inode(inode);
@@ -6752,6 +8205,11 @@ static void init_once(void *foo)
void btrfs_destroy_cachep(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy the cache.
+ */
+ rcu_barrier();
if (btrfs_inode_cachep)
kmem_cache_destroy(btrfs_inode_cachep);
if (btrfs_trans_handle_cachep)
@@ -6760,34 +8218,51 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_transaction_cachep);
if (btrfs_path_cachep)
kmem_cache_destroy(btrfs_path_cachep);
+ if (btrfs_free_space_cachep)
+ kmem_cache_destroy(btrfs_free_space_cachep);
+ if (btrfs_delalloc_work_cachep)
+ kmem_cache_destroy(btrfs_delalloc_work_cachep);
}
int btrfs_init_cachep(void)
{
- btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+ btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
if (!btrfs_inode_cachep)
goto fail;
- btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+ btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
sizeof(struct btrfs_trans_handle), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_trans_handle_cachep)
goto fail;
- btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+ btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
sizeof(struct btrfs_transaction), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_transaction_cachep)
goto fail;
- btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+ btrfs_path_cachep = kmem_cache_create("btrfs_path",
sizeof(struct btrfs_path), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_path_cachep)
goto fail;
+ btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
+ sizeof(struct btrfs_free_space), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_free_space_cachep)
+ goto fail;
+
+ btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+ sizeof(struct btrfs_delalloc_work), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_delalloc_work_cachep)
+ goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
@@ -6797,12 +8272,19 @@ fail:
static int btrfs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
+ u64 delalloc_bytes;
struct inode *inode = dentry->d_inode;
+ u32 blocksize = inode->i_sb->s_blocksize;
+
generic_fillattr(inode, stat);
- stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
+ stat->dev = BTRFS_I(inode)->root->anon_dev;
stat->blksize = PAGE_CACHE_SIZE;
- stat->blocks = (inode_get_bytes(inode) +
- BTRFS_I(inode)->delalloc_bytes) >> 9;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
+ ALIGN(delalloc_bytes, blocksize)) >> 9;
return 0;
}
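
The btrfs_getattr() change rounds both the on-disk byte count and any outstanding delalloc bytes up to the filesystem block size before converting to 512-byte stat blocks. A standalone example of the arithmetic, with made-up sizes:

#include <stdio.h>
#include <stdint.h>

/* Round x up to the next multiple of a (a must be a power of two),
 * mirroring the kernel's ALIGN() macro used above. */
static uint64_t align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	uint64_t blocksize      = 4096;   /* sb->s_blocksize        */
	uint64_t on_disk_bytes  = 6000;   /* inode_get_bytes(inode) */
	uint64_t delalloc_bytes = 100;    /* not yet written back   */

	/* 6000 -> 8192 and 100 -> 4096, so 12288 bytes = 24 512-byte
	 * blocks, versus (6000 + 100) >> 9 = 11 with the old formula. */
	uint64_t blocks = (align_up(on_disk_bytes, blocksize) +
			   align_up(delalloc_bytes, blocksize)) >> 9;

	printf("st_blocks = %llu\n", (unsigned long long)blocks);
	return 0;
}
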
@@ -6818,21 +8300,43 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
u64 index = 0;
u64 root_objectid;
int ret;
+ u64 old_ino = btrfs_ino(old_inode);
- if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
/* we only allow rename subvolume link between subvolumes */
- if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
- if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
- (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
+ if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+ (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
return -ENOTEMPTY;
if (S_ISDIR(old_inode->i_mode) && new_inode &&
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
+
+
+ /* check for collisions, even if the name isn't there */
+ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+
+ if (ret) {
+ if (ret == -EEXIST) {
+			/* we shouldn't get
+			 * -EEXIST without a new_inode */
+ if (WARN_ON(!new_inode)) {
+ return ret;
+ }
+ } else {
+ /* maybe -EOVERFLOW */
+ return ret;
+ }
+ }
+ ret = 0;
+
/*
* we're using rename to replace one file with another.
* and the replacement file is large. Start IO on it now so
@@ -6843,7 +8347,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
filemap_flush(old_inode->i_mapping);
/* close the racy window with snapshot create/destroy ioctl */
- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&root->fs_info->subvol_sem);
/*
* We want to reserve the absolute worst case amount of items. So if
@@ -6853,11 +8357,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
* should cover the worst case number of items we'll modify.
*/
- trans = btrfs_start_transaction(root, 20);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- btrfs_set_trans_block_group(trans, new_dir);
+ trans = btrfs_start_transaction(root, 11);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
if (dest != root)
btrfs_record_root_in_trans(trans, dest);
@@ -6866,15 +8370,16 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_fail;
- if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ BTRFS_I(old_inode)->dir_index = 0ULL;
+ if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
/* force full log commit if subvolume involved. */
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
} else {
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
- old_inode->i_ino,
- new_dir->i_ino, index);
+ old_ino,
+ btrfs_ino(new_dir), index);
if (ret)
goto out_fail;
/*
@@ -6890,11 +8395,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* make sure the inode gets flushed if it is replacing
* something.
*/
- if (new_inode && new_inode->i_size &&
- old_inode && S_ISREG(old_inode->i_mode)) {
+ if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
btrfs_add_ordered_operation(trans, root, old_inode);
- }
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
+ inode_inc_iversion(old_inode);
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
@@ -6902,23 +8408,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
- if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
old_dentry->d_name.name,
old_dentry->d_name.len);
} else {
- btrfs_inc_nlink(old_dentry->d_inode);
- ret = btrfs_unlink_inode(trans, root, old_dir,
- old_dentry->d_inode,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
+ ret = __btrfs_unlink_inode(trans, root, old_dir,
+ old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, old_inode);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
}
- BUG_ON(ret);
if (new_inode) {
+ inode_inc_iversion(new_inode);
new_inode->i_ctime = CURRENT_TIME;
- if (unlikely(new_inode->i_ino ==
+ if (unlikely(btrfs_ino(new_inode) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
root_objectid = BTRFS_I(new_inode)->location.objectid;
ret = btrfs_unlink_subvol(trans, dest, new_dir,
@@ -6932,67 +8443,168 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dentry->d_name.name,
new_dentry->d_name.len);
}
- BUG_ON(ret);
- if (new_inode->i_nlink == 0) {
+ if (!ret && new_inode->i_nlink == 0)
ret = btrfs_orphan_add(trans, new_dentry->d_inode);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
}
}
ret = btrfs_add_link(trans, new_dir, old_inode,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
- if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
- struct dentry *parent = dget_parent(new_dentry);
+ if (old_inode->i_nlink == 1)
+ BTRFS_I(old_inode)->dir_index = index;
+
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ struct dentry *parent = new_dentry->d_parent;
btrfs_log_new_name(trans, old_inode, old_dir, parent);
- dput(parent);
btrfs_end_log_trans(root);
}
out_fail:
- btrfs_end_transaction_throttle(trans, root);
-
- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_transaction(trans, root);
+out_notrans:
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
return ret;
}
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+ struct btrfs_delalloc_work *delalloc_work;
+ struct inode *inode;
+
+ delalloc_work = container_of(work, struct btrfs_delalloc_work,
+ work);
+ inode = delalloc_work->inode;
+ if (delalloc_work->wait) {
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ } else {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+
+ if (delalloc_work->delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+ complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+ int wait, int delay_iput)
+{
+ struct btrfs_delalloc_work *work;
+
+ work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+ if (!work)
+ return NULL;
+
+ init_completion(&work->completion);
+ INIT_LIST_HEAD(&work->list);
+ work->inode = inode;
+ work->wait = wait;
+ work->delay_iput = delay_iput;
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+
+ return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+ wait_for_completion(&work->completion);
+ kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
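
btrfs_alloc_delalloc_work() and btrfs_wait_and_free_delalloc_work() above implement a simple lifecycle: allocate a work item carrying a completion, queue it to the flush workers, and have the submitter block on the completion before freeing the item. A rough userspace analogue, using pthreads in place of the btrfs work queue (illustrative only, not kernel code; build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* A work item with its own "completion"; pthread_create stands in
 * for btrfs_queue_work() and the cond/flag pair for the completion. */
struct flush_work {
	int             inode_id;     /* stands in for the inode pointer */
	pthread_t       thread;
	pthread_mutex_t lock;
	pthread_cond_t  done_cond;
	int             done;
};

static void *run_flush_work(void *arg)
{
	struct flush_work *w = arg;

	printf("flushing inode %d\n", w->inode_id);  /* filemap_flush() here */

	pthread_mutex_lock(&w->lock);
	w->done = 1;                                 /* complete(&work->completion) */
	pthread_cond_signal(&w->done_cond);
	pthread_mutex_unlock(&w->lock);
	return NULL;
}

static struct flush_work *alloc_flush_work(int inode_id)
{
	struct flush_work *w = calloc(1, sizeof(*w));

	if (!w)
		return NULL;
	w->inode_id = inode_id;
	pthread_mutex_init(&w->lock, NULL);
	pthread_cond_init(&w->done_cond, NULL);
	return w;
}

static void wait_and_free_flush_work(struct flush_work *w)
{
	pthread_mutex_lock(&w->lock);
	while (!w->done)
		pthread_cond_wait(&w->done_cond, &w->lock);  /* wait_for_completion() */
	pthread_mutex_unlock(&w->lock);
	pthread_join(w->thread, NULL);
	free(w);
}

int main(void)
{
	struct flush_work *w = alloc_flush_work(42);

	if (!w || pthread_create(&w->thread, NULL, run_flush_work, w))
		return EXIT_FAILURE;
	wait_and_free_flush_work(w);                 /* submitter blocks here */
	return EXIT_SUCCESS;
}
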
+
/*
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
+ int nr)
{
- struct list_head *head = &root->fs_info->delalloc_inodes;
struct btrfs_inode *binode;
struct inode *inode;
+ struct btrfs_delalloc_work *work, *next;
+ struct list_head works;
+ struct list_head splice;
+ int ret = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ INIT_LIST_HEAD(&works);
+ INIT_LIST_HEAD(&splice);
- spin_lock(&root->fs_info->delalloc_lock);
- while (!list_empty(head)) {
- binode = list_entry(head->next, struct btrfs_inode,
+ mutex_lock(&root->delalloc_mutex);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
+ while (!list_empty(&splice)) {
+ binode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
+
+ list_move_tail(&binode->delalloc_inodes,
+ &root->delalloc_inodes);
inode = igrab(&binode->vfs_inode);
- if (!inode)
- list_del_init(&binode->delalloc_inodes);
- spin_unlock(&root->fs_info->delalloc_lock);
- if (inode) {
- filemap_flush(inode->i_mapping);
+ if (!inode) {
+ cond_resched_lock(&root->delalloc_lock);
+ continue;
+ }
+ spin_unlock(&root->delalloc_lock);
+
+ work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ if (unlikely(!work)) {
if (delay_iput)
btrfs_add_delayed_iput(inode);
else
iput(inode);
+ ret = -ENOMEM;
+ goto out;
}
+ list_add_tail(&work->list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ ret++;
+ if (nr != -1 && ret >= nr)
+ goto out;
cond_resched();
- spin_lock(&root->fs_info->delalloc_lock);
+ spin_lock(&root->delalloc_lock);
}
- spin_unlock(&root->fs_info->delalloc_lock);
+ spin_unlock(&root->delalloc_lock);
- /* the filemap_flush will queue IO into the worker threads, but
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&root->delalloc_lock);
+ list_splice_tail(&splice, &root->delalloc_inodes);
+ spin_unlock(&root->delalloc_lock);
+ }
+ mutex_unlock(&root->delalloc_mutex);
+ return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+ int ret;
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ return -EROFS;
+
+ ret = __start_delalloc_inodes(root, delay_iput, -1);
+ if (ret > 0)
+ ret = 0;
+ /*
+ * the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
*/
@@ -7004,59 +8616,63 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&root->fs_info->async_submit_draining);
- return 0;
+ return ret;
}
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
- int sync)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr)
{
- struct btrfs_inode *binode;
- struct inode *inode = NULL;
+ struct btrfs_root *root;
+ struct list_head splice;
+ int ret;
- spin_lock(&root->fs_info->delalloc_lock);
- while (!list_empty(&root->fs_info->delalloc_inodes)) {
- binode = list_entry(root->fs_info->delalloc_inodes.next,
- struct btrfs_inode, delalloc_inodes);
- inode = igrab(&binode->vfs_inode);
- if (inode) {
- list_move_tail(&binode->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
- break;
- }
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ return -EROFS;
- list_del_init(&binode->delalloc_inodes);
- cond_resched_lock(&root->fs_info->delalloc_lock);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&fs_info->delalloc_root_mutex);
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice) && nr) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->delalloc_root,
+ &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ ret = __start_delalloc_inodes(root, delay_iput, nr);
+ btrfs_put_fs_root(root);
+ if (ret < 0)
+ goto out;
- if (inode) {
- if (sync) {
- filemap_write_and_wait(inode->i_mapping);
- /*
- * We have to do this because compression doesn't
- * actually set PG_writeback until it submits the pages
- * for IO, which happens in an async thread, so we could
- * race and not actually wait for any writeback pages
- * because they've not been submitted yet. Technically
- * this could still be the case for the ordered stuff
- * since the async thread may not have started to do its
- * work yet. If this becomes the case then we need to
- * figure out a way to make sure that in writepage we
- * wait for any async pages to be submitted before
- * returning so that fdatawait does what its supposed to
- * do.
- */
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- } else {
- filemap_flush(inode->i_mapping);
+ if (nr != -1) {
+ nr -= ret;
+ WARN_ON(nr < 0);
}
- if (delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
- return 1;
+ spin_lock(&fs_info->delalloc_root_lock);
}
- return 0;
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ ret = 0;
+ atomic_inc(&fs_info->async_submit_draining);
+ while (atomic_read(&fs_info->nr_async_submits) ||
+ atomic_read(&fs_info->async_delalloc_pages)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0 &&
+ atomic_read(&fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&fs_info->async_submit_draining);
+out:
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_tail(&splice, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+ }
+ mutex_unlock(&fs_info->delalloc_root_mutex);
+ return ret;
}
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -7070,21 +8686,17 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
int err;
int drop_inode = 0;
u64 objectid;
- u64 index = 0 ;
+ u64 index = 0;
int name_len;
int datasize;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
struct extent_buffer *leaf;
- unsigned long nr = 0;
- name_len = strlen(symname) + 1;
+ name_len = strlen(symname);
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
- err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
- if (err)
- return err;
/*
* 2 items for inode item and ref
* 2 items for dir items
@@ -7094,15 +8706,17 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_unlock;
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, dir->i_ino, objectid,
- BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
- &index);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
+ dentry->d_name.len, btrfs_ino(dir), objectid,
+ S_IFLNK|S_IRWXUGO, &index);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
goto out_unlock;
+ }
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err) {
@@ -7110,25 +8724,33 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- btrfs_set_trans_block_group(trans, inode);
+ /*
+ * If the active LSM wants to access the inode during
+ * d_instantiate it needs these. Smack checks to see
+ * if the filesystem supports xattrs by looking at the
+ * ops vector.
+ */
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
else {
inode->i_mapping->a_ops = &btrfs_aops;
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
- btrfs_update_inode_block_group(trans, inode);
- btrfs_update_inode_block_group(trans, dir);
if (drop_inode)
goto out_unlock;
path = btrfs_alloc_path();
- BUG_ON(!path);
- key.objectid = inode->i_ino;
+ if (!path) {
+ err = -ENOMEM;
+ drop_inode = 1;
+ goto out_unlock;
+ }
+ key.objectid = btrfs_ino(inode);
key.offset = 0;
btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
datasize = btrfs_file_extent_calc_inline_size(name_len);
@@ -7136,6 +8758,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
datasize);
if (err) {
drop_inode = 1;
+ btrfs_free_path(path);
goto out_unlock;
}
leaf = path->nodes[0];
@@ -7158,19 +8781,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_mapping->a_ops = &btrfs_symlink_aops;
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
inode_set_bytes(inode, name_len);
- btrfs_i_size_write(inode, name_len - 1);
+ btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
if (err)
drop_inode = 1;
out_unlock:
- nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ if (!err)
+ d_instantiate(dentry, inode);
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
return err;
}
@@ -7179,10 +8803,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
loff_t actual_len, u64 *alloc_hint,
struct btrfs_trans_handle *trans)
{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
u64 i_size;
+ u64 cur_bytes;
int ret = 0;
bool own_trans = true;
@@ -7197,8 +8824,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
}
- ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
- 0, *alloc_hint, (u64)-1, &ins, 1);
+ cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+ cur_bytes = max(cur_bytes, min_size);
+ ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
+ *alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -7210,14 +8839,52 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
ins.offset, ins.offset,
ins.offset, 0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_free_reserved_extent(root, ins.objectid,
+ ins.offset, 0);
+ btrfs_abort_transaction(trans, root, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ break;
+ }
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + ins.offset -1, 0);
+ em = alloc_extent_map();
+ if (!em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ goto next;
+ }
+
+ em->start = cur_offset;
+ em->orig_start = cur_offset;
+ em->len = ins.offset;
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
+ em->ram_bytes = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->generation = trans->transid;
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + ins.offset - 1,
+ 0);
+ }
+ free_extent_map(em);
+next:
num_bytes -= ins.offset;
cur_offset += ins.offset;
*alloc_hint = ins.objectid + ins.offset;
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -7232,7 +8899,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ break;
+ }
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -7263,15 +8936,79 @@ static int btrfs_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
-static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
+static int btrfs_permission(struct inode *inode, int mask)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ umode_t mode = inode->i_mode;
- if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
- return -EROFS;
- if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
- return -EACCES;
- return generic_permission(inode, mask, flags, btrfs_check_acl);
+ if (mask & MAY_WRITE &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
+ return -EACCES;
+ }
+ return generic_permission(inode, mask);
+}
+
+static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ u64 objectid;
+ u64 index;
+ int ret = 0;
+
+ /*
+ * 5 units required for adding orphan entry
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_find_free_ino(root, &objectid);
+ if (ret)
+ goto out;
+
+ inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ btrfs_ino(dir), objectid, mode, &index);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ goto out;
+ }
+
+ ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ if (ret)
+ goto out;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto out;
+
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret)
+ goto out;
+
+ d_tmpfile(dentry, inode);
+ mark_inode_dirty(inode);
+
+out:
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ iput(inode);
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
+
+ return ret;
}
static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7291,16 +9028,23 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
+ .tmpfile = btrfs_tmpfile,
};
static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
};
static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = btrfs_real_readdir,
+ .iterate = btrfs_real_readdir,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
@@ -7316,7 +9060,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
.writepage_end_io_hook = btrfs_writepage_end_io_hook,
.writepage_start_hook = btrfs_writepage_start_hook,
- .readpage_io_failed_hook = btrfs_io_failed_hook,
.set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,
.merge_extent_hook = btrfs_merge_extent_hook,
@@ -7340,7 +9083,6 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readpages = btrfs_readpages,
- .sync_page = block_sync_page,
.direct_IO = btrfs_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
@@ -7356,7 +9098,6 @@ static const struct address_space_operations btrfs_symlink_aops = {
};
static const struct inode_operations btrfs_file_inode_operations = {
- .truncate = btrfs_truncate,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.setxattr = btrfs_setxattr,
@@ -7365,6 +9106,9 @@ static const struct inode_operations btrfs_file_inode_operations = {
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
@@ -7374,19 +9118,25 @@ static const struct inode_operations btrfs_special_inode_operations = {
.getxattr = btrfs_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
.getxattr = btrfs_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
+ .update_time = btrfs_update_time,
};
const struct dentry_operations btrfs_dentry_operations = {
.d_delete = btrfs_dentry_delete,
+ .d_release = btrfs_dentry_release,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d1bace3df9b..47aceb494d1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -40,15 +40,54 @@
#include <linux/xattr.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
-#include "compat.h"
+#include <linux/blkdev.h>
+#include <linux/uuid.h>
+#include <linux/btrfs.h>
+#include <linux/uaccess.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
+#include "inode-map.h"
+#include "backref.h"
+#include "rcu-string.h"
+#include "send.h"
+#include "dev-replace.h"
+#include "props.h"
+#include "sysfs.h"
+#include "qgroup.h"
+
+#ifdef CONFIG_64BIT
+/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
+ * structures are incorrect, as the timespec structure from userspace
+ * is 4 bytes too small. We define these alternatives here to teach
+ * the kernel about the 32-bit struct packing.
+ */
+struct btrfs_ioctl_timespec_32 {
+ __u64 sec;
+ __u32 nsec;
+} __attribute__ ((__packed__));
+
+struct btrfs_ioctl_received_subvol_args_32 {
+ char uuid[BTRFS_UUID_SIZE]; /* in */
+ __u64 stransid; /* in */
+ __u64 rtransid; /* out */
+ struct btrfs_ioctl_timespec_32 stime; /* in */
+ struct btrfs_ioctl_timespec_32 rtime; /* out */
+ __u64 flags; /* in */
+ __u64 reserved[16]; /* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+ struct btrfs_ioctl_received_subvol_args_32)
+#endif
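
The comment above points out that a 32-bit userspace lays out the 64-bit/32-bit timespec pair without the tail padding a 64-bit build inserts, which is why the packed _32 variants of the structures are declared. A small standalone program showing the size difference; the args_* structures below are simplified stand-ins for illustration, not the real btrfs ioctl layouts:

#include <stdio.h>
#include <stdint.h>

/* With natural alignment a { u64 sec; u32 nsec; } member is padded out
 * to 16 bytes on a 64-bit build, while the packed 32-bit layout takes
 * only 12, so every field after it shifts by 4 bytes per timespec. */
struct ts_natural {
	uint64_t sec;
	uint32_t nsec;
};                                    /* sizeof == 16 on x86-64 */

struct ts_packed32 {
	uint64_t sec;
	uint32_t nsec;
} __attribute__((packed));            /* sizeof == 12 */

struct args_natural {
	char uuid[16];
	struct ts_natural stime, rtime;
	uint64_t flags;
};

struct args_packed32 {
	char uuid[16];
	struct ts_packed32 stime, rtime;
	uint64_t flags;
} __attribute__((packed));

int main(void)
{
	printf("natural timespec: %zu bytes, packed: %zu bytes\n",
	       sizeof(struct ts_natural), sizeof(struct ts_packed32));
	printf("natural args:     %zu bytes, packed: %zu bytes\n",
	       sizeof(struct args_natural), sizeof(struct args_packed32));
	return 0;
}
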
+
+
+static int btrfs_clone(struct inode *src, struct inode *inode,
+ u64 off, u64 olen, u64 olen_aligned, u64 destoff);
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -80,6 +119,13 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
iflags |= FS_NOATIME_FL;
if (flags & BTRFS_INODE_DIRSYNC)
iflags |= FS_DIRSYNC_FL;
+ if (flags & BTRFS_INODE_NODATACOW)
+ iflags |= FS_NOCOW_FL;
+
+ if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
+ iflags |= FS_COMPR_FL;
+ else if (flags & BTRFS_INODE_NOCOMPRESS)
+ iflags |= FS_NOCOMP_FL;
return iflags;
}
@@ -90,25 +136,28 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
void btrfs_update_iflags(struct inode *inode)
{
struct btrfs_inode *ip = BTRFS_I(inode);
-
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ unsigned int new_fl = 0;
if (ip->flags & BTRFS_INODE_SYNC)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (ip->flags & BTRFS_INODE_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (ip->flags & BTRFS_INODE_APPEND)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (ip->flags & BTRFS_INODE_NOATIME)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (ip->flags & BTRFS_INODE_DIRSYNC)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+
+ set_mask_bits(&inode->i_flags,
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
+ new_fl);
}
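
The rewrite of btrfs_update_iflags() computes new_fl up front and applies it with set_mask_bits(), so the managed bits in i_flags are replaced in a single update rather than being cleared and then re-set. A rough userspace sketch of that read-modify-write idea, using C11 atomics instead of the kernel helper (the semantics shown here are an assumption for illustration):

#include <stdatomic.h>
#include <stdio.h>

#define S_SYNC      (1u << 0)
#define S_APPEND    (1u << 1)
#define S_IMMUTABLE (1u << 2)

/* Replace the bits covered by @mask with @bits in one atomic update,
 * so a concurrent reader never sees the managed bits transiently
 * cleared. Returns the previous value. */
static unsigned int set_mask_bits(_Atomic unsigned int *flags,
				  unsigned int mask, unsigned int bits)
{
	unsigned int old = atomic_load(flags);
	unsigned int new;

	do {
		new = (old & ~mask) | bits;
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return old;
}

int main(void)
{
	_Atomic unsigned int i_flags = S_SYNC | S_IMMUTABLE | (1u << 7);
	unsigned int new_fl = S_APPEND;              /* computed up front */

	set_mask_bits(&i_flags, S_SYNC | S_APPEND | S_IMMUTABLE, new_fl);

	/* 0x82: S_APPEND now set, S_SYNC/S_IMMUTABLE cleared,
	 * unrelated bit 7 preserved */
	printf("i_flags = 0x%x\n", atomic_load(&i_flags));
	return 0;
}
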
/*
* Inherit flags from the parent inode.
*
- * Unlike extN we don't have any flags we don't want to inherit currently.
+ * Currently only the compression flags and the cow flags are inherited.
*/
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
@@ -119,18 +168,26 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
flags = BTRFS_I(dir)->flags;
- if (S_ISREG(inode->i_mode))
- flags &= ~BTRFS_INODE_DIRSYNC;
- else if (!S_ISDIR(inode->i_mode))
- flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
+ if (flags & BTRFS_INODE_NOCOMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ } else if (flags & BTRFS_INODE_COMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ }
+
+ if (flags & BTRFS_INODE_NODATACOW) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ }
- BTRFS_I(inode)->flags = flags;
btrfs_update_iflags(inode);
}
static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
{
- struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
+ struct btrfs_inode *ip = BTRFS_I(file_inode(file));
unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
if (copy_to_user(arg, &flags, sizeof(flags)))
@@ -138,14 +195,35 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
return 0;
}
+static int check_flags(unsigned int flags)
+{
+ if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+ FS_NOATIME_FL | FS_NODUMP_FL | \
+ FS_SYNC_FL | FS_DIRSYNC_FL | \
+ FS_NOCOMP_FL | FS_COMPR_FL |
+ FS_NOCOW_FL))
+ return -EOPNOTSUPP;
+
+ if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
+ return -EINVAL;
+
+ return 0;
+}
+
static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_inode *ip = BTRFS_I(inode);
struct btrfs_root *root = ip->root;
struct btrfs_trans_handle *trans;
unsigned int flags, oldflags;
int ret;
+ u64 ip_oldflags;
+ unsigned int i_oldflags;
+ umode_t mode;
+
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -153,16 +231,20 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (copy_from_user(&flags, arg, sizeof(flags)))
return -EFAULT;
- if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
- FS_NOATIME_FL | FS_NODUMP_FL | \
- FS_SYNC_FL | FS_DIRSYNC_FL))
- return -EOPNOTSUPP;
+ ret = check_flags(flags);
+ if (ret)
+ return ret;
- if (!inode_owner_or_capable(inode))
- return -EACCES;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
mutex_lock(&inode->i_mutex);
+ ip_oldflags = ip->flags;
+ i_oldflags = inode->i_flags;
+ mode = inode->i_mode;
+
flags = btrfs_mask_flags(inode->i_mode, flags);
oldflags = btrfs_flags_to_ioctl(ip->flags);
if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -172,10 +254,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
}
- ret = mnt_want_write(file->f_path.mnt);
- if (ret)
- goto out_unlock;
-
if (flags & FS_SYNC_FL)
ip->flags |= BTRFS_INODE_SYNC;
else
@@ -200,70 +278,204 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags |= BTRFS_INODE_DIRSYNC;
else
ip->flags &= ~BTRFS_INODE_DIRSYNC;
+ if (flags & FS_NOCOW_FL) {
+ if (S_ISREG(mode)) {
+ /*
+ * It's safe to turn csums off here, no extents exist.
+ * Otherwise we want the flag to reflect the real COW
+ * status of the file and will not set it.
+ */
+ if (inode->i_size == 0)
+ ip->flags |= BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM;
+ } else {
+ ip->flags |= BTRFS_INODE_NODATACOW;
+ }
+ } else {
+ /*
+		 * Revert back under the same assumptions as above
+ */
+ if (S_ISREG(mode)) {
+ if (inode->i_size == 0)
+ ip->flags &= ~(BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM);
+ } else {
+ ip->flags &= ~BTRFS_INODE_NODATACOW;
+ }
+ }
+ /*
+ * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
+	 * flag may be changed automatically if the compression code won't make
+ * things smaller.
+ */
+ if (flags & FS_NOCOMP_FL) {
+ ip->flags &= ~BTRFS_INODE_COMPRESS;
+ ip->flags |= BTRFS_INODE_NOCOMPRESS;
+
+ ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+ if (ret && ret != -ENODATA)
+ goto out_drop;
+ } else if (flags & FS_COMPR_FL) {
+ const char *comp;
+
+ ip->flags |= BTRFS_INODE_COMPRESS;
+ ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+
+ if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ comp = "lzo";
+ else
+ comp = "zlib";
+ ret = btrfs_set_prop(inode, "btrfs.compression",
+ comp, strlen(comp), 0);
+ if (ret)
+ goto out_drop;
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
+ } else {
+ ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ }
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_drop;
+ }
btrfs_update_iflags(inode);
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, inode);
+
btrfs_end_transaction(trans, root);
+ out_drop:
+ if (ret) {
+ ip->flags = ip_oldflags;
+ inode->i_flags = i_oldflags;
+ }
- mnt_drop_write(file->f_path.mnt);
out_unlock:
mutex_unlock(&inode->i_mutex);
- return 0;
+ mnt_drop_write_file(file);
+ return ret;
}
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
return put_user(inode->i_generation, arg);
}
-static noinline int create_subvol(struct btrfs_root *root,
+static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
+ struct btrfs_device *device;
+ struct request_queue *q;
+ struct fstrim_range range;
+ u64 minlen = ULLONG_MAX;
+ u64 num_devices = 0;
+ u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
+ dev_list) {
+ if (!device->bdev)
+ continue;
+ q = bdev_get_queue(device->bdev);
+ if (blk_queue_discard(q)) {
+ num_devices++;
+ minlen = min((u64)q->limits.discard_granularity,
+ minlen);
+ }
+ }
+ rcu_read_unlock();
+
+ if (!num_devices)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&range, arg, sizeof(range)))
+ return -EFAULT;
+ if (range.start > total_bytes ||
+ range.len < fs_info->sb->s_blocksize)
+ return -EINVAL;
+
+ range.len = min(range.len, total_bytes - range.start);
+ range.minlen = max(range.minlen, minlen);
+ ret = btrfs_trim_fs(fs_info->tree_root, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(arg, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int btrfs_is_empty_uuid(u8 *uuid)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_UUID_SIZE; i++) {
+ if (uuid[i])
+ return 0;
+ }
+ return 1;
+}
+
+static noinline int create_subvol(struct inode *dir,
struct dentry *dentry,
char *name, int namelen,
- u64 *async_transid)
+ u64 *async_transid,
+ struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_root_item root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *new_root;
- struct dentry *parent = dget_parent(dentry);
- struct inode *dir;
+ struct btrfs_block_rsv block_rsv;
+ struct timespec cur_time = CURRENT_TIME;
+ struct inode *inode;
int ret;
int err;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
+ u64 qgroup_reserved;
+ uuid_le new_uuid;
- ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
- 0, &objectid);
- if (ret) {
- dput(parent);
+ ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
+ if (ret)
return ret;
- }
-
- dir = parent->d_inode;
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
- * 1 - inode item
- * 2 - refs
- * 1 - root item
- * 2 - dir items
+ * The same as the snapshot creation, please see the comment
+ * of create_snapshot().
*/
- trans = btrfs_start_transaction(root, 6);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ 8, &qgroup_reserved, false);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- dput(parent);
- return PTR_ERR(trans);
+ ret = PTR_ERR(trans);
+ btrfs_subvolume_release_metadata(root, &block_rsv,
+ qgroup_reserved);
+ return ret;
}
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
+
+ ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
+ if (ret)
+ goto fail;
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
0, objectid, NULL, 0, 0, 0);
@@ -278,21 +490,25 @@ static noinline int create_subvol(struct btrfs_root *root,
btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(leaf, objectid);
- write_extent_buffer(leaf, root->fs_info->fsid,
- (unsigned long)btrfs_header_fsid(leaf),
+ write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
- (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+ btrfs_header_chunk_tree_uuid(leaf),
BTRFS_UUID_SIZE);
btrfs_mark_buffer_dirty(leaf);
+ memset(&root_item, 0, sizeof(root_item));
+
inode_item = &root_item.inode;
- memset(inode_item, 0, sizeof(*inode_item));
- inode_item->generation = cpu_to_le64(1);
- inode_item->size = cpu_to_le64(3);
- inode_item->nlink = cpu_to_le32(1);
- inode_item->nbytes = cpu_to_le64(root->leafsize);
- inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+ btrfs_set_stack_inode_generation(inode_item, 1);
+ btrfs_set_stack_inode_size(inode_item, 3);
+ btrfs_set_stack_inode_nlink(inode_item, 1);
+ btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
+
+ btrfs_set_root_flags(&root_item, 0);
+ btrfs_set_root_limit(&root_item, 0);
+ btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
btrfs_set_root_bytenr(&root_item, leaf->start);
btrfs_set_root_generation(&root_item, trans->transid);
@@ -301,8 +517,15 @@ static noinline int create_subvol(struct btrfs_root *root,
btrfs_set_root_used(&root_item, leaf->len);
btrfs_set_root_last_snapshot(&root_item, 0);
- memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
- root_item.drop_level = 0;
+ btrfs_set_root_generation_v2(&root_item,
+ btrfs_root_generation(&root_item));
+ uuid_le_gen(&new_uuid);
+ memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
+ btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
+ root_item.ctime = root_item.otime;
+ btrfs_set_root_ctransid(&root_item, trans->transid);
+ btrfs_set_root_otransid(&root_item, trans->transid);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
@@ -320,23 +543,37 @@ static noinline int create_subvol(struct btrfs_root *root,
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
- BUG_ON(IS_ERR(new_root));
+ if (IS_ERR(new_root)) {
+ btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
+ ret = PTR_ERR(new_root);
+ goto fail;
+ }
btrfs_record_root_in_trans(trans, new_root);
- ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
- BTRFS_I(dir)->block_group);
+ ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
+ if (ret) {
+ /* We potentially lose an unused inode item here */
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
/*
* insert the directory item
*/
ret = btrfs_set_inode_index(dir, &index);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
ret = btrfs_insert_dir_item(trans, root,
- name, namelen, dir->i_ino, &key,
+ name, namelen, dir, &key,
BTRFS_FT_DIR, index);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
goto fail;
+ }
btrfs_i_size_write(dir, dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
@@ -344,85 +581,185 @@ static noinline int create_subvol(struct btrfs_root *root,
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
objectid, root->root_key.objectid,
- dir->i_ino, index, name, namelen);
-
+ btrfs_ino(dir), index, name, namelen);
BUG_ON(ret);
- d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+ ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+ root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+ objectid);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+
fail:
- dput(parent);
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
+ btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
+
if (async_transid) {
*async_transid = trans->transid;
err = btrfs_commit_transaction_async(trans, root, 1);
+ if (err)
+ err = btrfs_commit_transaction(trans, root);
} else {
err = btrfs_commit_transaction(trans, root);
}
if (err && !ret)
ret = err;
+
+ if (!ret) {
+ inode = btrfs_lookup_dentry(dir, dentry);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ d_instantiate(dentry, inode);
+ }
return ret;
}
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
- char *name, int namelen, u64 *async_transid,
- bool readonly)
+static void btrfs_wait_nocow_write(struct btrfs_root *root)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(&root->subv_writers->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ writers = percpu_counter_sum(&root->subv_writers->counter);
+ if (writers)
+ schedule();
+
+ finish_wait(&root->subv_writers->wait, &wait);
+ } while (writers);
+}
+
+static int create_snapshot(struct btrfs_root *root, struct inode *dir,
+ struct dentry *dentry, char *name, int namelen,
+ u64 *async_transid, bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
{
struct inode *inode;
- struct dentry *parent;
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
int ret;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ atomic_inc(&root->will_be_snapshoted);
+ smp_mb__after_atomic();
+ btrfs_wait_nocow_write(root);
+
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret)
+ goto out;
+
+ btrfs_wait_ordered_extents(root, -1);
+
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot)
- return -ENOMEM;
+ if (!pending_snapshot) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+ BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * 1 - parent dir inode
+ * 2 - dir entries
+ * 1 - root item
+ * 2 - root ref/backref
+ * 1 - root of snapshot
+ * 1 - UUID item
+ */
+ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
+ &pending_snapshot->block_rsv, 8,
+ &pending_snapshot->qgroup_reserved,
+ false);
+ if (ret)
+ goto free;
- btrfs_init_block_rsv(&pending_snapshot->block_rsv);
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
+ pending_snapshot->dir = dir;
+ pending_snapshot->inherit = inherit;
- trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+ trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto fail;
}
- ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
- BUG_ON(ret);
-
+ spin_lock(&root->fs_info->trans_lock);
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
+ spin_unlock(&root->fs_info->trans_lock);
if (async_transid) {
*async_transid = trans->transid;
ret = btrfs_commit_transaction_async(trans,
root->fs_info->extent_root, 1);
+ if (ret)
+ ret = btrfs_commit_transaction(trans, root);
} else {
ret = btrfs_commit_transaction(trans,
root->fs_info->extent_root);
}
- BUG_ON(ret);
+ if (ret)
+ goto fail;
ret = pending_snapshot->error;
if (ret)
goto fail;
- btrfs_orphan_cleanup(pending_snapshot->snap);
+ ret = btrfs_orphan_cleanup(pending_snapshot->snap);
+ if (ret)
+ goto fail;
+
+ /*
+ * If orphan cleanup did remove any orphans, it means the tree was
+ * modified and therefore the commit root is not the same as the
+ * current root anymore. This is a problem, because send uses the
+ * commit root and therefore can see inode items that don't exist
+ * in the current root anymore, and for example make calls to
+ * btrfs_iget, which will do tree lookups based on the current root
+ * and not on the commit root. Those lookups will fail, returning a
+ * -ESTALE error, and making send fail with that error. So make sure
+ * a send does not see any orphans we have just removed, and that it
+ * will see the same inodes regardless of whether a transaction
+ * commit happened before it started (meaning that the commit root
+ * will be the same as the current root) or not.
+ */
+ if (readonly && pending_snapshot->snap->node !=
+ pending_snapshot->snap->commit_root) {
+ trans = btrfs_join_transaction(pending_snapshot->snap);
+ if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
+ ret = PTR_ERR(trans);
+ goto fail;
+ }
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans,
+ pending_snapshot->snap);
+ if (ret)
+ goto fail;
+ }
+ }
- parent = dget_parent(dentry);
- inode = btrfs_lookup_dentry(parent->d_inode, dentry);
- dput(parent);
+ inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto fail;
}
- BUG_ON(!inode);
+
d_instantiate(dentry, inode);
ret = 0;
fail:
+ btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
+ &pending_snapshot->block_rsv,
+ pending_snapshot->qgroup_reserved);
+free:
kfree(pending_snapshot);
+out:
+ atomic_dec(&root->will_be_snapshoted);
return ret;
}
@@ -432,13 +769,13 @@ fail:
*/
static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
{
- uid_t fsuid = current_fsuid();
+ kuid_t fsuid = current_fsuid();
if (!(dir->i_mode & S_ISVTX))
return 0;
- if (inode->i_uid == fsuid)
+ if (uid_eq(inode->i_uid, fsuid))
return 0;
- if (dir->i_uid == fsuid)
+ if (uid_eq(dir->i_uid, fsuid))
return 0;
return !capable(CAP_FOWNER);
}
@@ -463,7 +800,7 @@ static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
* nfs_async_unlink().
*/
-static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
{
int error;
@@ -471,7 +808,7 @@ static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
return -ENOENT;
BUG_ON(victim->d_parent->d_inode != dir);
- audit_inode_child(victim, dir);
+ audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
if (error)
@@ -514,13 +851,16 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
static noinline int btrfs_mksubvol(struct path *parent,
char *name, int namelen,
struct btrfs_root *snap_src,
- u64 *async_transid, bool readonly)
+ u64 *async_transid, bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
{
struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
int error;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ if (error == -EINTR)
+ return error;
dentry = lookup_one_len(name, parent->dentry, namelen);
error = PTR_ERR(dentry);
@@ -531,13 +871,19 @@ static noinline int btrfs_mksubvol(struct path *parent,
if (dentry->d_inode)
goto out_dput;
- error = mnt_want_write(parent->mnt);
+ error = btrfs_may_create(dir, dentry);
if (error)
goto out_dput;
- error = btrfs_may_create(dir, dentry);
+ /*
+ * even if this name doesn't exist, we may get hash collisions.
+ * check for them now when we can safely fail
+ */
+ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+ dir->i_ino, name,
+ namelen);
if (error)
- goto out_drop_write;
+ goto out_dput;
down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
@@ -545,18 +891,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_up_read;
if (snap_src) {
- error = create_snapshot(snap_src, dentry,
- name, namelen, async_transid, readonly);
+ error = create_snapshot(snap_src, dir, dentry, name, namelen,
+ async_transid, readonly, inherit);
} else {
- error = create_subvol(BTRFS_I(dir)->root, dentry,
- name, namelen, async_transid);
+ error = create_subvol(dir, dentry, name, namelen,
+ async_transid, inherit);
}
if (!error)
fsnotify_mkdir(dir, dentry);
out_up_read:
up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
-out_drop_write:
- mnt_drop_write(parent->mnt);
out_dput:
dput(dentry);
out_unlock:
@@ -564,27 +908,115 @@ out_unlock:
return error;
}
-static int should_defrag_range(struct inode *inode, u64 start, u64 len,
- int thresh, u64 *last_len, u64 *skip,
- u64 *defrag_end)
+/*
+ * When we're defragging a range, we don't want to kick it off again
+ * if it is really just waiting for delalloc to send it down.
+ * If we find a nice big extent or delalloc range for the bytes in the
+ * file you want to defrag, we return 0 to let you know to skip this
+ * part of the file.
+ */
+static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- int ret = 1;
+ u64 end;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+ read_unlock(&em_tree->lock);
- if (thresh == 0)
- thresh = 256 * 1024;
+ if (em) {
+ end = extent_map_end(em);
+ free_extent_map(em);
+ if (end - offset > thresh)
+ return 0;
+ }
+ /* if we already have a nice delalloc here, just stop */
+ thresh /= 2;
+ end = count_range_bits(io_tree, &offset, offset + thresh,
+ thresh, EXTENT_DELALLOC, 1);
+ if (end >= thresh)
+ return 0;
+ return 1;
+}
- /*
- * make sure that once we start defragging and extent, we keep on
- * defragging it
- */
- if (start < *defrag_end)
- return 1;
+/*
+ * helper function to walk through a file and find extents
+ * newer than a specific transid, and smaller than thresh.
+ *
+ * This is used by the defragging code to find new and small
+ * extents
+ */
+static int find_new_extents(struct btrfs_root *root,
+ struct inode *inode, u64 newer_than,
+ u64 *off, int thresh)
+{
+ struct btrfs_path *path;
+ struct btrfs_key min_key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *extent;
+ int type;
+ int ret;
+ u64 ino = btrfs_ino(inode);
- *skip = 0;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_EXTENT_DATA_KEY;
+ min_key.offset = *off;
+
+ while (1) {
+ path->keep_locks = 1;
+ ret = btrfs_search_forward(root, &min_key, path, newer_than);
+ if (ret != 0)
+ goto none;
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+process_slot:
+ if (min_key.objectid != ino)
+ goto none;
+ if (min_key.type != BTRFS_EXTENT_DATA_KEY)
+ goto none;
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ type = btrfs_file_extent_type(leaf, extent);
+ if (type == BTRFS_FILE_EXTENT_REG &&
+ btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
+ check_defrag_in_cache(inode, min_key.offset, thresh)) {
+ *off = min_key.offset;
+ btrfs_free_path(path);
+ return 0;
+ }
+
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(leaf)) {
+ btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
+ goto process_slot;
+ }
+
+ if (min_key.offset == (u64)-1)
+ goto none;
+
+ min_key.offset++;
+ btrfs_release_path(path);
+ }
+none:
+ btrfs_free_path(path);
+ return -ENOENT;
+}
+
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ u64 len = PAGE_CACHE_SIZE;
/*
* hopefully we have this extent in the tree already, try without
@@ -595,25 +1027,76 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
read_unlock(&em_tree->lock);
if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + len - 1;
+
/* get the big lock and read metadata off disk */
- lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+ lock_extent_bits(io_tree, start, end, 0, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+ unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
if (IS_ERR(em))
- return 0;
+ return NULL;
}
+ return em;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+{
+ struct extent_map *next;
+ bool ret = true;
+
+ /* this is the last extent */
+ if (em->start + em->len >= i_size_read(inode))
+ return false;
+
+ next = defrag_lookup_extent(inode, em->start + em->len);
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
+ (em->block_start + em->block_len == next->block_start))
+ ret = false;
+
+ free_extent_map(next);
+ return ret;
+}
+
+static int should_defrag_range(struct inode *inode, u64 start, int thresh,
+ u64 *last_len, u64 *skip, u64 *defrag_end,
+ int compress)
+{
+ struct extent_map *em;
+ int ret = 1;
+ bool next_mergeable = true;
+
+ /*
+ * make sure that once we start defragging an extent, we keep on
+ * defragging it
+ */
+ if (start < *defrag_end)
+ return 1;
+
+ *skip = 0;
+
+ em = defrag_lookup_extent(inode, start);
+ if (!em)
+ return 0;
+
/* this will cover holes, and inline extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE)
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
ret = 0;
+ goto out;
+ }
+
+ next_mergeable = defrag_check_next_extent(inode, em);
/*
- * we hit a real extent, if it is big don't bother defragging it again
+ * we hit a real extent, if it is big or the next extent is not a
+ * real extent, don't bother defragging it
*/
- if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
+ if (!compress && (*last_len == 0 || *last_len >= thresh) &&
+ (em->len >= thresh || !next_mergeable))
ret = 0;
-
+out:
/*
* last_len ends up being a counter of how many bytes we've defragged.
* every time we choose not to defrag an extent, we reset *last_len
@@ -623,7 +1106,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
* extent will force at least part of that big extent to be defragged.
*/
if (ret) {
- *last_len += len;
*defrag_end = extent_map_end(em);
} else {
*last_len = 0;
@@ -635,27 +1117,191 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
return ret;
}
-static int btrfs_defrag_file(struct file *file,
- struct btrfs_ioctl_defrag_range_args *range)
+/*
+ * it doesn't do much good to defrag one or two pages
+ * at a time. This pulls in a nice chunk of pages
+ * to COW and defrag.
+ *
+ * It also makes sure the delalloc code has enough
+ * dirty data to avoid making new small extents as part
+ * of the defrag
+ *
+ * It's a good idea to start RA on this range
+ * before calling this.
+ */
+static int cluster_pages_for_defrag(struct inode *inode,
+ struct page **pages,
+ unsigned long start_index,
+ unsigned long num_pages)
{
- struct inode *inode = fdentry(file)->d_inode;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- struct page *page;
- struct btrfs_super_block *disk_super;
- unsigned long last_index;
- unsigned long ra_pages = root->fs_info->bdi.ra_pages;
- unsigned long total_read = 0;
- u64 features;
+ unsigned long file_end;
+ u64 isize = i_size_read(inode);
u64 page_start;
u64 page_end;
+ u64 page_cnt;
+ int ret;
+ int i;
+ int i_done;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct extent_io_tree *tree;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+
+ file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (!isize || start_index > file_end)
+ return 0;
+
+ page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+
+ ret = btrfs_delalloc_reserve_space(inode,
+ page_cnt << PAGE_CACHE_SHIFT);
+ if (ret)
+ return ret;
+ i_done = 0;
+ tree = &BTRFS_I(inode)->io_tree;
+
+ /* step one, lock all the pages */
+ for (i = 0; i < page_cnt; i++) {
+ struct page *page;
+again:
+ page = find_or_create_page(inode->i_mapping,
+ start_index + i, mask);
+ if (!page)
+ break;
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+ while (1) {
+ lock_extent_bits(tree, page_start, page_end,
+ 0, &cached_state);
+ ordered = btrfs_lookup_ordered_extent(inode,
+ page_start);
+ unlock_extent_cached(tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
+ if (!ordered)
+ break;
+
+ unlock_page(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * we unlocked the page above, so we need to check if
+ * it was released or not.
+ */
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ }
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ break;
+ }
+ }
+
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+
+ pages[i] = page;
+ i_done++;
+ }
+ if (!i_done || ret)
+ goto out;
+
+ if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ goto out;
+
+ /*
+ * so now we have a nice long stream of locked
+ * and up-to-date pages, let's wait on them
+ */
+ for (i = 0; i < i_done; i++)
+ wait_on_page_writeback(pages[i]);
+
+ page_start = page_offset(pages[0]);
+ page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, 0, &cached_state);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
+ page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+ &cached_state, GFP_NOFS);
+
+ if (i_done != page_cnt) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_delalloc_release_space(inode,
+ (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+ }
+
+
+ set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
+ &cached_state, GFP_NOFS);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, &cached_state,
+ GFP_NOFS);
+
+ for (i = 0; i < i_done; i++) {
+ clear_page_dirty_for_io(pages[i]);
+ ClearPageChecked(pages[i]);
+ set_page_extent_mapped(pages[i]);
+ set_page_dirty(pages[i]);
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ return i_done;
+out:
+ for (i = 0; i < i_done; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+ return ret;
+
+}
+
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct file_ra_state *ra = NULL;
+ unsigned long last_index;
+ u64 isize = i_size_read(inode);
u64 last_len = 0;
u64 skip = 0;
u64 defrag_end = 0;
+ u64 newer_off = range->start;
unsigned long i;
+ unsigned long ra_index = 0;
int ret;
+ int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int extent_thresh = range->extent_thresh;
+ unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long cluster = max_cluster;
+ u64 new_align = ~((u64)128 * 1024 - 1);
+ struct page **pages = NULL;
+
+ if (isize == 0)
+ return 0;
+
+ if (range->start >= isize)
+ return -EINVAL;
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
if (range->compress_type > BTRFS_COMPRESS_TYPES)
@@ -664,23 +1310,82 @@ static int btrfs_defrag_file(struct file *file,
compress_type = range->compress_type;
}
- if (inode->i_size == 0)
- return 0;
+ if (extent_thresh == 0)
+ extent_thresh = 256 * 1024;
+
+ /*
+ * if we were not given a file, allocate a readahead
+ * context
+ */
+ if (!file) {
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+ file_ra_state_init(ra, inode->i_mapping);
+ } else {
+ ra = &file->f_ra;
+ }
+ pages = kmalloc_array(max_cluster, sizeof(struct page *),
+ GFP_NOFS);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out_ra;
+ }
+
+ /* find the last page to defrag */
if (range->start + range->len > range->start) {
- last_index = min_t(u64, inode->i_size - 1,
+ last_index = min_t(u64, isize - 1,
range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
} else {
- last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ }
+
+ if (newer_than) {
+ ret = find_new_extents(root, inode, newer_than,
+ &newer_off, 64 * 1024);
+ if (!ret) {
+ range->start = newer_off;
+ /*
+ * we always align our defrag to help keep
+ * the extents in the file evenly spaced
+ */
+ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ } else
+ goto out_ra;
+ } else {
+ i = range->start >> PAGE_CACHE_SHIFT;
}
+ if (!max_to_defrag)
+ max_to_defrag = last_index + 1;
+
+ /*
+ * make writeback start from i, so the defrag range can be
+ * written sequentially.
+ */
+ if (i < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = i;
+
+ while (i <= last_index && defrag_count < max_to_defrag &&
+ (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT)) {
+ /*
+ * make sure we stop running if someone unmounts
+ * the FS
+ */
+ if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ break;
+
+ if (btrfs_defrag_cancelled(root->fs_info)) {
+ printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+ ret = -EAGAIN;
+ break;
+ }
- i = range->start >> PAGE_CACHE_SHIFT;
- while (i <= last_index) {
if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE,
- range->extent_thresh,
- &last_len, &skip,
- &defrag_end)) {
+ extent_thresh, &last_len, &skip,
+ &defrag_end, range->flags &
+ BTRFS_DEFRAG_RANGE_COMPRESS)) {
unsigned long next;
/*
* the should_defrag function tells us how much to skip
@@ -691,95 +1396,70 @@ static int btrfs_defrag_file(struct file *file,
continue;
}
- if (total_read % ra_pages == 0) {
- btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
- min(last_index, i + ra_pages - 1));
- }
- total_read++;
- mutex_lock(&inode->i_mutex);
- if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
- BTRFS_I(inode)->force_compress = compress_type;
-
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
- if (ret)
- goto err_unlock;
-again:
- if (inode->i_size == 0 ||
- i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
- ret = 0;
- goto err_reservations;
+ if (!newer_than) {
+ cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
+ PAGE_CACHE_SHIFT) - i;
+ cluster = min(cluster, max_cluster);
+ } else {
+ cluster = max_cluster;
}
- page = grab_cache_page(inode->i_mapping, i);
- if (!page) {
- ret = -ENOMEM;
- goto err_reservations;
+ if (i + cluster > ra_index) {
+ ra_index = max(i, ra_index);
+ btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
+ cluster);
+ ra_index += max_cluster;
}
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- page_cache_release(page);
- ret = -EIO;
- goto err_reservations;
- }
- }
-
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- page_cache_release(page);
- goto again;
+ mutex_lock(&inode->i_mutex);
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+ BTRFS_I(inode)->force_compress = compress_type;
+ ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ if (ret < 0) {
+ mutex_unlock(&inode->i_mutex);
+ goto out_ra;
}
- wait_on_page_writeback(page);
-
- if (PageDirty(page)) {
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
- goto loop_unlock;
- }
+ defrag_count += ret;
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ mutex_unlock(&inode->i_mutex);
- page_start = (u64)page->index << PAGE_CACHE_SHIFT;
- page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ if (newer_than) {
+ if (newer_off == (u64)-1)
+ break;
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- goto again;
+ if (ret > 0)
+ i += ret;
+
+ newer_off = max(newer_off + 1,
+ (u64)i << PAGE_CACHE_SHIFT);
+
+ ret = find_new_extents(root, inode,
+ newer_than, &newer_off,
+ 64 * 1024);
+ if (!ret) {
+ range->start = newer_off;
+ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ } else {
+ break;
+ }
+ } else {
+ if (ret > 0) {
+ i += ret;
+ last_len += ret << PAGE_CACHE_SHIFT;
+ } else {
+ i++;
+ last_len = 0;
+ }
}
- set_page_extent_mapped(page);
-
- /*
- * this makes sure page_mkwrite is called on the
- * page if it is dirtied again later
- */
- clear_page_dirty_for_io(page);
- clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
- page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, GFP_NOFS);
-
- btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
- ClearPageChecked(page);
- set_page_dirty(page);
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-loop_unlock:
- unlock_page(page);
- page_cache_release(page);
- mutex_unlock(&inode->i_mutex);
-
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
- i++;
}
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+ if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
/* the filemap_flush will queue IO into the worker threads, but
@@ -794,73 +1474,96 @@ loop_unlock:
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&root->fs_info->async_submit_draining);
-
- mutex_lock(&inode->i_mutex);
- BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
- mutex_unlock(&inode->i_mutex);
}
- disk_super = &root->fs_info->super_copy;
- features = btrfs_super_incompat_flags(disk_super);
if (range->compress_type == BTRFS_COMPRESS_LZO) {
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
- btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
}
- return 0;
+ ret = defrag_count;
-err_reservations:
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-err_unlock:
- mutex_unlock(&inode->i_mutex);
+out_ra:
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+ mutex_lock(&inode->i_mutex);
+ BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+ mutex_unlock(&inode->i_mutex);
+ }
+ if (!file)
+ kfree(ra);
+ kfree(pages);
return ret;
}
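
btrfs_defrag_file() above is driven from BTRFS_IOC_DEFRAG_RANGE (see btrfs_ioctl_defrag() further down). A rough user-space sketch, assuming the uapi struct btrfs_ioctl_defrag_range_args and the BTRFS_DEFRAG_RANGE_* flags are available from <linux/btrfs.h>; the numeric compress_type value is an assumption that mirrors the kernel's internal BTRFS_COMPRESS_* enum rather than a uapi constant:

/* defrag_range.c - illustration only */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_DEFRAG_RANGE, struct btrfs_ioctl_defrag_range_args */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);	/* S_IFREG path above requires FMODE_WRITE */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;			/* whole file, same default the ioctl uses */
	range.extent_thresh = 256 * 1024;	/* matches the kernel fallback above */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS | BTRFS_DEFRAG_RANGE_START_IO;
	range.compress_type = 2;		/* assumed: 2 == lzo in the kernel enum */

	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");

	close(fd);
	return 0;
}
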
-static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
u64 new_size;
u64 old_size;
u64 devid = 1;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
char *sizestr;
+ char *retptr;
char *devstr = NULL;
int ret = 0;
int mod = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ mnt_drop_write_file(file);
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- mutex_lock(&root->fs_info->volume_mutex);
sizestr = vol_args->name;
devstr = strchr(sizestr, ':');
if (devstr) {
- char *end;
sizestr = devstr + 1;
*devstr = '\0';
devstr = vol_args->name;
- devid = simple_strtoull(devstr, &end, 10);
- printk(KERN_INFO "resizing devid %llu\n",
- (unsigned long long)devid);
+ ret = kstrtoull(devstr, 10, &devid);
+ if (ret)
+ goto out_free;
+ if (!devid) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+ btrfs_info(root->fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(root, devid, NULL, NULL);
+
+ device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
if (!device) {
- printk(KERN_INFO "resizer unable to find device %llu\n",
- (unsigned long long)devid);
- ret = -EINVAL;
- goto out_unlock;
+ btrfs_info(root->fs_info, "resizer unable to find device %llu",
+ devid);
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ if (!device->writeable) {
+ btrfs_info(root->fs_info,
+ "resizer unable to apply on readonly device %llu",
+ devid);
+ ret = -EPERM;
+ goto out_free;
}
+
if (!strcmp(sizestr, "max"))
new_size = device->bdev->bd_inode->i_size;
else {
@@ -871,103 +1574,125 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
mod = 1;
sizestr++;
}
- new_size = memparse(sizestr, NULL);
- if (new_size == 0) {
+ new_size = memparse(sizestr, &retptr);
+ if (*retptr != '\0' || new_size == 0) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
}
+ if (device->is_tgtdev_for_dev_replace) {
+ ret = -EPERM;
+ goto out_free;
+ }
+
old_size = device->total_bytes;
if (mod < 0) {
if (new_size > old_size) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
new_size = old_size - new_size;
} else if (mod > 0) {
+ if (new_size > ULLONG_MAX - old_size) {
+ ret = -ERANGE;
+ goto out_free;
+ }
new_size = old_size + new_size;
}
if (new_size < 256 * 1024 * 1024) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
if (new_size > device->bdev->bd_inode->i_size) {
ret = -EFBIG;
- goto out_unlock;
+ goto out_free;
}
do_div(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk(KERN_INFO "new size for %s is %llu\n",
- device->name, (unsigned long long)new_size);
+ printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+ rcu_str_deref(device->name), new_size);
if (new_size > old_size) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out_unlock;
+ goto out_free;
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
- } else {
+ } else if (new_size < old_size) {
ret = btrfs_shrink_device(device, new_size);
- }
+ } /* equal, nothing need to do */
-out_unlock:
- mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
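
The name string parsed by the handler above is the "[devid:]size" text supplied by user space, where size may be "max" or carry a leading '+'/'-'. A minimal caller sketch, assuming BTRFS_IOC_RESIZE and struct btrfs_ioctl_vol_args from the uapi <linux/btrfs.h>:

/* resize.c - illustration of the "[devid:][+|-]size|max" string the handler parses */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_RESIZE, struct btrfs_ioctl_vol_args */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_vol_args args;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <mountpoint> <[devid:][+|-]size|max>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	/* e.g. "max", "+2g", "1:-500m"; devid defaults to 1 in the handler */
	strncpy(args.name, argv[2], BTRFS_PATH_NAME_MAX);

	if (ioctl(fd, BTRFS_IOC_RESIZE, &args) < 0)
		perror("BTRFS_IOC_RESIZE");

	close(fd);
	return 0;
}
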
static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
- char *name,
- unsigned long fd,
- int subvol,
- u64 *transid,
- bool readonly)
-{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
- struct file *src_file;
+ char *name, unsigned long fd, int subvol,
+ u64 *transid, bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
+{
int namelen;
int ret = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
namelen = strlen(name);
if (strchr(name, '/')) {
ret = -EINVAL;
- goto out;
+ goto out_drop_write;
+ }
+
+ if (name[0] == '.' &&
+ (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+ ret = -EEXIST;
+ goto out_drop_write;
}
if (subvol) {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
- NULL, transid, readonly);
+ NULL, transid, readonly, inherit);
} else {
+ struct fd src = fdget(fd);
struct inode *src_inode;
- src_file = fget(fd);
- if (!src_file) {
+ if (!src.file) {
ret = -EINVAL;
- goto out;
+ goto out_drop_write;
}
- src_inode = src_file->f_path.dentry->d_inode;
- if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
- printk(KERN_INFO "btrfs: Snapshot src from "
- "another FS\n");
- ret = -EINVAL;
- fput(src_file);
- goto out;
+ src_inode = file_inode(src.file);
+ if (src_inode->i_sb != file_inode(file)->i_sb) {
+ btrfs_info(BTRFS_I(src_inode)->root->fs_info,
+ "Snapshot src from another FS");
+ ret = -EXDEV;
+ } else if (!inode_owner_or_capable(src_inode)) {
+ /*
+ * Subvolume creation is not restricted, but snapshots
+ * are limited to the caller's own subvolumes only
+ */
+ ret = -EPERM;
+ } else {
+ ret = btrfs_mksubvol(&file->f_path, name, namelen,
+ BTRFS_I(src_inode)->root,
+ transid, readonly, inherit);
}
- ret = btrfs_mksubvol(&file->f_path, name, namelen,
- BTRFS_I(src_inode)->root,
- transid, readonly);
- fput(src_file);
+ fdput(src);
}
+out_drop_write:
+ mnt_drop_write_file(file);
out:
return ret;
}
@@ -985,7 +1710,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol,
- NULL, false);
+ NULL, false, NULL);
kfree(vol_args);
return ret;
@@ -999,6 +1724,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
u64 transid = 0;
u64 *ptr = NULL;
bool readonly = false;
+ struct btrfs_qgroup_inherit *inherit = NULL;
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
@@ -1006,7 +1732,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
if (vol_args->flags &
- ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+ ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
+ BTRFS_SUBVOL_QGROUP_INHERIT)) {
ret = -EOPNOTSUPP;
goto out;
}
@@ -1015,10 +1742,21 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ptr = &transid;
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
+ if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
+ if (vol_args->size > PAGE_CACHE_SIZE) {
+ ret = -EINVAL;
+ goto out;
+ }
+ inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
+ if (IS_ERR(inherit)) {
+ ret = PTR_ERR(inherit);
+ goto out;
+ }
+ }
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
- vol_args->fd, subvol,
- ptr, readonly);
+ vol_args->fd, subvol, ptr,
+ readonly, inherit);
if (ret == 0 && ptr &&
copy_to_user(arg +
@@ -1027,18 +1765,19 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ret = -EFAULT;
out:
kfree(vol_args);
+ kfree(inherit);
return ret;
}
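
The readonly and qgroup-inherit flags handled above arrive via the v2 snapshot ioctl. A user-space sketch of creating a read-only snapshot, assuming BTRFS_IOC_SNAP_CREATE_V2 and struct btrfs_ioctl_vol_args_v2 from the uapi <linux/btrfs.h>:

/* snap_ro.c - creates a read-only snapshot; illustration only */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_SNAP_CREATE_V2, struct btrfs_ioctl_vol_args_v2 */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int src_fd, dst_fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s <source-subvol> <dest-dir> <snap-name>\n", argv[0]);
		return 1;
	}

	src_fd = open(argv[1], O_RDONLY);	/* subvolume being snapshotted (vol_args->fd) */
	dst_fd = open(argv[2], O_RDONLY);	/* directory the snapshot is created in */
	if (src_fd < 0 || dst_fd < 0) {
		perror("open");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	args.fd = src_fd;
	args.flags = BTRFS_SUBVOL_RDONLY;	/* sets "readonly" in the handler above */
	strncpy(args.name, argv[3], BTRFS_SUBVOL_NAME_MAX);

	if (ioctl(dst_fd, BTRFS_IOC_SNAP_CREATE_V2, &args) < 0)
		perror("BTRFS_IOC_SNAP_CREATE_V2");

	close(src_fd);
	close(dst_fd);
	return 0;
}
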
static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
void __user *arg)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
u64 flags = 0;
- if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
down_read(&root->fs_info->subvol_sem);
@@ -1055,44 +1794,69 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
void __user *arg)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 root_flags;
u64 flags;
int ret = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
- if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
- return -EINVAL;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
- if (copy_from_user(&flags, arg, sizeof(flags)))
- return -EFAULT;
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
- if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
- return -EINVAL;
+ if (copy_from_user(&flags, arg, sizeof(flags))) {
+ ret = -EFAULT;
+ goto out_drop_write;
+ }
- if (flags & ~BTRFS_SUBVOL_RDONLY)
- return -EOPNOTSUPP;
+ if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
- if (!inode_owner_or_capable(inode))
- return -EACCES;
+ if (flags & ~BTRFS_SUBVOL_RDONLY) {
+ ret = -EOPNOTSUPP;
+ goto out_drop_write;
+ }
down_write(&root->fs_info->subvol_sem);
/* nothing to do */
if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
- goto out;
+ goto out_drop_sem;
root_flags = btrfs_root_flags(&root->root_item);
- if (flags & BTRFS_SUBVOL_RDONLY)
+ if (flags & BTRFS_SUBVOL_RDONLY) {
btrfs_set_root_flags(&root->root_item,
root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
- else
- btrfs_set_root_flags(&root->root_item,
+ } else {
+ /*
+ * Block RO -> RW transition if this subvolume is involved in
+ * send
+ */
+ spin_lock(&root->root_item_lock);
+ if (root->send_in_progress == 0) {
+ btrfs_set_root_flags(&root->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
+ spin_unlock(&root->root_item_lock);
+ } else {
+ spin_unlock(&root->root_item_lock);
+ btrfs_warn(root->fs_info,
+ "Attempt to set subvolume %llu read-write during send",
+ root->root_key.objectid);
+ ret = -EPERM;
+ goto out_drop_sem;
+ }
+ }
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
@@ -1107,8 +1871,11 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
out_reset:
if (ret)
btrfs_set_root_flags(&root->root_item, root_flags);
-out:
+out_drop_sem:
up_write(&root->fs_info->subvol_sem);
+out_drop_write:
+ mnt_drop_write_file(file);
+out:
return ret;
}
@@ -1118,13 +1885,30 @@ out:
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
struct btrfs_path *path;
+ struct btrfs_dir_item *di;
struct btrfs_key key;
+ u64 dir_id;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ /* Make sure this root isn't set as the default subvol */
+ dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+ di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
+ dir_id, "default", 7, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.objectid == root->root_key.objectid) {
+ ret = -EPERM;
+ btrfs_err(root->fs_info, "deleting default subvolume "
+ "%llu is not allowed", key.objectid);
+ goto out;
+ }
+ btrfs_release_path(path);
+ }
+
key.objectid = root->root_key.objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = (u64)-1;
@@ -1176,7 +1960,8 @@ static noinline int copy_to_sk(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- char *buf,
+ size_t *buf_size,
+ char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
{
@@ -1188,7 +1973,6 @@ static noinline int copy_to_sk(struct btrfs_root *root,
int nritems;
int i;
int slot;
- int found = 0;
int ret = 0;
leaf = path->nodes[0];
@@ -1205,19 +1989,31 @@ static noinline int copy_to_sk(struct btrfs_root *root,
item_off = btrfs_item_ptr_offset(leaf, i);
item_len = btrfs_item_size_nr(leaf, i);
- if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+ btrfs_item_key_to_cpu(leaf, key, i);
+ if (!key_in_sk(key, sk))
+ continue;
+
+ if (sizeof(sh) + item_len > *buf_size) {
+ if (*num_found) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * return one empty item back for v1, which does not
+ * handle -EOVERFLOW
+ */
+
+ *buf_size = sizeof(sh) + item_len;
item_len = 0;
+ ret = -EOVERFLOW;
+ }
- if (sizeof(sh) + item_len + *sk_offset >
- BTRFS_SEARCH_ARGS_BUFSIZE) {
+ if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
ret = 1;
- goto overflow;
+ goto out;
}
- btrfs_item_key_to_cpu(leaf, key, i);
- if (!key_in_sk(key, sk))
- continue;
-
sh.objectid = key->objectid;
sh.offset = key->offset;
sh.type = key->type;
@@ -1225,20 +2021,33 @@ static noinline int copy_to_sk(struct btrfs_root *root,
sh.transid = found_transid;
/* copy search result header */
- memcpy(buf + *sk_offset, &sh, sizeof(sh));
+ if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += sizeof(sh);
if (item_len) {
- char *p = buf + *sk_offset;
+ char __user *up = ubuf + *sk_offset;
/* copy the item */
- read_extent_buffer(leaf, p,
- item_off, item_len);
+ if (read_extent_buffer_to_user(leaf, up,
+ item_off, item_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += item_len;
}
- found++;
+ (*num_found)++;
- if (*num_found >= sk->nr_items)
- break;
+ if (ret) /* -EOVERFLOW from above */
+ goto out;
+
+ if (*num_found >= sk->nr_items) {
+ ret = 1;
+ goto out;
+ }
}
advance_key:
ret = 0;
@@ -1253,24 +2062,37 @@ advance_key:
key->objectid++;
} else
ret = 1;
-overflow:
- *num_found += found;
+out:
+ /*
+ * 0: all items from this leaf copied, continue with next
+ * 1: * more items can be copied, but unused buffer is too small
+ * * all items were found
+ * Either way, it will stop the loop which iterates to the next
+ * leaf
+ * -EOVERFLOW: item was too large for buffer
+ * -EFAULT: could not copy extent buffer back to userspace
+ */
return ret;
}
static noinline int search_ioctl(struct inode *inode,
- struct btrfs_ioctl_search_args *args)
+ struct btrfs_ioctl_search_key *sk,
+ size_t *buf_size,
+ char __user *ubuf)
{
struct btrfs_root *root;
struct btrfs_key key;
- struct btrfs_key max_key;
struct btrfs_path *path;
- struct btrfs_ioctl_search_key *sk = &args->key;
struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
int ret;
int num_found = 0;
unsigned long sk_offset = 0;
+ if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
+ *buf_size = sizeof(struct btrfs_ioctl_search_header);
+ return -EOVERFLOW;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1284,7 +2106,7 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "could not find root %llu\n",
+ printk(KERN_ERR "BTRFS: could not find root %llu\n",
sk->tree_id);
btrfs_free_path(path);
return -ENOENT;
@@ -1295,28 +2117,24 @@ static noinline int search_ioctl(struct inode *inode,
key.type = sk->min_type;
key.offset = sk->min_offset;
- max_key.objectid = sk->max_objectid;
- max_key.type = sk->max_type;
- max_key.offset = sk->max_offset;
-
path->keep_locks = 1;
- while(1) {
- ret = btrfs_search_forward(root, &key, &max_key, path, 0,
- sk->min_transid);
+ while (1) {
+ ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
ret = 0;
goto err;
}
- ret = copy_to_sk(root, path, &key, sk, args->buf,
+ ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
- btrfs_release_path(root, path);
- if (ret || num_found >= sk->nr_items)
+ btrfs_release_path(path);
+ if (ret)
break;
}
- ret = 0;
+ if (ret > 0)
+ ret = 0;
err:
sk->nr_items = num_found;
btrfs_free_path(path);
@@ -1326,22 +2144,73 @@ err:
static noinline int btrfs_ioctl_tree_search(struct file *file,
void __user *argp)
{
- struct btrfs_ioctl_search_args *args;
- struct inode *inode;
- int ret;
+ struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_key sk;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- args = memdup_user(argp, sizeof(*args));
- if (IS_ERR(args))
- return PTR_ERR(args);
+ uargs = (struct btrfs_ioctl_search_args __user *)argp;
- inode = fdentry(file)->d_inode;
- ret = search_ioctl(inode, args);
- if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
+ return -EFAULT;
+
+ buf_size = sizeof(uargs->buf);
+
+ inode = file_inode(file);
+ ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+
+ /*
+ * In the original implementation an overflow is handled by returning a
+ * search header with a len of zero, so reset ret.
+ */
+ if (ret == -EOVERFLOW)
+ ret = 0;
+
+ if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
ret = -EFAULT;
- kfree(args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+ void __user *argp)
+{
+ struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 args;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
+ const size_t buf_limit = 16 * 1024 * 1024;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* copy search header and buffer size */
+ uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
+ if (copy_from_user(&args, uarg, sizeof(args)))
+ return -EFAULT;
+
+ buf_size = args.buf_size;
+
+ if (buf_size < sizeof(struct btrfs_ioctl_search_header))
+ return -EOVERFLOW;
+
+ /* limit result size to 16MB */
+ if (buf_size > buf_limit)
+ buf_size = buf_limit;
+
+ inode = file_inode(file);
+ ret = search_ioctl(inode, &args.key, &buf_size,
+ (char *)(&uarg->buf[0]));
+ if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
+ ret = -EFAULT;
+ else if (ret == -EOVERFLOW &&
+ copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
+ ret = -EFAULT;
+
return ret;
}
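
Both search variants above consume the same key layout from user space; v1 uses the fixed buf in struct btrfs_ioctl_search_args, v2 lets the caller size the buffer. A minimal sketch of driving the v1 ioctl (CAP_SYS_ADMIN is required, as checked above), assuming struct btrfs_ioctl_search_args and BTRFS_IOC_TREE_SEARCH from the uapi <linux/btrfs.h>; it dumps item headers from tree 1, the root tree:

/* tree_search.c - walks item headers in the root tree; illustration only */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_TREE_SEARCH, struct btrfs_ioctl_search_* */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_search_args args;
	struct btrfs_ioctl_search_header sh;
	unsigned long off;
	unsigned int i;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	args.key.tree_id = 1;			/* root tree */
	args.key.max_objectid = (__u64)-1;	/* min_* fields are already 0 */
	args.key.max_offset = (__u64)-1;
	args.key.max_transid = (__u64)-1;
	args.key.max_type = 255;
	args.key.nr_items = 16;			/* at most 16 items per call */

	if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) {
		perror("BTRFS_IOC_TREE_SEARCH");
		return 1;
	}

	/* nr_items is rewritten to the number found; each result is a header + raw item */
	for (i = 0, off = 0; i < args.key.nr_items; i++) {
		memcpy(&sh, args.buf + off, sizeof(sh));
		printf("objectid %llu type %u offset %llu len %u\n",
		       (unsigned long long)sh.objectid, sh.type,
		       (unsigned long long)sh.offset, sh.len);
		off += sizeof(sh) + sh.len;
	}

	close(fd);
	return 0;
}
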
@@ -1379,7 +2248,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "could not find root %llu\n", tree_id);
+ printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
ret = -ENOENT;
goto out;
}
@@ -1388,46 +2257,47 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
- while(1) {
+ while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
+ else if (ret > 0) {
+ ret = btrfs_previous_item(root, path, dirid,
+ BTRFS_INODE_REF_KEY);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
l = path->nodes[0];
slot = path->slots[0];
- if (ret > 0 && slot > 0)
- slot--;
btrfs_item_key_to_cpu(l, &key, slot);
- if (ret > 0 && (key.objectid != dirid ||
- key.type != BTRFS_INODE_REF_KEY)) {
- ret = -ENOENT;
- goto out;
- }
-
iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(l, iref);
ptr -= len + 1;
total_len += len + 1;
- if (ptr < name)
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
goto out;
+ }
*(ptr + len) = '/';
- read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
+ read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
break;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.objectid = key.offset;
key.offset = (u64)-1;
dirid = key.objectid;
-
}
- if (ptr < name)
- goto out;
- memcpy(name, ptr, total_len);
- name[total_len]='\0';
+ memmove(name, ptr, total_len);
+ name[total_len] = '\0';
ret = 0;
out:
btrfs_free_path(path);
@@ -1448,7 +2318,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
if (IS_ERR(args))
return PTR_ERR(args);
- inode = fdentry(file)->d_inode;
+ inode = file_inode(file);
if (args->treeid == 0)
args->treeid = BTRFS_I(inode)->root->root_key.objectid;
@@ -1467,7 +2337,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
void __user *arg)
{
- struct dentry *parent = fdentry(file);
+ struct dentry *parent = file->f_path.dentry;
struct dentry *dentry;
struct inode *dir = parent->d_inode;
struct inode *inode;
@@ -1475,6 +2345,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv block_rsv;
+ u64 root_flags;
+ u64 qgroup_reserved;
int namelen;
int ret;
int err = 0;
@@ -1491,11 +2364,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out;
}
- err = mnt_want_write(file->f_path.mnt);
+ err = mnt_want_write_file(file);
if (err)
goto out;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+
+ err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ if (err == -EINTR)
+ goto out_drop_write;
dentry = lookup_one_len(vol_args->name, parent, namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
@@ -1509,7 +2385,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
inode = dentry->d_inode;
dest = BTRFS_I(inode)->root;
- if (!capable(CAP_SYS_ADMIN)){
+ if (!capable(CAP_SYS_ADMIN)) {
/*
* Regular user. Only allow this with a special mount
* option, when the user has write+exec access to the
@@ -1541,19 +2417,40 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
-
- /* check if subvolume may be deleted by a non-root user */
- err = btrfs_may_delete(dir, dentry, 1);
- if (err)
- goto out_dput;
}
- if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /* check if subvolume may be deleted by a user */
+ err = btrfs_may_delete(dir, dentry, 1);
+ if (err)
+ goto out_dput;
+
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out_dput;
}
mutex_lock(&inode->i_mutex);
+
+ /*
+ * Don't allow deleting a subvolume with send in progress. This is
+ * inside the i_mutex so the error handling that has to drop the bit
+ * again is not run concurrently.
+ */
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ if (dest->send_in_progress == 0) {
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ } else {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(root->fs_info,
+ "Attempt to delete subvolume %llu during send",
+ dest->root_key.objectid);
+ err = -EPERM;
+ goto out_dput;
+ }
+
err = d_invalidate(dentry);
if (err)
goto out_unlock;
@@ -1564,18 +2461,33 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out_up_write;
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * One for dir inode, two for dir entries, two for root
+ * ref/backref.
+ */
+ err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ 5, &qgroup_reserved, true);
+ if (err)
+ goto out_up_write;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- goto out_up_write;
+ goto out_release;
}
- trans->block_rsv = &root->fs_info->global_block_rsv;
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
ret = btrfs_unlink_subvol(trans, root, dir,
dest->root_key.objectid,
dentry->d_name.name,
dentry->d_name.len);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_end_trans;
+ }
btrfs_record_root_in_trans(trans, dest);
@@ -1584,30 +2496,75 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_item.drop_level = 0;
btrfs_set_root_refs(&dest->root_item, 0);
- if (!xchg(&dest->orphan_item_inserted, 1)) {
+ if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
root->fs_info->tree_root,
dest->root_key.objectid);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
}
+ ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+ dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+ dest->root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ }
+
+out_end_trans:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
+ if (ret && !err)
+ err = ret;
inode->i_flags |= S_DEAD;
+out_release:
+ btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
out_up_write:
up_write(&root->fs_info->subvol_sem);
out_unlock:
+ if (err) {
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ }
mutex_unlock(&inode->i_mutex);
if (!err) {
shrink_dcache_sb(root->fs_info->sb);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
+ ASSERT(dest->send_in_progress == 0);
+
+ /* the last ref */
+ if (dest->cache_inode) {
+ iput(dest->cache_inode);
+ dest->cache_inode = NULL;
+ }
}
out_dput:
dput(dentry);
out_unlock_dir:
mutex_unlock(&dir->i_mutex);
- mnt_drop_write(file->f_path.mnt);
+out_drop_write:
+ mnt_drop_write_file(file);
out:
kfree(vol_args);
return err;
@@ -1615,28 +2572,30 @@ out:
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_defrag_range_args *range;
int ret;
- if (btrfs_root_readonly(root))
- return -EROFS;
-
- ret = mnt_want_write(file->f_path.mnt);
+ ret = mnt_want_write_file(file);
if (ret)
return ret;
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
+ }
+
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out;
}
- ret = btrfs_defrag_root(root, 0);
+ ret = btrfs_defrag_root(root);
if (ret)
goto out;
- ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
+ ret = btrfs_defrag_root(root->fs_info->extent_root);
break;
case S_IFREG:
if (!(file->f_mode & FMODE_WRITE)) {
@@ -1666,14 +2625,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range->len = (u64)-1;
}
- ret = btrfs_defrag_file(file, range);
+ ret = btrfs_defrag_file(file_inode(file), file,
+ range, 0, 0);
+ if (ret > 0)
+ ret = 0;
kfree(range);
break;
default:
ret = -EINVAL;
}
out:
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
return ret;
}
@@ -1685,171 +2647,628 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(root, vol_args->name);
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
return ret;
}
-static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_ioctl_vol_args *vol_args;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out;
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
ret = btrfs_rm_device(root, vol_args->name);
+ mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+out:
kfree(vol_args);
+ mnt_drop_write_file(file);
return ret;
}
-static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_fs_info_args *fi_args;
+ struct btrfs_device *device;
+ struct btrfs_device *next;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ int ret = 0;
+
+ fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
+ if (!fi_args)
+ return -ENOMEM;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ fi_args->num_devices = fs_devices->num_devices;
+ memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
+
+ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+ if (device->devid > fi_args->max_id)
+ fi_args->max_id = device->devid;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ fi_args->nodesize = root->fs_info->super_copy->nodesize;
+ fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
+ fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
+
+ if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
+ ret = -EFAULT;
+
+ kfree(fi_args);
+ return ret;
+}
+
+static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_dev_info_args *di_args;
+ struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ int ret = 0;
+ char *s_uuid = NULL;
+
+ di_args = memdup_user(arg, sizeof(*di_args));
+ if (IS_ERR(di_args))
+ return PTR_ERR(di_args);
+
+ if (!btrfs_is_empty_uuid(di_args->uuid))
+ s_uuid = di_args->uuid;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
+
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ di_args->devid = dev->devid;
+ di_args->bytes_used = dev->bytes_used;
+ di_args->total_bytes = dev->total_bytes;
+ memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
+ if (dev->name) {
+ struct rcu_string *name;
+
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ strncpy(di_args->path, name->str, sizeof(di_args->path));
+ rcu_read_unlock();
+ di_args->path[sizeof(di_args->path) - 1] = 0;
+ } else {
+ di_args->path[0] = '\0';
+ }
+
+out:
+ mutex_unlock(&fs_devices->device_list_mutex);
+ if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
+ ret = -EFAULT;
+
+ kfree(di_args);
+ return ret;
+}
+
+static struct page *extent_same_get_page(struct inode *inode, u64 off)
+{
+ struct page *page;
+ pgoff_t index;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+
+ index = off >> PAGE_CACHE_SHIFT;
+
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page)
+ return NULL;
+
+ if (!PageUptodate(page)) {
+ if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
+ 0))
+ return NULL;
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ return NULL;
+ }
+ }
+ unlock_page(page);
+
+ return page;
+}
+
+static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+{
+ /* do any pending delalloc/csum calc on src, one way or
+ another, and lock file content */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ off + len - 1);
+ if ((!ordered ||
+ ordered->file_offset + ordered->len <= off ||
+ ordered->file_offset >= off + len) &&
+ !test_range_bit(&BTRFS_I(inode)->io_tree, off,
+ off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ btrfs_wait_ordered_range(inode, off, len);
+ }
+}
+
+static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+
+ mutex_unlock(&inode1->i_mutex);
+ mutex_unlock(&inode2->i_mutex);
+}
+
+static void btrfs_double_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ if (inode1 < inode2) {
+ swap(inode1, inode2);
+ swap(loff1, loff2);
+ }
+
+ mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+ lock_extent_range(inode1, loff1, len);
+ if (inode1 != inode2) {
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ lock_extent_range(inode2, loff2, len);
+ }
+}
+
+static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
+ u64 dst_loff, u64 len)
+{
+ int ret = 0;
+ struct page *src_page, *dst_page;
+ unsigned int cmp_len = PAGE_CACHE_SIZE;
+ void *addr, *dst_addr;
+
+ while (len) {
+ if (len < PAGE_CACHE_SIZE)
+ cmp_len = len;
+
+ src_page = extent_same_get_page(src, loff);
+ if (!src_page)
+ return -EINVAL;
+ dst_page = extent_same_get_page(dst, dst_loff);
+ if (!dst_page) {
+ page_cache_release(src_page);
+ return -EINVAL;
+ }
+ addr = kmap_atomic(src_page);
+ dst_addr = kmap_atomic(dst_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dst_page);
+
+ if (memcmp(addr, dst_addr, cmp_len))
+ ret = BTRFS_SAME_DATA_DIFFERS;
+
+ kunmap_atomic(addr);
+ kunmap_atomic(dst_addr);
+ page_cache_release(src_page);
+ page_cache_release(dst_page);
+
+ if (ret)
+ break;
+
+ loff += cmp_len;
+ dst_loff += cmp_len;
+ len -= cmp_len;
+ }
+
+ return ret;
+}
+
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+{
+ u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
+
+ if (off + len > inode->i_size || off + len < off)
+ return -EINVAL;
+ /* Check that we are block aligned - btrfs_clone() requires this */
+ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+ struct inode *dst, u64 dst_loff)
{
- struct inode *inode = fdentry(file)->d_inode;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct file *src_file;
- struct inode *src;
- struct btrfs_trans_handle *trans;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- char *buf;
- struct btrfs_key key;
- u32 nritems;
- int slot;
int ret;
- u64 len = olen;
- u64 bs = root->fs_info->sb->s_blocksize;
- u64 hint_byte;
/*
- * TODO:
- * - split compressed inline extents. annoying: we need to
- * decompress into destination's address_space (the file offset
- * may change, so source mapping won't do), then recompress (or
- * otherwise reinsert) a subrange.
- * - allow ranges within the same file to be cloned (provided
- * they don't overlap)?
+ * btrfs_clone() can't handle extents in the same file
+ * yet. Once that works, we can drop this check and replace it
+ * with a check for the same inode, but overlapping extents.
*/
-
- /* the destination must be opened for writing */
- if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
+ if (src == dst)
return -EINVAL;
- if (btrfs_root_readonly(root))
- return -EROFS;
+ btrfs_double_lock(src, loff, dst, dst_loff, len);
+
+ ret = extent_same_check_offsets(src, loff, len);
+ if (ret)
+ goto out_unlock;
- ret = mnt_want_write(file->f_path.mnt);
+ ret = extent_same_check_offsets(dst, dst_loff, len);
+ if (ret)
+ goto out_unlock;
+
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+ if (ret == 0)
+ ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+
+out_unlock:
+ btrfs_double_unlock(src, loff, dst, dst_loff, len);
+
+ return ret;
+}
+
+#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+
+static long btrfs_ioctl_file_extent_same(struct file *file,
+ struct btrfs_ioctl_same_args __user *argp)
+{
+ struct btrfs_ioctl_same_args *same;
+ struct btrfs_ioctl_same_extent_info *info;
+ struct inode *src = file_inode(file);
+ u64 off;
+ u64 len;
+ int i;
+ int ret;
+ unsigned long size;
+ u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+ bool is_admin = capable(CAP_SYS_ADMIN);
+ u16 count;
+
+ if (!(file->f_mode & FMODE_READ))
+ return -EINVAL;
+
+ ret = mnt_want_write_file(file);
if (ret)
return ret;
- src_file = fget(srcfd);
- if (!src_file) {
- ret = -EBADF;
- goto out_drop_write;
+ if (get_user(count, &argp->dest_count)) {
+ ret = -EFAULT;
+ goto out;
}
- src = src_file->f_dentry->d_inode;
+ size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
- ret = -EINVAL;
- if (src == inode)
- goto out_fput;
+ same = memdup_user(argp, size);
- /* the src must be open for reading */
- if (!(src_file->f_mode & FMODE_READ))
- goto out_fput;
+ if (IS_ERR(same)) {
+ ret = PTR_ERR(same);
+ goto out;
+ }
+
+ off = same->logical_offset;
+ len = same->length;
+
+ /*
+ * Limit the total length we will dedupe for each operation.
+ * This is intended to bound the total time spent in this
+ * ioctl to something sane.
+ */
+ if (len > BTRFS_MAX_DEDUPE_LEN)
+ len = BTRFS_MAX_DEDUPE_LEN;
+
+ if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
+ /*
+ * Btrfs does not support blocksize < page_size. As a
+ * result, btrfs_cmp_data() won't correctly handle
+ * this situation without an update.
+ */
+ ret = -EINVAL;
+ goto out;
+ }
ret = -EISDIR;
- if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
- goto out_fput;
+ if (S_ISDIR(src->i_mode))
+ goto out;
- ret = -EXDEV;
- if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
- goto out_fput;
+ ret = -EACCES;
+ if (!S_ISREG(src->i_mode))
+ goto out;
- ret = -ENOMEM;
- buf = vmalloc(btrfs_level_size(root, 0));
- if (!buf)
- goto out_fput;
+ /* pre-format output fields to sane values */
+ for (i = 0; i < count; i++) {
+ same->info[i].bytes_deduped = 0ULL;
+ same->info[i].status = 0;
+ }
- path = btrfs_alloc_path();
- if (!path) {
- vfree(buf);
- goto out_fput;
+ for (i = 0, info = same->info; i < count; i++, info++) {
+ struct inode *dst;
+ struct fd dst_file = fdget(info->fd);
+ if (!dst_file.file) {
+ info->status = -EBADF;
+ continue;
+ }
+ dst = file_inode(dst_file.file);
+
+ if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
+ info->status = -EINVAL;
+ } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
+ info->status = -EXDEV;
+ } else if (S_ISDIR(dst->i_mode)) {
+ info->status = -EISDIR;
+ } else if (!S_ISREG(dst->i_mode)) {
+ info->status = -EACCES;
+ } else {
+ info->status = btrfs_extent_same(src, off, len, dst,
+ info->logical_offset);
+ if (info->status == 0)
+ info->bytes_deduped += len;
+ }
+ fdput(dst_file);
}
- path->reada = 2;
- if (inode < src) {
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
- } else {
- mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ ret = copy_to_user(argp, same, size);
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ mnt_drop_write_file(file);
+ return ret;
+}
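
[editor note] btrfs_ioctl_file_extent_same() above backs the BTRFS_IOC_FILE_EXTENT_SAME deduplication ioctl. A hedged userspace sketch of driving it with a single destination, assuming linux/btrfs.h exports the ioctl number, struct btrfs_ioctl_same_args / btrfs_ioctl_same_extent_info and BTRFS_SAME_DATA_DIFFERS; fd names are hypothetical and the example is not part of this patch:

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    /* Sketch: dedupe 'len' bytes at offset 0 of src_fd against offset 0
     * of dst_fd.  The kernel fills in per-destination status and
     * bytes_deduped, mirroring the loop over same->info[] above. */
    static int dedupe_one_range(int src_fd, int dst_fd, __u64 len)
    {
            struct btrfs_ioctl_same_args *args;
            struct btrfs_ioctl_same_extent_info *info;
            int ret;

            args = calloc(1, sizeof(*args) + sizeof(*info));
            if (!args)
                    return -1;

            args->logical_offset = 0;
            args->length = len;             /* capped to 16 MiB per call by the kernel */
            args->dest_count = 1;

            info = &args->info[0];
            info->fd = dst_fd;
            info->logical_offset = 0;

            ret = ioctl(src_fd, BTRFS_IOC_FILE_EXTENT_SAME, args);
            if (ret == 0 && info->status == BTRFS_SAME_DATA_DIFFERS)
                    ret = 1;                /* ranges were not identical */

            free(args);
            return ret;
    }
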
+
+/* Helper to check and see if this root currently has a ref on the given disk
+ * bytenr. If it does then we need to update the quota for this root. This
+ * doesn't do anything if quotas aren't enabled.
+ */
+static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 disko)
+{
+ struct seq_list tree_mod_seq_elem = {};
+ struct ulist *roots;
+ struct ulist_iterator uiter;
+ struct ulist_node *root_node = NULL;
+ int ret;
+
+ if (!root->fs_info->quota_enabled)
+ return 1;
+
+ btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+ ret = btrfs_find_all_roots(trans, root->fs_info, disko,
+ tree_mod_seq_elem.seq, &roots);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ ULIST_ITER_INIT(&uiter);
+ while ((root_node = ulist_next(roots, &uiter))) {
+ if (root_node->val == root->objectid) {
+ ret = 1;
+ break;
+ }
}
+ ulist_free(roots);
+out:
+ btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+ return ret;
+}
- /* determine range to clone */
- ret = -EINVAL;
- if (off + len > src->i_size || off + len < off)
- goto out_unlock;
- if (len == 0)
- olen = len = src->i_size - off;
- /* if we extend to eof, continue to block boundary */
- if (off + len == src->i_size)
- len = ALIGN(src->i_size, bs) - off;
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ u64 endoff,
+ const u64 destoff,
+ const u64 olen)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
- /* verify the end result is block aligned */
- if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
- !IS_ALIGNED(destoff, bs))
- goto out_unlock;
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ /*
+ * We round up to the block size at eof when determining which
+ * extents to clone above, but shouldn't round up the file size.
+ */
+ if (endoff > destoff + olen)
+ endoff = destoff + olen;
+ if (endoff > inode->i_size)
+ btrfs_i_size_write(inode, endoff);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans, root);
+out:
+ return ret;
+}
+
+static void clone_update_extent_map(struct inode *inode,
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_path *path,
+ const u64 hole_offset,
+ const u64 hole_len)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ return;
+ }
+
+ if (path) {
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
+ em->generation = -1;
+ if (btrfs_file_extent_type(path->nodes[0], fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ } else {
+ em->start = hole_offset;
+ em->len = hole_len;
+ em->ram_bytes = em->len;
+ em->orig_start = hole_offset;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->block_len = 0;
+ em->orig_block_len = 0;
+ em->compress_type = BTRFS_COMPRESS_NONE;
+ em->generation = trans->transid;
+ }
- /* do any pending delalloc/csum calc on src, one way or
- another, and lock file content */
while (1) {
- struct btrfs_ordered_extent *ordered;
- lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
- ordered = btrfs_lookup_first_ordered_extent(src, off+len);
- if (!ordered &&
- !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
- EXTENT_DELALLOC, 0, NULL))
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
break;
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- btrfs_wait_ordered_range(src, off, len);
+ }
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ }
+
+ if (unlikely(ret))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+}
+
+/**
+ * btrfs_clone() - clone a range from inode file to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen, extent_same uses
+ * identical values here
+ * @destoff: Offset within @inode to start clone
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+ const u64 off, const u64 olen, const u64 olen_aligned,
+ const u64 destoff)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *leaf;
+ struct btrfs_trans_handle *trans;
+ char *buf = NULL;
+ struct btrfs_key key;
+ u32 nritems;
+ int slot;
+ int ret;
+ int no_quota;
+ const u64 len = olen_aligned;
+ u64 last_disko = 0;
+ u64 last_dest_end = destoff;
+
+ ret = -ENOMEM;
+ buf = vmalloc(btrfs_level_size(root, 0));
+ if (!buf)
+ return ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ vfree(buf);
+ return ret;
}
+ path->reada = 2;
/* clone data */
- key.objectid = src->i_ino;
+ key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
+ key.offset = off;
while (1) {
/*
* note the key will change type as we walk through the
* tree.
*/
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+ 0, 0);
if (ret < 0)
goto out;
+ /*
+ * First search, if no extent item that starts at offset off was
+ * found but the previous item is an extent item, it's possible
+ * it might overlap our target range, therefore process it.
+ */
+ if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0] - 1);
+ if (key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+ no_quota = 1;
if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
goto out;
if (ret > 0)
@@ -1861,7 +3280,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_item_key_to_cpu(leaf, &key, slot);
if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
- key.objectid != src->i_ino)
+ key.objectid != btrfs_ino(src))
break;
if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
@@ -1872,12 +3291,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 disko = 0, diskl = 0;
u64 datao = 0, datal = 0;
u8 comp;
- u64 endoff;
-
- size = btrfs_item_size_nr(leaf, slot);
- read_extent_buffer(leaf, buf,
- btrfs_item_ptr_offset(leaf, slot),
- size);
+ u64 drop_start;
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
@@ -1897,20 +3311,52 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datal = btrfs_file_extent_ram_bytes(leaf,
extent);
}
- btrfs_release_path(root, path);
- if (key.offset + datal <= off ||
- key.offset >= off+len)
- goto next;
+ /*
+ * The first search might have left us at an extent
+ * item that ends before our target range's start; this can
+ * happen if we have holes and the NO_HOLES feature enabled.
+ */
+ if (key.offset + datal <= off) {
+ path->slots[0]++;
+ goto process_slot;
+ } else if (key.offset >= off + len) {
+ break;
+ }
+
+ size = btrfs_item_size_nr(leaf, slot);
+ read_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ btrfs_release_path(path);
+ path->leave_spinning = 0;
memcpy(&new_key, &key, sizeof(new_key));
- new_key.objectid = inode->i_ino;
+ new_key.objectid = btrfs_ino(inode);
if (off <= key.offset)
new_key.offset = key.offset + destoff - off;
else
new_key.offset = destoff;
- trans = btrfs_start_transaction(root, 1);
+ /*
+ * Deal with a hole that doesn't have an extent item
+ * that represents it (NO_HOLES feature enabled).
+ * This hole is either in the middle of the cloning
+ * range or at the beginning (fully overlaps it or
+ * partially overlaps it).
+ */
+ if (new_key.offset != last_dest_end)
+ drop_start = last_dest_end;
+ else
+ drop_start = new_key.offset;
+
+ /*
+ * 1 - adjusting old extent (we may have to split it)
+ * 1 - add new extent
+ * 1 - inode update
+ */
+ trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
@@ -1918,23 +3364,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
+ /*
+ * a | --- range to clone ---| b
+ * | ------------- extent ------------- |
+ */
+
+ /* subtract range b */
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ /* subtract range a */
if (off > key.offset) {
datao += off - key.offset;
datal -= off - key.offset;
}
- if (key.offset + datal > off + len)
- datal = off + len - key.offset;
-
- ret = btrfs_drop_extents(trans, inode,
- new_key.offset,
+ ret = btrfs_drop_extents(trans, root, inode,
+ drop_start,
new_key.offset + datal,
- &hint_byte, 1);
- BUG_ON(ret);
+ 1);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans,
+ root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
leaf = path->nodes[0];
slot = path->slots[0];
@@ -1953,25 +3417,58 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datao);
btrfs_set_file_extent_num_bytes(leaf, extent,
datal);
+
+ /*
+ * We need to look up the roots that point at
+ * this bytenr and see if the new root does. If
+ * it does not we need to make sure we update
+ * quotas appropriately.
+ */
+ if (disko && root != BTRFS_I(src)->root &&
+ disko != last_disko) {
+ no_quota = check_ref(trans, root,
+ disko);
+ if (no_quota < 0) {
+ btrfs_abort_transaction(trans,
+ root,
+ ret);
+ btrfs_end_transaction(trans,
+ root);
+ ret = no_quota;
+ goto out;
+ }
+ }
+
if (disko) {
inode_add_bytes(inode, datal);
ret = btrfs_inc_extent_ref(trans, root,
disko, diskl, 0,
root->root_key.objectid,
- inode->i_ino,
- new_key.offset - datao);
- BUG_ON(ret);
+ btrfs_ino(inode),
+ new_key.offset - datao,
+ no_quota);
+ if (ret) {
+ btrfs_abort_transaction(trans,
+ root,
+ ret);
+ btrfs_end_transaction(trans,
+ root);
+ goto out;
+
+ }
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
u64 trim = 0;
+ u64 aligned_end = 0;
+
if (off > key.offset) {
skip = off - key.offset;
new_key.offset += skip;
}
- if (key.offset + datal > off+len)
- trim = key.offset + datal - (off+len);
+ if (key.offset + datal > off + len)
+ trim = key.offset + datal - (off + len);
if (comp && (skip || trim)) {
ret = -EINVAL;
@@ -1981,15 +3478,28 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
size -= skip + trim;
datal -= skip + trim;
- ret = btrfs_drop_extents(trans, inode,
- new_key.offset,
- new_key.offset + datal,
- &hint_byte, 1);
- BUG_ON(ret);
+ aligned_end = ALIGN(new_key.offset + datal,
+ root->sectorsize);
+ ret = btrfs_drop_extents(trans, root, inode,
+ drop_start,
+ aligned_end,
+ 1);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans,
+ root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
if (skip) {
u32 start =
@@ -2006,44 +3516,224 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
inode_add_bytes(inode, datal);
}
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ /* If we have an implicit hole (NO_HOLES feature). */
+ if (drop_start < new_key.offset)
+ clone_update_extent_map(inode, trans,
+ NULL, drop_start,
+ new_key.offset - drop_start);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ clone_update_extent_map(inode, trans, path, 0, 0);
- /*
- * we round up to the block size at eof when
- * determining which extents to clone above,
- * but shouldn't round up the file size
- */
- endoff = new_key.offset + datal;
- if (endoff > destoff+olen)
- endoff = destoff+olen;
- if (endoff > inode->i_size)
- btrfs_i_size_write(inode, endoff);
-
- BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
- btrfs_end_transaction(trans, root);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ last_dest_end = new_key.offset + datal;
+ ret = clone_finish_inode_update(trans, inode,
+ last_dest_end,
+ destoff, olen);
+ if (ret)
+ goto out;
+ if (new_key.offset + datal >= destoff + len)
+ break;
}
-next:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.offset++;
}
ret = 0;
+
+ if (last_dest_end < destoff + len) {
+ /*
+ * We have an implicit hole (NO_HOLES feature is enabled) that
+ * fully or partially overlaps our cloning range at its end.
+ */
+ btrfs_release_path(path);
+
+ /*
+ * 1 - remove extent(s)
+ * 1 - inode update
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_drop_extents(trans, root, inode,
+ last_dest_end, destoff + len, 1);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ clone_update_extent_map(inode, trans, NULL, last_dest_end,
+ destoff + len - last_dest_end);
+ ret = clone_finish_inode_update(trans, inode, destoff + len,
+ destoff, olen);
+ }
+
out:
- btrfs_release_path(root, path);
- unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
-out_unlock:
- mutex_unlock(&src->i_mutex);
- mutex_unlock(&inode->i_mutex);
- vfree(buf);
btrfs_free_path(path);
+ vfree(buf);
+ return ret;
+}
+
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct fd src_file;
+ struct inode *src;
+ int ret;
+ u64 len = olen;
+ u64 bs = root->fs_info->sb->s_blocksize;
+ int same_inode = 0;
+
+ /*
+ * TODO:
+ * - split compressed inline extents. annoying: we need to
+ * decompress into destination's address_space (the file offset
+ * may change, so source mapping won't do), then recompress (or
+ * otherwise reinsert) a subrange.
+ *
+ * - split destination inode's inline extents. The inline extents can
+ * be either compressed or non-compressed.
+ */
+
+ /* the destination must be opened for writing */
+ if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
+ return -EINVAL;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
+ ret = -EBADF;
+ goto out_drop_write;
+ }
+
+ ret = -EXDEV;
+ if (src_file.file->f_path.mnt != file->f_path.mnt)
+ goto out_fput;
+
+ src = file_inode(src_file.file);
+
+ ret = -EINVAL;
+ if (src == inode)
+ same_inode = 1;
+
+ /* the src must be open for reading */
+ if (!(src_file.file->f_mode & FMODE_READ))
+ goto out_fput;
+
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+ goto out_fput;
+
+ ret = -EISDIR;
+ if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
+ goto out_fput;
+
+ ret = -EXDEV;
+ if (src->i_sb != inode->i_sb)
+ goto out_fput;
+
+ if (!same_inode) {
+ if (inode < src) {
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ }
+ } else {
+ mutex_lock(&src->i_mutex);
+ }
+
+ /* determine range to clone */
+ ret = -EINVAL;
+ if (off + len > src->i_size || off + len < off)
+ goto out_unlock;
+ if (len == 0)
+ olen = len = src->i_size - off;
+ /* if we extend to eof, continue to block boundary */
+ if (off + len == src->i_size)
+ len = ALIGN(src->i_size, bs) - off;
+
+ /* verify the end result is block aligned */
+ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
+ !IS_ALIGNED(destoff, bs))
+ goto out_unlock;
+
+ /* verify if ranges are overlapped within the same file */
+ if (same_inode) {
+ if (destoff + len > off && destoff < off + len)
+ goto out_unlock;
+ }
+
+ if (destoff > inode->i_size) {
+ ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+ if (ret)
+ goto out_unlock;
+ }
+
+ /*
+ * Lock the target range too. Right after we replace the file extent
+ * items in the fs tree (which now point to the cloned data), we might
+ * have a worker replace them with extent items relative to a write
+ * operation that was issued before this clone operation (i.e. confront
+ * with inode.c:btrfs_finish_ordered_io).
+ */
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
+
+ lock_extent_range(src, lock_start, lock_len);
+ } else {
+ lock_extent_range(src, off, len);
+ lock_extent_range(inode, destoff, len);
+ }
+
+ ret = btrfs_clone(src, inode, off, olen, len, destoff);
+
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_end = max_t(u64, off, destoff) + len - 1;
+
+ unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
+ } else {
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
+ destoff + len - 1);
+ }
+ /*
+ * Truncate page cache pages so that future reads will see the cloned
+ * data immediately and not the previous data.
+ */
+ truncate_inode_pages_range(&inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len) - 1);
+out_unlock:
+ if (!same_inode) {
+ if (inode < src) {
+ mutex_unlock(&src->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+ } else {
+ mutex_unlock(&inode->i_mutex);
+ mutex_unlock(&src->i_mutex);
+ }
+ } else {
+ mutex_unlock(&src->i_mutex);
+ }
out_fput:
- fput(src_file);
+ fdput(src_file);
out_drop_write:
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
return ret;
}
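
[editor note] The clone path reworked above continues to be reached through the existing BTRFS_IOC_CLONE / BTRFS_IOC_CLONE_RANGE ioctls. A sketch of the range variant, assuming struct btrfs_ioctl_clone_range_args and the ioctl number come from linux/btrfs.h; names are illustrative and this is not part of the patch:

    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    /* Sketch: reflink 'len' bytes from offset src_off of src_fd into
     * dst_fd at offset dst_off.  Offsets and length must be block
     * aligned (len may be 0 to clone up to EOF), matching the alignment
     * checks performed in btrfs_ioctl_clone() above. */
    static int clone_range(int src_fd, int dst_fd,
                           __u64 src_off, __u64 len, __u64 dst_off)
    {
            struct btrfs_ioctl_clone_range_args args = {
                    .src_fd = src_fd,
                    .src_offset = src_off,
                    .src_length = len,
                    .dest_offset = dst_off,
            };

            return ioctl(dst_fd, BTRFS_IOC_CLONE_RANGE, &args);
    }
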
@@ -2065,7 +3755,7 @@ static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
*/
static long btrfs_ioctl_trans_start(struct file *file)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
int ret;
@@ -2082,16 +3772,14 @@ static long btrfs_ioctl_trans_start(struct file *file)
if (btrfs_root_readonly(root))
goto out;
- ret = mnt_want_write(file->f_path.mnt);
+ ret = mnt_want_write_file(file);
if (ret)
goto out;
- mutex_lock(&root->fs_info->trans_mutex);
- root->fs_info->open_ioctl_trans++;
- mutex_unlock(&root->fs_info->trans_mutex);
+ atomic_inc(&root->fs_info->open_ioctl_trans);
ret = -ENOMEM;
- trans = btrfs_start_ioctl_transaction(root, 0);
+ trans = btrfs_start_ioctl_transaction(root);
if (IS_ERR(trans))
goto out_drop;
@@ -2099,17 +3787,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
return 0;
out_drop:
- mutex_lock(&root->fs_info->trans_mutex);
- root->fs_info->open_ioctl_trans--;
- mutex_unlock(&root->fs_info->trans_mutex);
- mnt_drop_write(file->f_path.mnt);
+ atomic_dec(&root->fs_info->open_ioctl_trans);
+ mnt_drop_write_file(file);
out:
return ret;
}
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
@@ -2117,51 +3803,59 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
struct btrfs_path *path;
struct btrfs_key location;
struct btrfs_disk_key disk_key;
- struct btrfs_super_block *disk_super;
- u64 features;
u64 objectid = 0;
u64 dir_id;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (copy_from_user(&objectid, argp, sizeof(objectid)))
- return -EFAULT;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+ ret = -EFAULT;
+ goto out;
+ }
if (!objectid)
- objectid = root->root_key.objectid;
+ objectid = BTRFS_FS_TREE_OBJECTID;
location.objectid = objectid;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
- if (IS_ERR(new_root))
- return PTR_ERR(new_root);
-
- if (btrfs_root_refs(&new_root->root_item) == 0)
- return -ENOENT;
+ if (IS_ERR(new_root)) {
+ ret = PTR_ERR(new_root);
+ goto out;
+ }
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
path->leave_spinning = 1;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_free_path(path);
- return PTR_ERR(trans);
+ ret = PTR_ERR(trans);
+ goto out;
}
- dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+ dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
dir_id, "default", 7, 1);
if (IS_ERR_OR_NULL(di)) {
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
- printk(KERN_ERR "Umm, you don't have the default dir item, "
- "this isn't going to work\n");
- return -ENOENT;
+ btrfs_err(new_root->fs_info, "Umm, you don't have the default dir "
+ "item, this isn't going to work");
+ ret = -ENOENT;
+ goto out;
}
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2169,19 +3863,15 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- disk_super = &root->fs_info->super_copy;
- features = btrfs_super_incompat_flags(disk_super);
- if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
- features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
- btrfs_set_super_incompat_flags(disk_super, features);
- }
+ btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
btrfs_end_transaction(trans, root);
-
- return 0;
+out:
+ mnt_drop_write_file(file);
+ return ret;
}
-static void get_block_group_info(struct list_head *groups_list,
- struct btrfs_ioctl_space_info *space)
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space)
{
struct btrfs_block_group_cache *block_group;
@@ -2196,13 +3886,13 @@ static void get_block_group_info(struct list_head *groups_list,
}
}
-long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_space_args space_args;
struct btrfs_ioctl_space_info space;
struct btrfs_ioctl_space_info *dest;
struct btrfs_ioctl_space_info *dest_orig;
- struct btrfs_ioctl_space_info *user_dest;
+ struct btrfs_ioctl_space_info __user *user_dest;
struct btrfs_space_info *info;
u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
BTRFS_BLOCK_GROUP_SYSTEM,
@@ -2244,6 +3934,11 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
+ /*
+ * Global block reserve, exported as a space_info
+ */
+ slot_count++;
+
/* space_slots == 0 means they are asking for a count */
if (space_args.space_slots == 0) {
space_args.total_spaces = slot_count;
@@ -2289,8 +3984,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
down_read(&info->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
if (!list_empty(&info->block_groups[c])) {
- get_block_group_info(&info->block_groups[c],
- &space);
+ btrfs_get_block_group_info(
+ &info->block_groups[c], &space);
memcpy(dest, &space, sizeof(space));
dest++;
space_args.total_spaces++;
@@ -2302,7 +3997,22 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
- user_dest = (struct btrfs_ioctl_space_info *)
+ /*
+ * Add global block reserve
+ */
+ if (slot_count) {
+ struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
+
+ spin_lock(&block_rsv->lock);
+ space.total_bytes = block_rsv->size;
+ space.used_bytes = block_rsv->size - block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+ space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
+ memcpy(dest, &space, sizeof(space));
+ space_args.total_spaces++;
+ }
+
+ user_dest = (struct btrfs_ioctl_space_info __user *)
(arg + sizeof(struct btrfs_ioctl_space_args));
if (copy_to_user(user_dest, dest_orig, alloc_size))
@@ -2324,7 +4034,7 @@ out:
*/
long btrfs_ioctl_trans_end(struct file *file)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -2335,35 +4045,44 @@ long btrfs_ioctl_trans_end(struct file *file)
btrfs_end_transaction(trans, root);
- mutex_lock(&root->fs_info->trans_mutex);
- root->fs_info->open_ioctl_trans--;
- mutex_unlock(&root->fs_info->trans_mutex);
+ atomic_dec(&root->fs_info->open_ioctl_trans);
- mnt_drop_write(file->f_path.mnt);
+ mnt_drop_write_file(file);
return 0;
}
-static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+ void __user *argp)
{
- struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
struct btrfs_trans_handle *trans;
u64 transid;
+ int ret;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- transid = trans->transid;
- btrfs_commit_transaction_async(trans, root, 0);
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ return PTR_ERR(trans);
+ /* No running transaction, don't bother */
+ transid = root->fs_info->last_trans_committed;
+ goto out;
+ }
+ transid = trans->transid;
+ ret = btrfs_commit_transaction_async(trans, root, 0);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+out:
if (argp)
if (copy_to_user(argp, &transid, sizeof(transid)))
return -EFAULT;
return 0;
}
-static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+ void __user *argp)
{
- struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
u64 transid;
if (argp) {
@@ -2375,10 +4094,1151 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
return btrfs_wait_for_commit(root, transid);
}
+static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_scrub_args *sa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+ &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+ 0);
+
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ if (!(sa->flags & BTRFS_SCRUB_READONLY))
+ mnt_drop_write_file(file);
+out:
+ kfree(sa);
+ return ret;
+}
+
+static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btrfs_scrub_cancel(root->fs_info);
+}
+
+static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_ioctl_scrub_args *sa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ ret = btrfs_scrub_progress(root, sa->devid, &sa->progress);
+
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ kfree(sa);
+ return ret;
+}
+
+static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_ioctl_get_dev_stats *sa;
+ int ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
+ kfree(sa);
+ return -EPERM;
+ }
+
+ ret = btrfs_get_dev_stats(root, sa);
+
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ kfree(sa);
+ return ret;
+}
+
+static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_dev_replace_args *p;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ p = memdup_user(arg, sizeof(*p));
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ switch (p->cmd) {
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+ if (root->fs_info->sb->s_flags & MS_RDONLY) {
+ ret = -EROFS;
+ goto out;
+ }
+ if (atomic_xchg(
+ &root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ } else {
+ ret = btrfs_dev_replace_start(root, p);
+ atomic_set(
+ &root->fs_info->mutually_exclusive_operation_running,
+ 0);
+ }
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+ btrfs_dev_replace_status(root->fs_info, p);
+ ret = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+ ret = btrfs_dev_replace_cancel(root->fs_info, p);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_to_user(arg, p, sizeof(*p)))
+ ret = -EFAULT;
+out:
+ kfree(p);
+ return ret;
+}
+
+static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
+{
+ int ret = 0;
+ int i;
+ u64 rel_ptr;
+ int size;
+ struct btrfs_ioctl_ino_path_args *ipa = NULL;
+ struct inode_fs_paths *ipath = NULL;
+ struct btrfs_path *path;
+
+ if (!capable(CAP_DAC_READ_SEARCH))
+ return -EPERM;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ipa = memdup_user(arg, sizeof(*ipa));
+ if (IS_ERR(ipa)) {
+ ret = PTR_ERR(ipa);
+ ipa = NULL;
+ goto out;
+ }
+
+ size = min_t(u32, ipa->size, 4096);
+ ipath = init_ipath(size, root, path);
+ if (IS_ERR(ipath)) {
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ goto out;
+ }
+
+ ret = paths_from_inode(ipa->inum, ipath);
+ if (ret < 0)
+ goto out;
+
+ for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
+ rel_ptr = ipath->fspath->val[i] -
+ (u64)(unsigned long)ipath->fspath->val;
+ ipath->fspath->val[i] = rel_ptr;
+ }
+
+ ret = copy_to_user((void *)(unsigned long)ipa->fspath,
+ (void *)(unsigned long)ipath->fspath, size);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+out:
+ btrfs_free_path(path);
+ free_ipath(ipath);
+ kfree(ipa);
+
+ return ret;
+}
+
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
+static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
+ void __user *arg)
+{
+ int ret = 0;
+ int size;
+ struct btrfs_ioctl_logical_ino_args *loi;
+ struct btrfs_data_container *inodes = NULL;
+ struct btrfs_path *path = NULL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ loi = memdup_user(arg, sizeof(*loi));
+ if (IS_ERR(loi)) {
+ ret = PTR_ERR(loi);
+ loi = NULL;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ size = min_t(u32, loi->size, 64 * 1024);
+ inodes = init_data_container(size);
+ if (IS_ERR(inodes)) {
+ ret = PTR_ERR(inodes);
+ inodes = NULL;
+ goto out;
+ }
+
+ ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+ build_ino_list, inodes);
+ if (ret == -EINVAL)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ ret = copy_to_user((void *)(unsigned long)loi->inodes,
+ (void *)(unsigned long)inodes, size);
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ btrfs_free_path(path);
+ vfree(inodes);
+ kfree(loi);
+
+ return ret;
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ bargs->flags = bctl->flags;
+
+ if (atomic_read(&fs_info->balance_running))
+ bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+ if (atomic_read(&fs_info->balance_pause_req))
+ bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+ if (atomic_read(&fs_info->balance_cancel_req))
+ bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+ memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+ memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+ memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+ if (lock) {
+ spin_lock(&fs_info->balance_lock);
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ spin_unlock(&fs_info->balance_lock);
+ } else {
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ }
+}
+
+static long btrfs_ioctl_balance(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_balance_control *bctl;
+ bool need_unlock; /* for mut. excl. ops lock */
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+again:
+ if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+ need_unlock = true;
+ goto locked;
+ }
+
+ /*
+ * mut. excl. ops lock is locked. Three possibilities:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
+ mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* this is either (2) or (3) */
+ if (!atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ if (!mutex_trylock(&fs_info->volume_mutex))
+ goto again;
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !atomic_read(&fs_info->balance_running)) {
+ /* this is (3) */
+ need_unlock = false;
+ goto locked;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ goto again;
+ } else {
+ /* this is (2) */
+ mutex_unlock(&fs_info->balance_mutex);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ } else {
+ /* this is (1) */
+ mutex_unlock(&fs_info->balance_mutex);
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out;
+ }
+
+locked:
+ BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
+
+ if (arg) {
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ goto out_unlock;
+ }
+
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out_bargs;
+ }
+
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+
+ goto do_balance;
+ }
+ } else {
+ bargs = NULL;
+ }
+
+ if (fs_info->balance_ctl) {
+ ret = -EINPROGRESS;
+ goto out_bargs;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out_bargs;
+ }
+
+ bctl->fs_info = fs_info;
+ if (arg) {
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+ bctl->flags = bargs->flags;
+ } else {
+ /* balance everything - no filters */
+ bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ }
+
+do_balance:
+ /*
+ * Ownership of bctl and mutually_exclusive_operation_running
+ * goes to btrfs_balance. bctl is freed in __cancel_balance,
+ * or, if restriper was paused all the way until unmount, in
+ * free_fs_info. mutually_exclusive_operation_running is
+ * cleared in __cancel_balance.
+ */
+ need_unlock = false;
+
+ ret = btrfs_balance(bctl, bargs);
+
+ if (arg) {
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+ }
+
+out_bargs:
+ kfree(bargs);
+out_unlock:
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (need_unlock)
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+out:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case BTRFS_BALANCE_CTL_PAUSE:
+ return btrfs_pause_balance(root->fs_info);
+ case BTRFS_BALANCE_CTL_CANCEL:
+ return btrfs_cancel_balance(root->fs_info);
+ }
+
+ return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ if (!bargs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ update_ioctl_balance_args(fs_info, 1, bargs);
+
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_quota_ctl_args *sa;
+ struct btrfs_trans_handle *trans = NULL;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ down_write(&root->fs_info->subvol_sem);
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ switch (sa->cmd) {
+ case BTRFS_QUOTA_CTL_ENABLE:
+ ret = btrfs_quota_enable(trans, root->fs_info);
+ break;
+ case BTRFS_QUOTA_CTL_DISABLE:
+ ret = btrfs_quota_disable(trans, root->fs_info);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
+ if (err && !ret)
+ ret = err;
+out:
+ kfree(sa);
+ up_write(&root->fs_info->subvol_sem);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_qgroup_assign_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /* FIXME: check if the IDs really exist */
+ if (sa->assign) {
+ ret = btrfs_add_qgroup_relation(trans, root->fs_info,
+ sa->src, sa->dst);
+ } else {
+ ret = btrfs_del_qgroup_relation(trans, root->fs_info,
+ sa->src, sa->dst);
+ }
+
+ err = btrfs_end_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_qgroup_create_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ if (!sa->qgroupid) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /* FIXME: check if the IDs really exist */
+ if (sa->create) {
+ ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
+ NULL);
+ } else {
+ ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
+ }
+
+ err = btrfs_end_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_qgroup_limit_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+ u64 qgroupid;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ qgroupid = sa->qgroupid;
+ if (!qgroupid) {
+ /* take the current subvol as qgroup */
+ qgroupid = root->root_key.objectid;
+ }
+
+ /* FIXME: check if the IDs really exist */
+ ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
+
+ err = btrfs_end_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_quota_rescan_args *qsa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ qsa = memdup_user(arg, sizeof(*qsa));
+ if (IS_ERR(qsa)) {
+ ret = PTR_ERR(qsa);
+ goto drop_write;
+ }
+
+ if (qsa->flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_qgroup_rescan(root->fs_info);
+
+out:
+ kfree(qsa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_quota_rescan_args *qsa;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+ if (!qsa)
+ return -ENOMEM;
+
+ if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ qsa->flags = 1;
+ qsa->progress = root->fs_info->qgroup_rescan_progress.objectid;
+ }
+
+ if (copy_to_user(arg, qsa, sizeof(*qsa)))
+ ret = -EFAULT;
+
+ kfree(qsa);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
+static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct btrfs_ioctl_received_subvol_args *sa)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root_item *root_item = &root->root_item;
+ struct btrfs_trans_handle *trans;
+ struct timespec ct = CURRENT_TIME;
+ int ret = 0;
+ int received_uuid_changed;
+
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret < 0)
+ return ret;
+
+ down_write(&root->fs_info->subvol_sem);
+
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * 1 - root item
+ * 2 - uuid items (received uuid + subvol uuid)
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ sa->rtransid = trans->transid;
+ sa->rtime.sec = ct.tv_sec;
+ sa->rtime.nsec = ct.tv_nsec;
+
+ received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
+ BTRFS_UUID_SIZE);
+ if (received_uuid_changed &&
+ !btrfs_is_empty_uuid(root_item->received_uuid))
+ btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+ root_item->received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ root->root_key.objectid);
+ memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
+ btrfs_set_root_stransid(root_item, sa->stransid);
+ btrfs_set_root_rtransid(root_item, sa->rtransid);
+ btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
+ btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
+ btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
+ btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ if (ret < 0) {
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
+ ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+ sa->uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ root->root_key.objectid);
+ if (ret < 0 && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+out:
+ up_write(&root->fs_info->subvol_sem);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+#ifdef CONFIG_64BIT
+static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+ struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+ int ret = 0;
+
+ args32 = memdup_user(arg, sizeof(*args32));
+ if (IS_ERR(args32)) {
+ ret = PTR_ERR(args32);
+ args32 = NULL;
+ goto out;
+ }
+
+ args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ if (!args64) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+ args64->stransid = args32->stransid;
+ args64->rtransid = args32->rtransid;
+ args64->stime.sec = args32->stime.sec;
+ args64->stime.nsec = args32->stime.nsec;
+ args64->rtime.sec = args32->rtime.sec;
+ args64->rtime.nsec = args32->rtime.nsec;
+ args64->flags = args32->flags;
+
+ ret = _btrfs_ioctl_set_received_subvol(file, args64);
+ if (ret)
+ goto out;
+
+ memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+ args32->stransid = args64->stransid;
+ args32->rtransid = args64->rtransid;
+ args32->stime.sec = args64->stime.sec;
+ args32->stime.nsec = args64->stime.nsec;
+ args32->rtime.sec = args64->rtime.sec;
+ args32->rtime.nsec = args64->rtime.nsec;
+ args32->flags = args64->flags;
+
+ ret = copy_to_user(arg, args32, sizeof(*args32));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(args32);
+ kfree(args64);
+ return ret;
+}
+#endif
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args *sa = NULL;
+ int ret = 0;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ sa = NULL;
+ goto out;
+ }
+
+ ret = _btrfs_ioctl_set_received_subvol(file, sa);
+
+ if (ret)
+ goto out;
+
+ ret = copy_to_user(arg, sa, sizeof(*sa));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(sa);
+ return ret;
+}
+
+static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ size_t len;
+ int ret;
+ char label[BTRFS_LABEL_SIZE];
+
+ spin_lock(&root->fs_info->super_lock);
+ memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE);
+ spin_unlock(&root->fs_info->super_lock);
+
+ len = strnlen(label, BTRFS_LABEL_SIZE);
+
+ if (len == BTRFS_LABEL_SIZE) {
+ btrfs_warn(root->fs_info,
+ "label is too long, return the first %zu bytes", --len);
+ }
+
+ ret = copy_to_user(arg, label, len);
+
+ return ret ? -EFAULT : 0;
+}
+
+static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct btrfs_trans_handle *trans;
+ char label[BTRFS_LABEL_SIZE];
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, arg, sizeof(label)))
+ return -EFAULT;
+
+ if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
+ btrfs_err(root->fs_info, "unable to set label with more than %d bytes",
+ BTRFS_LABEL_SIZE - 1);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ spin_lock(&root->fs_info->super_lock);
+ strcpy(super_block->label, label);
+ spin_unlock(&root->fs_info->super_lock);
+ ret = btrfs_commit_transaction(trans, root);
+
+out_unlock:
+ mnt_drop_write_file(file);
+ return ret;
+}
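
Both label handlers work on a fixed BTRFS_LABEL_SIZE buffer: the getter snapshots the super copy under super_lock, and the setter insists on room for a terminating NUL. A small userspace sketch, assuming the two ioctl numbers and BTRFS_LABEL_SIZE are exported by the btrfs uapi header; the function name is illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* illustrative sketch, not part of the patch above */
static int print_and_set_label(const char *mnt, const char *new_label)
{
        char label[BTRFS_LABEL_SIZE] = "";
        int ret, fd = open(mnt, O_RDONLY);

        if (fd < 0)
                return -1;

        if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) == 0)
                printf("current label: %s\n", label);

        /* leave the last byte for the NUL, matching the kernel's strnlen() check */
        memset(label, 0, sizeof(label));
        strncpy(label, new_label, BTRFS_LABEL_SIZE - 1);
        ret = ioctl(fd, BTRFS_IOC_SET_FSLABEL, label);
        close(fd);
        return ret;
}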
+
+#define INIT_FEATURE_FLAGS(suffix) \
+ { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
+ .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
+ .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
+
+static int btrfs_ioctl_get_supported_features(struct file *file,
+ void __user *arg)
+{
+ static struct btrfs_ioctl_feature_flags features[3] = {
+ INIT_FEATURE_FLAGS(SUPP),
+ INIT_FEATURE_FLAGS(SAFE_SET),
+ INIT_FEATURE_FLAGS(SAFE_CLEAR)
+ };
+
+ if (copy_to_user(arg, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct btrfs_ioctl_feature_flags features;
+
+ features.compat_flags = btrfs_super_compat_flags(super_block);
+ features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
+ features.incompat_flags = btrfs_super_incompat_flags(super_block);
+
+ if (copy_to_user(arg, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
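
The three-element array filled in by GET_SUPPORTED_FEATURES mirrors the INIT_FEATURE_FLAGS() table above: element 0 is what this kernel understands, element 1 what may be turned on while mounted, element 2 what may be turned off. A short userspace sketch, with the header path and function name as illustrative assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* illustrative sketch, not part of the patch above */
static void dump_feature_bits(const char *mnt)
{
        struct btrfs_ioctl_feature_flags supp[3];  /* SUPP, SAFE_SET, SAFE_CLEAR */
        struct btrfs_ioctl_feature_flags cur;
        int fd = open(mnt, O_RDONLY);

        if (fd < 0)
                return;

        if (ioctl(fd, BTRFS_IOC_GET_SUPPORTED_FEATURES, supp) == 0)
                printf("kernel supports incompat bits 0x%llx\n",
                       (unsigned long long)supp[0].incompat_flags);

        if (ioctl(fd, BTRFS_IOC_GET_FEATURES, &cur) == 0)
                printf("filesystem uses incompat bits 0x%llx\n",
                       (unsigned long long)cur.incompat_flags);
        close(fd);
}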
+
+static int check_feature_bits(struct btrfs_root *root,
+ enum btrfs_feature_set set,
+ u64 change_mask, u64 flags, u64 supported_flags,
+ u64 safe_set, u64 safe_clear)
+{
+ const char *type = btrfs_feature_set_names[set];
+ char *names;
+ u64 disallowed, unsupported;
+ u64 set_mask = flags & change_mask;
+ u64 clear_mask = ~flags & change_mask;
+
+ unsupported = set_mask & ~supported_flags;
+ if (unsupported) {
+ names = btrfs_printable_features(set, unsupported);
+ if (names) {
+ btrfs_warn(root->fs_info,
+ "this kernel does not support the %s feature bit%s",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(root->fs_info,
+ "this kernel does not support %s bits 0x%llx",
+ type, unsupported);
+ return -EOPNOTSUPP;
+ }
+
+ disallowed = set_mask & ~safe_set;
+ if (disallowed) {
+ names = btrfs_printable_features(set, disallowed);
+ if (names) {
+ btrfs_warn(root->fs_info,
+ "can't set the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(root->fs_info,
+ "can't set %s bits 0x%llx while mounted",
+ type, disallowed);
+ return -EPERM;
+ }
+
+ disallowed = clear_mask & ~safe_clear;
+ if (disallowed) {
+ names = btrfs_printable_features(set, disallowed);
+ if (names) {
+ btrfs_warn(root->fs_info,
+ "can't clear the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(root->fs_info,
+ "can't clear %s bits 0x%llx while mounted",
+ type, disallowed);
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+#define check_feature(root, change_mask, flags, mask_base) \
+check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \
+ BTRFS_FEATURE_ ## mask_base ## _SUPP, \
+ BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
+ BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
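
Expanded once for reference, the token pasting resolves as shown below (derived directly from the two definitions above):

/*
 * check_feature(root, change, flags, COMPAT) becomes:
 *
 *      check_feature_bits(root, FEAT_COMPAT, change, flags,
 *                         BTRFS_FEATURE_COMPAT_SUPP,
 *                         BTRFS_FEATURE_COMPAT_SAFE_SET,
 *                         BTRFS_FEATURE_COMPAT_SAFE_CLEAR);
 *
 * so the mask_base token picks the supported/safe-set/safe-clear
 * constants for one feature set in a single place.
 */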
+
+static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct btrfs_ioctl_feature_flags flags[2];
+ struct btrfs_trans_handle *trans;
+ u64 newflags;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ /* Nothing to do */
+ if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
+ !flags[0].incompat_flags)
+ return 0;
+
+ ret = check_feature(root, flags[0].compat_flags,
+ flags[1].compat_flags, COMPAT);
+ if (ret)
+ return ret;
+
+ ret = check_feature(root, flags[0].compat_ro_flags,
+ flags[1].compat_ro_flags, COMPAT_RO);
+ if (ret)
+ return ret;
+
+ ret = check_feature(root, flags[0].incompat_flags,
+ flags[1].incompat_flags, INCOMPAT);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ spin_lock(&root->fs_info->super_lock);
+ newflags = btrfs_super_compat_flags(super_block);
+ newflags |= flags[0].compat_flags & flags[1].compat_flags;
+ newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
+ btrfs_set_super_compat_flags(super_block, newflags);
+
+ newflags = btrfs_super_compat_ro_flags(super_block);
+ newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
+ newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
+ btrfs_set_super_compat_ro_flags(super_block, newflags);
+
+ newflags = btrfs_super_incompat_flags(super_block);
+ newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
+ newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
+ btrfs_set_super_incompat_flags(super_block, newflags);
+ spin_unlock(&root->fs_info->super_lock);
+
+ return btrfs_commit_transaction(trans, root);
+}
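
In the update above, flags[0] acts as the change mask and flags[1] supplies the desired values for the masked bits; a worked example of the bit arithmetic with illustrative values:

/*
 * With flags[0].incompat_flags = 0x5 (change bits 0 and 2),
 * flags[1].incompat_flags = 0x1 (bit 0 -> set, bit 2 -> clear) and
 * current incompat bits 0x6:
 *
 *      newflags  = 0x6 |  (0x5 &  0x1)  = 0x7
 *      newflags &=       ~(0x5 & ~0x1)  = 0x7 & ~0x4 = 0x3
 *
 * Bit 0 is set, bit 2 is cleared, bit 1 is left alone.
 */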
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
void __user *argp = (void __user *)arg;
switch (cmd) {
@@ -2388,12 +5248,16 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_setflags(file, argp);
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
+ case FITRIM:
+ return btrfs_ioctl_fitrim(file, argp);
case BTRFS_IOC_SNAP_CREATE:
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SNAP_CREATE_V2:
return btrfs_ioctl_snap_create_v2(file, argp, 0);
case BTRFS_IOC_SUBVOL_CREATE:
return btrfs_ioctl_snap_create(file, argp, 1);
+ case BTRFS_IOC_SUBVOL_CREATE_V2:
+ return btrfs_ioctl_snap_create_v2(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp);
case BTRFS_IOC_SUBVOL_GETFLAGS:
@@ -2407,13 +5271,17 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEFRAG_RANGE:
return btrfs_ioctl_defrag(file, argp);
case BTRFS_IOC_RESIZE:
- return btrfs_ioctl_resize(root, argp);
+ return btrfs_ioctl_resize(file, argp);
case BTRFS_IOC_ADD_DEV:
return btrfs_ioctl_add_dev(root, argp);
case BTRFS_IOC_RM_DEV:
- return btrfs_ioctl_rm_dev(root, argp);
+ return btrfs_ioctl_rm_dev(file, argp);
+ case BTRFS_IOC_FS_INFO:
+ return btrfs_ioctl_fs_info(root, argp);
+ case BTRFS_IOC_DEV_INFO:
+ return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
- return btrfs_balance(root->fs_info->dev_root);
+ return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_CLONE:
return btrfs_ioctl_clone(file, arg, 0, 0, 0);
case BTRFS_IOC_CLONE_RANGE:
@@ -2424,17 +5292,79 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(file, argp);
+ case BTRFS_IOC_TREE_SEARCH_V2:
+ return btrfs_ioctl_tree_search_v2(file, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(file, argp);
+ case BTRFS_IOC_INO_PATHS:
+ return btrfs_ioctl_ino_to_path(root, argp);
+ case BTRFS_IOC_LOGICAL_INO:
+ return btrfs_ioctl_logical_to_ino(root, argp);
case BTRFS_IOC_SPACE_INFO:
return btrfs_ioctl_space_info(root, argp);
- case BTRFS_IOC_SYNC:
- btrfs_sync_fs(file->f_dentry->d_sb, 1);
- return 0;
+ case BTRFS_IOC_SYNC: {
+ int ret;
+
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
+ if (ret)
+ return ret;
+ ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return ret;
+ }
case BTRFS_IOC_START_SYNC:
- return btrfs_ioctl_start_sync(file, argp);
+ return btrfs_ioctl_start_sync(root, argp);
case BTRFS_IOC_WAIT_SYNC:
- return btrfs_ioctl_wait_sync(file, argp);
+ return btrfs_ioctl_wait_sync(root, argp);
+ case BTRFS_IOC_SCRUB:
+ return btrfs_ioctl_scrub(file, argp);
+ case BTRFS_IOC_SCRUB_CANCEL:
+ return btrfs_ioctl_scrub_cancel(root, argp);
+ case BTRFS_IOC_SCRUB_PROGRESS:
+ return btrfs_ioctl_scrub_progress(root, argp);
+ case BTRFS_IOC_BALANCE_V2:
+ return btrfs_ioctl_balance(file, argp);
+ case BTRFS_IOC_BALANCE_CTL:
+ return btrfs_ioctl_balance_ctl(root, arg);
+ case BTRFS_IOC_BALANCE_PROGRESS:
+ return btrfs_ioctl_balance_progress(root, argp);
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL:
+ return btrfs_ioctl_set_received_subvol(file, argp);
+#ifdef CONFIG_64BIT
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+ return btrfs_ioctl_set_received_subvol_32(file, argp);
+#endif
+ case BTRFS_IOC_SEND:
+ return btrfs_ioctl_send(file, argp);
+ case BTRFS_IOC_GET_DEV_STATS:
+ return btrfs_ioctl_get_dev_stats(root, argp);
+ case BTRFS_IOC_QUOTA_CTL:
+ return btrfs_ioctl_quota_ctl(file, argp);
+ case BTRFS_IOC_QGROUP_ASSIGN:
+ return btrfs_ioctl_qgroup_assign(file, argp);
+ case BTRFS_IOC_QGROUP_CREATE:
+ return btrfs_ioctl_qgroup_create(file, argp);
+ case BTRFS_IOC_QGROUP_LIMIT:
+ return btrfs_ioctl_qgroup_limit(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN:
+ return btrfs_ioctl_quota_rescan(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_STATUS:
+ return btrfs_ioctl_quota_rescan_status(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+ return btrfs_ioctl_quota_rescan_wait(file, argp);
+ case BTRFS_IOC_DEV_REPLACE:
+ return btrfs_ioctl_dev_replace(root, argp);
+ case BTRFS_IOC_GET_FSLABEL:
+ return btrfs_ioctl_get_fslabel(file, argp);
+ case BTRFS_IOC_SET_FSLABEL:
+ return btrfs_ioctl_set_fslabel(file, argp);
+ case BTRFS_IOC_FILE_EXTENT_SAME:
+ return btrfs_ioctl_file_extent_same(file, argp);
+ case BTRFS_IOC_GET_SUPPORTED_FEATURES:
+ return btrfs_ioctl_get_supported_features(file, argp);
+ case BTRFS_IOC_GET_FEATURES:
+ return btrfs_ioctl_get_features(file, argp);
+ case BTRFS_IOC_SET_FEATURES:
+ return btrfs_ioctl_set_features(file, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
deleted file mode 100644
index 8fb382167b1..00000000000
--- a/fs/btrfs/ioctl.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __IOCTL_
-#define __IOCTL_
-#include <linux/ioctl.h>
-
-#define BTRFS_IOCTL_MAGIC 0x94
-#define BTRFS_VOL_NAME_MAX 255
-
-/* this should be 4k */
-#define BTRFS_PATH_NAME_MAX 4087
-struct btrfs_ioctl_vol_args {
- __s64 fd;
- char name[BTRFS_PATH_NAME_MAX + 1];
-};
-
-#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
-#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
-
-#define BTRFS_SUBVOL_NAME_MAX 4039
-struct btrfs_ioctl_vol_args_v2 {
- __s64 fd;
- __u64 transid;
- __u64 flags;
- __u64 unused[4];
- char name[BTRFS_SUBVOL_NAME_MAX + 1];
-};
-
-#define BTRFS_INO_LOOKUP_PATH_MAX 4080
-struct btrfs_ioctl_ino_lookup_args {
- __u64 treeid;
- __u64 objectid;
- char name[BTRFS_INO_LOOKUP_PATH_MAX];
-};
-
-struct btrfs_ioctl_search_key {
- /* which root are we searching. 0 is the tree of tree roots */
- __u64 tree_id;
-
- /* keys returned will be >= min and <= max */
- __u64 min_objectid;
- __u64 max_objectid;
-
- /* keys returned will be >= min and <= max */
- __u64 min_offset;
- __u64 max_offset;
-
- /* max and min transids to search for */
- __u64 min_transid;
- __u64 max_transid;
-
- /* keys returned will be >= min and <= max */
- __u32 min_type;
- __u32 max_type;
-
- /*
- * how many items did userland ask for, and how many are we
- * returning
- */
- __u32 nr_items;
-
- /* align to 64 bits */
- __u32 unused;
-
- /* some extra for later */
- __u64 unused1;
- __u64 unused2;
- __u64 unused3;
- __u64 unused4;
-};
-
-struct btrfs_ioctl_search_header {
- __u64 transid;
- __u64 objectid;
- __u64 offset;
- __u32 type;
- __u32 len;
-};
-
-#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
-/*
- * the buf is an array of search headers where
- * each header is followed by the actual item
- * the type field is expanded to 32 bits for alignment
- */
-struct btrfs_ioctl_search_args {
- struct btrfs_ioctl_search_key key;
- char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
-};
-
-struct btrfs_ioctl_clone_range_args {
- __s64 src_fd;
- __u64 src_offset, src_length;
- __u64 dest_offset;
-};
-
-/* flags for the defrag range ioctl */
-#define BTRFS_DEFRAG_RANGE_COMPRESS 1
-#define BTRFS_DEFRAG_RANGE_START_IO 2
-
-struct btrfs_ioctl_defrag_range_args {
- /* start of the defrag operation */
- __u64 start;
-
- /* number of bytes to defrag, use (u64)-1 to say all */
- __u64 len;
-
- /*
- * flags for the operation, which can include turning
- * on compression for this one defrag
- */
- __u64 flags;
-
- /*
- * any extent bigger than this will be considered
- * already defragged. Use 0 to take the kernel default
- * Use 1 to say every single extent must be rewritten
- */
- __u32 extent_thresh;
-
- /*
- * which compression method to use if turning on compression
- * for this defrag operation. If unspecified, zlib will
- * be used
- */
- __u32 compress_type;
-
- /* spare for later */
- __u32 unused[4];
-};
-
-struct btrfs_ioctl_space_info {
- __u64 flags;
- __u64 total_bytes;
- __u64 used_bytes;
-};
-
-struct btrfs_ioctl_space_args {
- __u64 space_slots;
- __u64 total_spaces;
- struct btrfs_ioctl_space_info spaces[0];
-};
-
-#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
- struct btrfs_ioctl_vol_args)
-/* trans start and trans end are dangerous, and only for
- * use by applications that know how to avoid the
- * resulting deadlocks
- */
-#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
-#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
-#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
-
-#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
-#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
- struct btrfs_ioctl_vol_args)
-
-#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
- struct btrfs_ioctl_clone_range_args)
-
-#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
- struct btrfs_ioctl_defrag_range_args)
-#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
- struct btrfs_ioctl_search_args)
-#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
- struct btrfs_ioctl_ino_lookup_args)
-#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
-#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
- struct btrfs_ioctl_space_args)
-#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
-#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
-#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
- struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
-#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
-#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6151f2ea38b..5665d214924 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,210 +24,259 @@
#include "extent_io.h"
#include "locking.h"
-static inline void spin_nested(struct extent_buffer *eb)
-{
- spin_lock(&eb->lock);
-}
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
/*
- * Setting a lock to blocking will drop the spinlock and set the
- * flag that forces other procs who want the lock to wait. After
- * this you can safely schedule with the lock held.
+ * if we currently have a spinning reader or writer lock
+ * (indicated by the rw flag) this will bump the count
+ * of blocking holders and drop the spinlock.
*/
-void btrfs_set_lock_blocking(struct extent_buffer *eb)
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
- set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
- spin_unlock(&eb->lock);
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ if (rw == BTRFS_WRITE_LOCK) {
+ if (atomic_read(&eb->blocking_writers) == 0) {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ btrfs_assert_tree_locked(eb);
+ atomic_inc(&eb->blocking_writers);
+ write_unlock(&eb->lock);
+ }
+ } else if (rw == BTRFS_READ_LOCK) {
+ btrfs_assert_tree_read_locked(eb);
+ atomic_inc(&eb->blocking_readers);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ read_unlock(&eb->lock);
}
- /* exit with the spin lock released and the bit set */
+ return;
}
/*
- * clearing the blocking flag will take the spinlock again.
- * After this you can't safely schedule
+ * if we currently have a blocking lock, take the spinlock
+ * and drop our blocking count
*/
-void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
- spin_nested(eb);
- clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
- smp_mb__after_clear_bit();
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+
+ if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
+ BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+ write_lock(&eb->lock);
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ if (atomic_dec_and_test(&eb->blocking_writers) &&
+ waitqueue_active(&eb->write_lock_wq))
+ wake_up(&eb->write_lock_wq);
+ } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
+ BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+ read_lock(&eb->lock);
+ atomic_inc(&eb->spinning_readers);
+ if (atomic_dec_and_test(&eb->blocking_readers) &&
+ waitqueue_active(&eb->read_lock_wq))
+ wake_up(&eb->read_lock_wq);
}
- /* exit with the spin lock held */
+ return;
}
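
The set/clear pair above exists so a holder of a spinning tree lock can bracket work that may sleep. A minimal sketch of the intended calling pattern, using only functions introduced by this patch (the wrapper name is illustrative):

/* illustrative sketch, not part of the patch above */
static void example_cow_one_block(struct extent_buffer *eb)
{
        btrfs_tree_lock(eb);                             /* spinning write lock */
        btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
        /* safe to schedule here: I/O, allocations, etc. */
        btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
        btrfs_tree_unlock(eb);                           /* copes with spinning or blocking state */
}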
/*
- * unfortunately, many of the places that currently set a lock to blocking
- * don't end up blocking for very long, and often they don't block
- * at all. For a dbench 50 run, if we don't spin on the blocking bit
- * at all, the context switch rate can jump up to 400,000/sec or more.
- *
- * So, we're still stuck with this crummy spin on the blocking bit,
- * at least until the most common causes of the short blocks
- * can be dealt with.
+ * take a spinning read lock. This will wait for any blocking
+ * writers
*/
-static int btrfs_spin_on_block(struct extent_buffer *eb)
+void btrfs_tree_read_lock(struct extent_buffer *eb)
{
- int i;
-
- for (i = 0; i < 512; i++) {
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 1;
- if (need_resched())
- break;
- cpu_relax();
+again:
+ BUG_ON(!atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner);
+
+ read_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner) {
+ /*
+ * This extent is already write-locked by our thread. We allow
+ * an additional read lock to be added because it's for the same
+ * thread. btrfs_find_all_roots() depends on this as it may be
+ * called on a partly (write-)locked tree.
+ */
+ BUG_ON(eb->lock_nested);
+ eb->lock_nested = 1;
+ read_unlock(&eb->lock);
+ return;
}
- return 0;
+ if (atomic_read(&eb->blocking_writers)) {
+ read_unlock(&eb->lock);
+ wait_event(eb->write_lock_wq,
+ atomic_read(&eb->blocking_writers) == 0);
+ goto again;
+ }
+ atomic_inc(&eb->read_locks);
+ atomic_inc(&eb->spinning_readers);
}
/*
- * This is somewhat different from trylock. It will take the
- * spinlock but if it finds the lock is set to blocking, it will
- * return without the lock held.
- *
- * returns 1 if it was able to take the lock and zero otherwise
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
*/
-int btrfs_try_spin_lock(struct extent_buffer *eb)
+int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
- int i;
+ if (atomic_read(&eb->blocking_writers))
+ return 0;
- if (btrfs_spin_on_block(eb)) {
- spin_nested(eb);
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 1;
- spin_unlock(&eb->lock);
- }
- /* spin for a bit on the BLOCKING flag */
- for (i = 0; i < 2; i++) {
- cpu_relax();
- if (!btrfs_spin_on_block(eb))
- break;
-
- spin_nested(eb);
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 1;
- spin_unlock(&eb->lock);
+ if (!read_trylock(&eb->lock))
+ return 0;
+
+ if (atomic_read(&eb->blocking_writers)) {
+ read_unlock(&eb->lock);
+ return 0;
}
- return 0;
+ atomic_inc(&eb->read_locks);
+ atomic_inc(&eb->spinning_readers);
+ return 1;
}
/*
- * the autoremove wake function will return 0 if it tried to wake up
- * a process that was already awake, which means that process won't
- * count as an exclusive wakeup. The waitq code will continue waking
- * procs until it finds one that was actually sleeping.
- *
- * For btrfs, this isn't quite what we want. We want a single proc
- * to be notified that the lock is ready for taking. If that proc
- * already happens to be awake, great, it will loop around and try for
- * the lock.
- *
- * So, btrfs_wake_function always returns 1, even when the proc that we
- * tried to wake up was already awake.
+ * returns 1 if we get the write lock and 0 if we don't
+ * this won't wait for blocking writers or readers
*/
-static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
- int sync, void *key)
+int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
- autoremove_wake_function(wait, mode, sync, key);
+ if (atomic_read(&eb->blocking_writers) ||
+ atomic_read(&eb->blocking_readers))
+ return 0;
+
+ if (!write_trylock(&eb->lock))
+ return 0;
+
+ if (atomic_read(&eb->blocking_writers) ||
+ atomic_read(&eb->blocking_readers)) {
+ write_unlock(&eb->lock);
+ return 0;
+ }
+ atomic_inc(&eb->write_locks);
+ atomic_inc(&eb->spinning_writers);
+ eb->lock_owner = current->pid;
return 1;
}
/*
- * returns with the extent buffer spinlocked.
- *
- * This will spin and/or wait as required to take the lock, and then
- * return with the spinlock held.
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
+ * drop a spinning read lock
*/
-int btrfs_tree_lock(struct extent_buffer *eb)
+void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
- DEFINE_WAIT(wait);
- wait.func = btrfs_wake_function;
-
- if (!btrfs_spin_on_block(eb))
- goto sleep;
-
- while(1) {
- spin_nested(eb);
-
- /* nobody is blocking, exit with the spinlock held */
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 0;
-
- /*
- * we have the spinlock, but the real owner is blocking.
- * wait for them
- */
- spin_unlock(&eb->lock);
-
- /*
- * spin for a bit, and if the blocking flag goes away,
- * loop around
- */
- cpu_relax();
- if (btrfs_spin_on_block(eb))
- continue;
-sleep:
- prepare_to_wait_exclusive(&eb->lock_wq, &wait,
- TASK_UNINTERRUPTIBLE);
-
- if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- schedule();
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
+ }
+ btrfs_assert_tree_read_locked(eb);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ atomic_dec(&eb->read_locks);
+ read_unlock(&eb->lock);
+}
- finish_wait(&eb->lock_wq, &wait);
+/*
+ * drop a blocking read lock
+ */
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
+{
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
- return 0;
+ btrfs_assert_tree_read_locked(eb);
+ WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+ if (atomic_dec_and_test(&eb->blocking_readers) &&
+ waitqueue_active(&eb->read_lock_wq))
+ wake_up(&eb->read_lock_wq);
+ atomic_dec(&eb->read_locks);
}
/*
- * Very quick trylock, this does not spin or schedule. It returns
- * 1 with the spinlock held if it was able to take the lock, or it
- * returns zero if it was unable to take the lock.
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
+ * take a spinning write lock. This will wait for both
+ * blocking readers and writers
*/
-int btrfs_try_tree_lock(struct extent_buffer *eb)
+void btrfs_tree_lock(struct extent_buffer *eb)
{
- if (spin_trylock(&eb->lock)) {
- if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
- /*
- * we've got the spinlock, but the real owner is
- * blocking. Drop the spinlock and return failure
- */
- spin_unlock(&eb->lock);
- return 0;
- }
- return 1;
+again:
+ wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
+ wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+ write_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_readers)) {
+ write_unlock(&eb->lock);
+ wait_event(eb->read_lock_wq,
+ atomic_read(&eb->blocking_readers) == 0);
+ goto again;
}
- /* someone else has the spinlock, give up */
- return 0;
+ if (atomic_read(&eb->blocking_writers)) {
+ write_unlock(&eb->lock);
+ wait_event(eb->write_lock_wq,
+ atomic_read(&eb->blocking_writers) == 0);
+ goto again;
+ }
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ atomic_inc(&eb->write_locks);
+ eb->lock_owner = current->pid;
}
-int btrfs_tree_unlock(struct extent_buffer *eb)
+/*
+ * drop a spinning or a blocking write lock.
+ */
+void btrfs_tree_unlock(struct extent_buffer *eb)
{
- /*
- * if we were a blocking owner, we don't have the spinlock held
- * just clear the bit and look for waiters
- */
- if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- smp_mb__after_clear_bit();
- else
- spin_unlock(&eb->lock);
-
- if (waitqueue_active(&eb->lock_wq))
- wake_up(&eb->lock_wq);
- return 0;
+ int blockers = atomic_read(&eb->blocking_writers);
+
+ BUG_ON(blockers > 1);
+
+ btrfs_assert_tree_locked(eb);
+ eb->lock_owner = 0;
+ atomic_dec(&eb->write_locks);
+
+ if (blockers) {
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_dec(&eb->blocking_writers);
+ smp_mb();
+ if (waitqueue_active(&eb->write_lock_wq))
+ wake_up(&eb->write_lock_wq);
+ } else {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ write_unlock(&eb->lock);
+ }
}
void btrfs_assert_tree_locked(struct extent_buffer *eb)
{
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- assert_spin_locked(&eb->lock);
+ BUG_ON(!atomic_read(&eb->write_locks));
+}
+
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+ BUG_ON(!atomic_read(&eb->read_locks));
}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6c4ce457168..b81e0e9a489 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,13 +19,42 @@
#ifndef __BTRFS_LOCKING_
#define __BTRFS_LOCKING_
-int btrfs_tree_lock(struct extent_buffer *eb);
-int btrfs_tree_unlock(struct extent_buffer *eb);
+#define BTRFS_WRITE_LOCK 1
+#define BTRFS_READ_LOCK 2
+#define BTRFS_WRITE_LOCK_BLOCKING 3
+#define BTRFS_READ_LOCK_BLOCKING 4
-int btrfs_try_tree_lock(struct extent_buffer *eb);
-int btrfs_try_spin_lock(struct extent_buffer *eb);
+void btrfs_tree_lock(struct extent_buffer *eb);
+void btrfs_tree_unlock(struct extent_buffer *eb);
-void btrfs_set_lock_blocking(struct extent_buffer *eb);
-void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
void btrfs_assert_tree_locked(struct extent_buffer *eb);
+int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+
+static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
+{
+ if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
+ btrfs_tree_unlock(eb);
+ else if (rw == BTRFS_READ_LOCK_BLOCKING)
+ btrfs_tree_read_unlock_blocking(eb);
+ else if (rw == BTRFS_READ_LOCK)
+ btrfs_tree_read_unlock(eb);
+ else
+ BUG();
+}
+
+static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
+{
+ btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
+}
+
+static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+ btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
+}
#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index a178f5ebea7..dfad8514f0d 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
struct workspace {
void *mem;
- void *buf; /* where compressed data goes */
- void *cbuf; /* where decompressed data goes */
+ void *buf; /* where decompressed data goes */
+ void *cbuf; /* where compressed data goes */
struct list_head list;
};
@@ -141,9 +141,9 @@ static int lzo_compress_pages(struct list_head *ws,
ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
&out_len, workspace->mem);
if (ret != LZO_E_OK) {
- printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+ printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws,
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -207,8 +207,10 @@ static int lzo_compress_pages(struct list_head *ws,
}
/* we're making it bigger, give up */
- if (tot_in > 8192 && tot_in < tot_out)
+ if (tot_in > 8192 && tot_in < tot_out) {
+ ret = -E2BIG;
goto out;
+ }
/* we're all done */
if (tot_in >= len)
@@ -333,7 +335,7 @@ cont:
break;
if (page_in_index + 1 >= total_pages_in) {
- ret = -1;
+ ret = -EIO;
goto done;
}
@@ -355,8 +357,8 @@ cont:
if (need_unmap)
kunmap(pages_in[page_in_index - 1]);
if (ret != LZO_E_OK) {
- printk(KERN_WARNING "btrfs decompress failed\n");
- ret = -1;
+ printk(KERN_WARNING "BTRFS: decompress failed\n");
+ ret = -EIO;
break;
}
@@ -399,21 +401,21 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
out_len = PAGE_CACHE_SIZE;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
- printk(KERN_WARNING "btrfs decompress failed!\n");
- ret = -1;
+ printk(KERN_WARNING "BTRFS: decompress failed!\n");
+ ret = -EIO;
goto out;
}
if (out_len < start_byte) {
- ret = -1;
+ ret = -EIO;
goto out;
}
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = kmap_atomic(dest_page, KM_USER0);
+ kaddr = kmap_atomic(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
out:
return ret;
}
diff --git a/fs/ocfs2/cluster/ver.h b/fs/btrfs/math.h
index 32554c3382c..b7816cefbd1 100644
--- a/fs/ocfs2/cluster/ver.h
+++ b/fs/btrfs/math.h
@@ -1,16 +1,11 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
+
+/*
+ * Copyright (C) 2012 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -23,9 +18,27 @@
* Boston, MA 021110-1307, USA.
*/
-#ifndef O2CLUSTER_VER_H
-#define O2CLUSTER_VER_H
+#ifndef __BTRFS_MATH_H
+#define __BTRFS_MATH_H
+
+#include <asm/div64.h>
+
+static inline u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ do_div(num, 10);
+ return num;
+}
-void cluster_print_version(void);
+static inline u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor == 100)
+ return num;
+ num *= factor;
+ do_div(num, 100);
+ return num;
+}
-#endif /* O2CLUSTER_VER_H */
+#endif
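
A quick worked example of the two helpers (factor is tenths for div_factor and hundredths for div_factor_fine, with do_div() keeping the 64-bit division usable on 32-bit hosts):

/*
 * div_factor(1000, 8)       = 1000 * 8  / 10  = 800
 * div_factor_fine(1000, 85) = 1000 * 85 / 100 = 850
 */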
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 083a5547737..7187b14faa6 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,9 @@
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
+#include "disk-io.h"
+
+static struct kmem_cache *btrfs_ordered_extent_cache;
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
@@ -59,6 +62,14 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
return NULL;
}
+static void ordered_data_tree_panic(struct inode *inode, int errno,
+ u64 offset)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
+ "%llu", offset);
+}
+
/*
* look for a given offset in the tree, and if it can't be found return the
* first lesser offset
@@ -174,22 +185,27 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int dio, int compress_type)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
tree = &BTRFS_I(inode)->ordered_tree;
- entry = kzalloc(sizeof(*entry), GFP_NOFS);
+ entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
return -ENOMEM;
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
+ !(type == BTRFS_ORDERED_NOCOW))
+ entry->csum_bytes_left = disk_len;
entry->disk_len = disk_len;
entry->bytes_left = len;
- entry->inode = inode;
+ entry->inode = igrab(inode);
entry->compress_type = compress_type;
+ entry->truncated_len = (u64)-1;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -201,19 +217,32 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
INIT_LIST_HEAD(&entry->root_extent_list);
+ INIT_LIST_HEAD(&entry->work_list);
+ init_completion(&entry->completion);
+ INIT_LIST_HEAD(&entry->log_list);
- spin_lock(&tree->lock);
+ trace_btrfs_ordered_extent_add(inode, entry);
+
+ spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
- BUG_ON(node);
- spin_unlock(&tree->lock);
+ if (node)
+ ordered_data_tree_panic(inode, -EEXIST, file_offset);
+ spin_unlock_irq(&tree->lock);
- spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
- &BTRFS_I(inode)->root->fs_info->ordered_extents);
- spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ &root->ordered_extents);
+ root->nr_ordered_extents++;
+ if (root->nr_ordered_extents == 1) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(!list_empty(&root->ordered_root));
+ list_add_tail(&root->ordered_root,
+ &root->fs_info->ordered_roots);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
- BUG_ON(node);
return 0;
}
@@ -247,17 +276,20 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
* when an ordered extent is finished. If the list covers more than one
* ordered extent, it is split across multiples.
*/
-int btrfs_add_ordered_sum(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- struct btrfs_ordered_sum *sum)
+void btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum)
{
struct btrfs_ordered_inode_tree *tree;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock(&tree->lock);
- return 0;
+ WARN_ON(entry->csum_bytes_left < sum->len);
+ entry->csum_bytes_left -= sum->len;
+ if (entry->csum_bytes_left == 0)
+ wake_up(&entry->wait);
+ spin_unlock_irq(&tree->lock);
}
/*
@@ -274,18 +306,19 @@ int btrfs_add_ordered_sum(struct inode *inode,
*/
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size)
+ u64 *file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
int ret;
+ unsigned long flags;
u64 dec_end;
u64 dec_start;
u64 to_dec;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
if (!node) {
ret = 1;
@@ -303,27 +336,32 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
entry->len);
*file_offset = dec_end;
if (dec_start > dec_end) {
- printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
- (unsigned long long)dec_start,
- (unsigned long long)dec_end);
+ btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ "bad ordering dec_start %llu end %llu", dec_start, dec_end);
}
to_dec = dec_end - dec_start;
if (to_dec > entry->bytes_left) {
- printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
- (unsigned long long)entry->bytes_left,
- (unsigned long long)to_dec);
+ btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ "bad ordered accounting left %llu size %llu",
+ entry->bytes_left, to_dec);
}
entry->bytes_left -= to_dec;
- if (entry->bytes_left == 0)
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
@@ -338,15 +376,21 @@ out:
*/
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size)
+ u64 file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
+ if (cached && *cached) {
+ entry = *cached;
+ goto have_entry;
+ }
+
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
@@ -354,71 +398,178 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
}
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
if (!offset_in_entry(entry, file_offset)) {
ret = 1;
goto out;
}
if (io_size > entry->bytes_left) {
- printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
- (unsigned long long)entry->bytes_left,
- (unsigned long long)io_size);
+ btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ "bad ordered accounting left %llu size %llu",
+ entry->bytes_left, io_size);
}
entry->bytes_left -= io_size;
- if (entry->bytes_left == 0)
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
+/* Needs to either be called under a log transaction or the log_mutex */
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_extent *ordered;
+ struct rb_node *n;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ if (!list_empty(&ordered->log_list))
+ continue;
+ list_add_tail(&ordered->log_list, logged_list);
+ atomic_inc(&ordered->refs);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+void btrfs_put_logged_extents(struct list_head *logged_list)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ while (!list_empty(logged_list)) {
+ ordered = list_first_entry(logged_list,
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log)
+{
+ int index = log->log_transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ list_splice_tail(logged_list, &log->logged_list[index]);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+{
+ struct btrfs_ordered_extent *ordered;
+ int index = transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ while (!list_empty(&log->logged_list[index])) {
+ ordered = list_first_entry(&log->logged_list[index],
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+
+ if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ struct inode *inode = ordered->inode;
+ u64 start = ordered->file_offset;
+ u64 end = ordered->file_offset + ordered->len - 1;
+
+ WARN_ON(!inode);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ }
+ wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
+ &ordered->flags));
+
+ btrfs_put_ordered_extent(ordered);
+ spin_lock_irq(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
+{
+ struct btrfs_ordered_extent *ordered;
+ int index = transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ while (!list_empty(&log->logged_list[index])) {
+ ordered = list_first_entry(&log->logged_list[index],
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+ btrfs_put_ordered_extent(ordered);
+ spin_lock_irq(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
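
The five helpers above pin, hand over and release ordered extents on behalf of the tree log. A rough sketch of the lifecycle they are meant to support; the real call sites live in fs/btrfs/tree-log.c, which is outside this hunk, so the flow below is an assumption:

/*
 *      LIST_HEAD(logged_list);
 *
 *      btrfs_get_logged_extents(inode, &logged_list);      pin ordered extents
 *      if (logging the inode failed)
 *              btrfs_put_logged_extents(&logged_list);      drop the refs
 *      else
 *              btrfs_submit_logged_extents(&logged_list, log);
 *
 *      later, when the log transaction syncs or is torn down:
 *      btrfs_wait_logged_extents(log, transid);             wait for IO_DONE
 *      btrfs_free_logged_extents(log, transid);             or just drop on abort
 */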
+
/*
* used to drop a reference on an ordered extent. This will free
* the extent if the last reference is dropped
*/
-int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
struct list_head *cur;
struct btrfs_ordered_sum *sum;
+ trace_btrfs_ordered_extent_put(entry->inode, entry);
+
if (atomic_dec_and_test(&entry->refs)) {
+ if (entry->inode)
+ btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
list_del(&sum->list);
kfree(sum);
}
- kfree(entry);
+ kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
- return 0;
}
/*
* remove an ordered extent from the tree. No references are dropped
- * and you must wake_up entry->wait. You must hold the tree lock
- * while you call this function.
+ * and waiters are woken up.
*/
-static int __btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
- tree->last = NULL;
+ if (tree->last == node)
+ tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_unlock_irq(&tree->lock);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
+ root->nr_ordered_extents--;
+
+ trace_btrfs_ordered_extent_remove(inode, entry);
/*
* we have no more ordered extents for this inode and
@@ -427,86 +578,112 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
*/
if (RB_EMPTY_ROOT(&tree->tree) &&
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+ spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_root_lock);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
- return 0;
+ if (!root->nr_ordered_extents) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(list_empty(&root->ordered_root));
+ list_del_init(&root->ordered_root);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
+ wake_up(&entry->wait);
}
-/*
- * remove an ordered extent from the tree. No references are dropped
- * but any waiters are woken.
- */
-int btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
- struct btrfs_ordered_inode_tree *tree;
- int ret;
-
- tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
- ret = __btrfs_remove_ordered_extent(inode, entry);
- spin_unlock(&tree->lock);
- wake_up(&entry->wait);
+ struct btrfs_ordered_extent *ordered;
- return ret;
+ ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
+ btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ complete(&ordered->completion);
}
/*
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-int btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput)
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
{
- struct list_head splice;
- struct list_head *cur;
- struct btrfs_ordered_extent *ordered;
- struct inode *inode;
+ struct list_head splice, works;
+ struct btrfs_ordered_extent *ordered, *next;
+ int count = 0;
INIT_LIST_HEAD(&splice);
+ INIT_LIST_HEAD(&works);
+
+ mutex_lock(&root->ordered_extent_mutex);
+ spin_lock(&root->ordered_extent_lock);
+ list_splice_init(&root->ordered_extents, &splice);
+ while (!list_empty(&splice) && nr) {
+ ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+ root_extent_list);
+ list_move_tail(&ordered->root_extent_list,
+ &root->ordered_extents);
+ atomic_inc(&ordered->refs);
+ spin_unlock(&root->ordered_extent_lock);
- spin_lock(&root->fs_info->ordered_extent_lock);
- list_splice_init(&root->fs_info->ordered_extents, &splice);
- while (!list_empty(&splice)) {
- cur = splice.next;
- ordered = list_entry(cur, struct btrfs_ordered_extent,
- root_extent_list);
- if (nocow_only &&
- !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
- list_move(&ordered->root_extent_list,
- &root->fs_info->ordered_extents);
- cond_resched_lock(&root->fs_info->ordered_extent_lock);
- continue;
- }
+ btrfs_init_work(&ordered->flush_work,
+ btrfs_run_ordered_extent_work, NULL, NULL);
+ list_add_tail(&ordered->work_list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &ordered->flush_work);
- list_del_init(&ordered->root_extent_list);
- atomic_inc(&ordered->refs);
+ cond_resched();
+ spin_lock(&root->ordered_extent_lock);
+ if (nr != -1)
+ nr--;
+ count++;
+ }
+ list_splice_tail(&splice, &root->ordered_extents);
+ spin_unlock(&root->ordered_extent_lock);
- /*
- * the inode may be getting freed (in sys_unlink path).
- */
- inode = igrab(ordered->inode);
+ list_for_each_entry_safe(ordered, next, &works, work_list) {
+ list_del_init(&ordered->work_list);
+ wait_for_completion(&ordered->completion);
+ btrfs_put_ordered_extent(ordered);
+ cond_resched();
+ }
+ mutex_unlock(&root->ordered_extent_mutex);
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ return count;
+}
- if (inode) {
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- if (delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
- } else {
- btrfs_put_ordered_extent(ordered);
- }
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+ int done;
+
+ INIT_LIST_HEAD(&splice);
- spin_lock(&root->fs_info->ordered_extent_lock);
+ mutex_lock(&fs_info->ordered_operations_mutex);
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice) && nr) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->ordered_root,
+ &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
+
+ done = btrfs_wait_ordered_extents(root, nr);
+ btrfs_put_fs_root(root);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ if (nr != -1) {
+ nr -= done;
+ WARN_ON(nr < 0);
+ }
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
- return 0;
+ list_splice_tail(&splice, &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
+ mutex_unlock(&fs_info->ordered_operations_mutex);
}
/*
@@ -519,23 +696,26 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root,
* extra check to make sure the ordered operation list really is empty
* before we return
*/
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int wait)
{
struct btrfs_inode *btrfs_inode;
struct inode *inode;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct list_head splice;
+ struct list_head works;
+ struct btrfs_delalloc_work *work, *next;
+ int ret = 0;
INIT_LIST_HEAD(&splice);
+ INIT_LIST_HEAD(&works);
- mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_extent_lock);
-again:
- list_splice_init(&root->fs_info->ordered_operations, &splice);
-
+ mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
+ spin_lock(&root->fs_info->ordered_root_lock);
+ list_splice_init(&cur_trans->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
-
inode = &btrfs_inode->vfs_inode;
list_del_init(&btrfs_inode->ordered_operations);
@@ -544,31 +724,41 @@ again:
* the inode may be getting freed (in sys_unlink path).
*/
inode = igrab(inode);
+ if (!inode)
+ continue;
- if (!wait && inode) {
+ if (!wait)
list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &root->fs_info->ordered_operations);
- }
- spin_unlock(&root->fs_info->ordered_extent_lock);
-
- if (inode) {
- if (wait)
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- else
- filemap_flush(inode->i_mapping);
- btrfs_add_delayed_iput(inode);
+ &cur_trans->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+
+ work = btrfs_alloc_delalloc_work(inode, wait, 1);
+ if (!work) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ if (list_empty(&BTRFS_I(inode)->ordered_operations))
+ list_add_tail(&btrfs_inode->ordered_operations,
+ &splice);
+ list_splice_tail(&splice,
+ &cur_trans->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ ret = -ENOMEM;
+ goto out;
}
+ list_add_tail(&work->list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
cond_resched();
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
}
- if (wait && !list_empty(&root->fs_info->ordered_operations))
- goto again;
-
- spin_unlock(&root->fs_info->ordered_extent_lock);
- mutex_unlock(&root->fs_info->ordered_operations_mutex);
-
- return 0;
+ spin_unlock(&root->fs_info->ordered_root_lock);
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+ mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
+ return ret;
}
/*
@@ -585,10 +775,12 @@ void btrfs_start_ordered_extent(struct inode *inode,
u64 start = entry->file_offset;
u64 end = start + entry->len - 1;
+ trace_btrfs_ordered_extent_start(inode, entry);
+
/*
* pages in the range can be dirty, clean or writeback. We
* start IO on any dirty ones so the wait doesn't stall waiting
- * for pdflush to find them
+ * for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
filemap_fdatawrite_range(inode->i_mapping, start, end);
@@ -603,10 +795,10 @@ void btrfs_start_ordered_extent(struct inode *inode,
*/
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
+ int ret = 0;
u64 end;
u64 orig_end;
struct btrfs_ordered_extent *ordered;
- int found;
if (start + len < start) {
orig_end = INT_LIMIT(loff_t);
@@ -615,22 +807,39 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
if (orig_end > INT_LIMIT(loff_t))
orig_end = INT_LIMIT(loff_t);
}
-again:
+
/* start IO across the range first to instantiate any delalloc
* extents
*/
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
- /* The compression code will leave pages locked but return from
- * writepage without setting the page writeback. Starting again
- * with WB_SYNC_ALL will end up waiting for the IO to actually start.
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+ if (ret)
+ return ret;
+ /*
+ * So with compression we will find and lock a dirty page and clear the
+ * first one as dirty, set up an async extent, and immediately return
+ * with the entire range locked but with none of the pages actually
+ * marked as writeback. So we can't just filemap_write_and_wait_range()
+ * and expect it to work since it will just kick off a thread to do the
+ * actual work. So we need to call filemap_fdatawrite_range _again_
+ * since it will wait on the page lock, which won't be unlocked until
+ * after the pages have been marked as writeback, and so we're good to
+ * go from there. We have to do this, otherwise we'll miss the ordered
+ * extents and that results in badness. Please Josef, do not think you
+ * know better and pull this out at some point in the future, it is
+ * right and you are wrong.
*/
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
- filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ orig_end);
+ if (ret)
+ return ret;
+ }
+ ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ if (ret)
+ return ret;
end = orig_end;
- found = 0;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
@@ -639,24 +848,20 @@ again:
btrfs_put_ordered_extent(ordered);
break;
}
- if (ordered->file_offset + ordered->len < start) {
+ if (ordered->file_offset + ordered->len <= start) {
btrfs_put_ordered_extent(ordered);
break;
}
- found++;
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ ret = -EIO;
btrfs_put_ordered_extent(ordered);
- if (end == 0 || end == start)
+ if (ret || end == 0 || end == start)
break;
end--;
}
- if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
- EXTENT_DELALLOC, 0, NULL)) {
- schedule_timeout(1);
- goto again;
- }
- return 0;
+ return ret;
}
/*
@@ -671,7 +876,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -682,7 +887,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
if (entry)
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -698,7 +903,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node) {
node = tree_search(tree, file_offset + len);
@@ -723,7 +928,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
out:
if (entry)
atomic_inc(&entry->refs);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -739,7 +944,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -747,7 +952,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -759,22 +964,24 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered)
{
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
u64 disk_i_size;
u64 new_i_size;
- u64 i_size_test;
u64 i_size = i_size_read(inode);
struct rb_node *node;
struct rb_node *prev = NULL;
struct btrfs_ordered_extent *test;
int ret = 1;
- if (ordered)
+ spin_lock_irq(&tree->lock);
+ if (ordered) {
offset = entry_end(ordered);
- else
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
+ offset = min(offset,
+ ordered->file_offset +
+ ordered->truncated_len);
+ } else {
offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
-
- spin_lock(&tree->lock);
+ }
disk_i_size = BTRFS_I(inode)->disk_i_size;
/* truncate file */
@@ -788,18 +995,17 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
* if the disk i_size is already at the inode->i_size, or
* this ordered extent is inside the disk i_size, we're done
*/
- if (disk_i_size == i_size || offset <= disk_i_size) {
+ if (disk_i_size == i_size)
goto out;
- }
/*
- * we can't update the disk_isize if there are delalloc bytes
- * between disk_i_size and this ordered extent
+ * We still need to update disk_i_size if outstanding_isize is greater
+ * than disk_i_size.
*/
- if (test_range_bit(io_tree, disk_i_size, offset - 1,
- EXTENT_DELALLOC, 0, NULL)) {
+ if (offset <= disk_i_size &&
+ (!ordered || ordered->outstanding_isize <= disk_i_size))
goto out;
- }
+
/*
* walk backward from this ordered extent to disk_i_size.
* if we find an ordered extent then we can't update disk i_size
@@ -820,69 +1026,53 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
}
node = prev;
}
- while (node) {
+ for (; node; node = rb_prev(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+ /* We treat this entry as if it doesn't exist */
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
if (test->file_offset + test->len <= disk_i_size)
break;
if (test->file_offset >= i_size)
break;
- if (test->file_offset >= disk_i_size)
+ if (entry_end(test) > disk_i_size) {
+ /*
+ * we don't update disk_i_size now, so record this
+ * pending i_size; otherwise we will not know the real
+ * i_size.
+ */
+ if (test->outstanding_isize < offset)
+ test->outstanding_isize = offset;
+ if (ordered &&
+ ordered->outstanding_isize >
+ test->outstanding_isize)
+ test->outstanding_isize =
+ ordered->outstanding_isize;
goto out;
- node = rb_prev(node);
+ }
}
new_i_size = min_t(u64, offset, i_size);
/*
- * at this point, we know we can safely update i_size to at least
- * the offset from this ordered extent. But, we need to
- * walk forward and see if ios from higher up in the file have
- * finished.
+ * Some ordered extents may have completed before the current one, and
+ * we hold the real i_size in ->outstanding_isize.
*/
- if (ordered) {
- node = rb_next(&ordered->rb_node);
- } else {
- if (prev)
- node = rb_next(prev);
- else
- node = rb_first(&tree->tree);
- }
- i_size_test = 0;
- if (node) {
- /*
- * do we have an area where IO might have finished
- * between our ordered extent and the next one.
- */
- test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (test->file_offset > offset)
- i_size_test = test->file_offset;
- } else {
- i_size_test = i_size;
- }
-
- /*
- * i_size_test is the end of a region after this ordered
- * extent where there are no ordered extents. As long as there
- * are no delalloc bytes in this area, it is safe to update
- * disk_i_size to the end of the region.
- */
- if (i_size_test > offset &&
- !test_range_bit(io_tree, offset, i_size_test - 1,
- EXTENT_DELALLOC, 0, NULL)) {
- new_i_size = min_t(u64, i_size_test, i_size);
- }
+ if (ordered && ordered->outstanding_isize > new_i_size)
+ new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
BTRFS_I(inode)->disk_i_size = new_i_size;
ret = 0;
out:
/*
- * we need to remove the ordered extent with the tree lock held
- * so that other people calling this function don't find our fully
- * processed ordered entry and skip updating the i_size
+ * We need to do this because we can't remove ordered extents until
+ * after disk_i_size has been updated and the inode has been updated to
+ * reflect the change, so we need to tell anybody who finds this
+ * ordered extent that we've already done all the real work; we just
+ * haven't completed all the other work.
*/
if (ordered)
- __btrfs_remove_ordered_extent(inode, ordered);
- spin_unlock(&tree->lock);
- if (ordered)
- wake_up(&ordered->wait);
+ set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+ spin_unlock_irq(&tree->lock);
return ret;
}
@@ -892,39 +1082,42 @@ out:
* be reclaimed before their checksum is actually put into the btree
*/
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u32 *sum)
+ u32 *sum, int len)
{
struct btrfs_ordered_sum *ordered_sum;
- struct btrfs_sector_sum *sector_sums;
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
- int ret = 1;
+ int index = 0;
ordered = btrfs_lookup_ordered_extent(inode, offset);
if (!ordered)
- return 1;
+ return 0;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
- if (disk_bytenr >= ordered_sum->bytenr) {
- num_sectors = ordered_sum->len / sectorsize;
- sector_sums = ordered_sum->sums;
- for (i = 0; i < num_sectors; i++) {
- if (sector_sums[i].bytenr == disk_bytenr) {
- *sum = sector_sums[i].sum;
- ret = 0;
- goto out;
- }
- }
+ if (disk_bytenr >= ordered_sum->bytenr &&
+ disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
+ i = (disk_bytenr - ordered_sum->bytenr) >>
+ inode->i_sb->s_blocksize_bits;
+ num_sectors = ordered_sum->len >>
+ inode->i_sb->s_blocksize_bits;
+ num_sectors = min_t(int, len - index, num_sectors - i);
+ memcpy(sum + index, ordered_sum->sums + i,
+ num_sectors);
+
+ index += (int)num_sectors;
+ if (index == len)
+ goto out;
+ disk_bytenr += num_sectors * sectorsize;
}
}
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
btrfs_put_ordered_extent(ordered);
- return ret;
+ return index;
}
@@ -940,10 +1133,10 @@ out:
* If trans is not null, we'll do a friendly check for a transaction that
* is already flushing things and force the IO down ourselves.
*/
-int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
{
+ struct btrfs_transaction *cur_trans = trans->transaction;
u64 last_mod;
last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
@@ -952,24 +1145,31 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
* if this file hasn't been changed since the last transaction
* commit, we can safely return without doing anything
*/
- if (last_mod < root->fs_info->last_trans_committed)
- return 0;
+ if (last_mod <= root->fs_info->last_trans_committed)
+ return;
- /*
- * the transaction is already committing. Just start the IO and
- * don't bother with all of this list nonsense
- */
- if (trans && root->fs_info->running_transaction->blocked) {
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- return 0;
- }
-
- spin_lock(&root->fs_info->ordered_extent_lock);
+ spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &root->fs_info->ordered_operations);
+ &cur_trans->ordered_operations);
}
- spin_unlock(&root->fs_info->ordered_extent_lock);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+}
+
+int __init ordered_data_init(void)
+{
+ btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+ sizeof(struct btrfs_ordered_extent), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_ordered_extent_cache)
+ return -ENOMEM;
return 0;
}
+
+void ordered_data_exit(void)
+{
+ if (btrfs_ordered_extent_cache)
+ kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
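
The btrfs_wait_ordered_roots() hunk near the top of this file uses a recurring shape: splice the spinlock-protected list onto a private head, drop the lock around the blocking per-entry work, and splice any unprocessed remainder back afterwards. Below is a minimal standalone sketch of that shape, using a plain singly linked list and a pthread mutex in place of the kernel's list_head and spinlock; all names in the sketch are illustrative, not btrfs APIs.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *pending;                 /* protected by list_lock */

static void slow_work(struct node *n)        /* stands in for waiting on ordered extents */
{
        printf("processed %d\n", n->id);
}

static void process_pending(int budget)
{
        struct node *splice, *n;

        pthread_mutex_lock(&list_lock);
        splice = pending;                    /* take the whole list privately */
        pending = NULL;
        pthread_mutex_unlock(&list_lock);

        while (splice && budget--) {
                n = splice;
                splice = n->next;
                slow_work(n);                /* blocking work done without the lock */
                free(n);
        }

        if (splice) {                        /* requeue whatever we did not get to */
                pthread_mutex_lock(&list_lock);
                n = splice;
                while (n->next)
                        n = n->next;
                n->next = pending;
                pending = splice;
                pthread_mutex_unlock(&list_lock);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));
                n->id = i;
                n->next = pending;
                pending = n;
        }
        process_pending(2);                  /* budgeted pass, like a positive nr above */
        process_pending(-1);                 /* -1 means no limit, drain the rest */
        return 0;
}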
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ff1f69aa188..246897058ef 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
struct rb_node *last;
};
-/*
- * these are used to collect checksums done just before bios submission.
- * They are attached via a list into the ordered extent, and
- * checksum items are inserted into the tree after all the blocks in
- * the ordered extent are on disk
- */
-struct btrfs_sector_sum {
- /* bytenr on disk */
- u64 bytenr;
- u32 sum;
-};
-
struct btrfs_ordered_sum {
/* bytenr is the start of this extent on disk */
u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
/*
* this is the length in bytes covered by the sums array below.
*/
- unsigned long len;
+ int len;
struct list_head list;
- /* last field is a variable length array of btrfs_sector_sums */
- struct btrfs_sector_sum sums[];
+ /* last field is a variable length array of csums */
+ u32 sums[];
};
/*
@@ -74,6 +62,15 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
+
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
+ * has done its due diligence in updating
+ * the isize. */
+#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this
+ ordered extent */
+#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -90,6 +87,22 @@ struct btrfs_ordered_extent {
/* number of bytes that still need writing */
u64 bytes_left;
+ /* number of bytes that still need csumming */
+ u64 csum_bytes_left;
+
+ /*
+ * the end of the ordered extent which is behind it but
+ * didn't update disk_i_size. Please see the comment in
+ * btrfs_ordered_update_i_size().
+ */
+ u64 outstanding_isize;
+
+ /*
+ * If we get truncated we need to adjust the file extent we enter for
+ * this ordered extent so that we do not expose stale data.
+ */
+ u64 truncated_len;
+
/* flags (described above) */
unsigned long flags;
@@ -105,6 +118,9 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
+ /* If we need to wait on this to be done */
+ struct list_head log_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -113,8 +129,13 @@ struct btrfs_ordered_extent {
/* a per root list of all the pending ordered extents */
struct list_head root_extent_list;
-};
+ struct btrfs_work work;
+
+ struct completion completion;
+ struct btrfs_work flush_work;
+ struct list_head work_list;
+};
/*
* calculates the total size you need to allocate for an ordered sum
@@ -123,11 +144,8 @@ struct btrfs_ordered_extent {
static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
unsigned long bytes)
{
- unsigned long num_sectors = (bytes + root->sectorsize - 1) /
- root->sectorsize;
- num_sectors++;
- return sizeof(struct btrfs_ordered_sum) +
- num_sectors * sizeof(struct btrfs_sector_sum);
+ int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
}
static inline void
@@ -138,15 +156,16 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
t->last = NULL;
}
-int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-int btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry);
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size);
+ u64 file_offset, u64 io_size, int uptodate);
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size);
+ u64 *file_offset, u64 io_size,
+ int uptodate);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
@@ -154,9 +173,9 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int compress_type);
-int btrfs_add_ordered_sum(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- struct btrfs_ordered_sum *sum);
+void btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct inode *inode,
@@ -169,11 +188,22 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
-int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode);
-int btrfs_wait_ordered_extents(struct btrfs_root *root,
- int nocow_only, int delay_iput);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+ u32 *sum, int len);
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int wait);
+void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list);
+void btrfs_put_logged_extents(struct list_head *logged_list);
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log);
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
+int __init ordered_data_init(void);
+void ordered_data_exit(void);
#endif
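
With btrfs_sector_sum gone, an ordered sum is now just the fixed header followed by a flat array of 32-bit checksums, one per sector, which is exactly what the reworked btrfs_ordered_sum_size() computes. A small standalone illustration of that arithmetic follows; the 16 KiB length and 4 KiB sectorsize are assumptions picked for the example, and the 4-byte checksum size is simply what the u32 sums[] array implies.

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long bytes = 16384;         /* length covered by one ordered sum */
        unsigned int sectorsize = 4096;      /* assumed sectorsize for the example */
        int num_sectors = (int)DIV_ROUND_UP(bytes, sectorsize);

        /* one u32 checksum per sector, appended after the fixed header */
        printf("%d sectors -> %zu bytes of checksums after the header\n",
               num_sectors, num_sectors * sizeof(uint32_t));
        return 0;
}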
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index f8be250963a..65793edb38c 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -58,7 +58,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
- if (ret) {
+ if (ret) { /* JDM: Really? */
ret = -ENOENT;
goto out;
}
@@ -69,23 +69,3 @@ out:
btrfs_free_path(path);
return ret;
}
-
-int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
-{
- struct btrfs_path *path;
- struct btrfs_key key;
- int ret;
-
- key.objectid = BTRFS_ORPHAN_OBJECTID;
- key.type = BTRFS_ORPHAN_ITEM_KEY;
- key.offset = offset;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
- btrfs_free_path(path);
- return ret;
-}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e..9626b4ad3b9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -26,14 +26,12 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
int i;
printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
"num_stripes %d\n",
- (unsigned long long)btrfs_chunk_length(eb, chunk),
- (unsigned long long)btrfs_chunk_owner(eb, chunk),
- (unsigned long long)btrfs_chunk_type(eb, chunk),
- num_stripes);
+ btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk),
+ btrfs_chunk_type(eb, chunk), num_stripes);
for (i = 0 ; i < num_stripes ; i++) {
printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
- (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
- (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
+ btrfs_stripe_devid_nr(eb, chunk, i),
+ btrfs_stripe_offset_nr(eb, chunk, i));
}
}
static void print_dev_item(struct extent_buffer *eb,
@@ -41,22 +39,22 @@ static void print_dev_item(struct extent_buffer *eb,
{
printk(KERN_INFO "\t\tdev item devid %llu "
"total_bytes %llu bytes used %llu\n",
- (unsigned long long)btrfs_device_id(eb, dev_item),
- (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
- (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
+ btrfs_device_id(eb, dev_item),
+ btrfs_device_total_bytes(eb, dev_item),
+ btrfs_device_bytes_used(eb, dev_item));
}
static void print_extent_data_ref(struct extent_buffer *eb,
struct btrfs_extent_data_ref *ref)
{
printk(KERN_INFO "\t\textent data backref root %llu "
"objectid %llu offset %llu count %u\n",
- (unsigned long long)btrfs_extent_data_ref_root(eb, ref),
- (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref),
- (unsigned long long)btrfs_extent_data_ref_offset(eb, ref),
+ btrfs_extent_data_ref_root(eb, ref),
+ btrfs_extent_data_ref_objectid(eb, ref),
+ btrfs_extent_data_ref_offset(eb, ref),
btrfs_extent_data_ref_count(eb, ref));
}
-static void print_extent_item(struct extent_buffer *eb, int slot)
+static void print_extent_item(struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -65,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
- int type;
u32 item_size = btrfs_item_size_nr(eb, slot);
u64 flags;
u64 offset;
@@ -87,19 +84,18 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
flags = btrfs_extent_flags(eb, ei);
printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
- (unsigned long long)btrfs_extent_refs(eb, ei),
- (unsigned long long)btrfs_extent_generation(eb, ei),
- (unsigned long long)flags);
+ btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
+ flags);
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if ((type == BTRFS_EXTENT_ITEM_KEY) &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)(ei + 1);
btrfs_tree_block_key(eb, info, &key);
- printk(KERN_INFO "\t\ttree block key (%llu %x %llu) "
+ printk(KERN_INFO "\t\ttree block key (%llu %u %llu) "
"level %d\n",
- (unsigned long long)btrfs_disk_key_objectid(&key),
- key.type,
- (unsigned long long)btrfs_disk_key_offset(&key),
+ btrfs_disk_key_objectid(&key), key.type,
+ btrfs_disk_key_offset(&key),
btrfs_tree_block_level(eb, info));
iref = (struct btrfs_extent_inline_ref *)(info + 1);
} else {
@@ -115,11 +111,11 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
switch (type) {
case BTRFS_TREE_BLOCK_REF_KEY:
printk(KERN_INFO "\t\ttree block backref "
- "root %llu\n", (unsigned long long)offset);
+ "root %llu\n", offset);
break;
case BTRFS_SHARED_BLOCK_REF_KEY:
printk(KERN_INFO "\t\tshared block backref "
- "parent %llu\n", (unsigned long long)offset);
+ "parent %llu\n", offset);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -129,8 +125,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
sref = (struct btrfs_shared_data_ref *)(iref + 1);
printk(KERN_INFO "\t\tshared data backref "
"parent %llu count %u\n",
- (unsigned long long)offset,
- btrfs_shared_data_ref_count(eb, sref));
+ offset, btrfs_shared_data_ref_count(eb, sref));
break;
default:
BUG();
@@ -148,18 +143,36 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
printk("\t\textent back ref root %llu gen %llu "
"owner %llu num_refs %lu\n",
- (unsigned long long)btrfs_ref_root_v0(eb, ref0),
- (unsigned long long)btrfs_ref_generation_v0(eb, ref0),
- (unsigned long long)btrfs_ref_objectid_v0(eb, ref0),
+ btrfs_ref_root_v0(eb, ref0),
+ btrfs_ref_generation_v0(eb, ref0),
+ btrfs_ref_objectid_v0(eb, ref0),
(unsigned long)btrfs_ref_count_v0(eb, ref0));
}
#endif
+static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
+ u32 item_size)
+{
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ pr_warn("BTRFS: uuid item with illegal size %lu!\n",
+ (unsigned long)item_size);
+ return;
+ }
+ while (item_size) {
+ __le64 subvol_id;
+
+ read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id));
+ printk(KERN_INFO "\t\tsubvol_id %llu\n",
+ (unsigned long long)le64_to_cpu(subvol_id));
+ item_size -= sizeof(u64);
+ offset += sizeof(u64);
+ }
+}
+
void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
{
int i;
- u32 type;
- u32 nr = btrfs_header_nritems(l);
+ u32 type, nr;
struct btrfs_item *item;
struct btrfs_root_item *ri;
struct btrfs_dir_item *di;
@@ -172,45 +185,46 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
struct btrfs_key key;
struct btrfs_key found_key;
- printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
- (unsigned long long)btrfs_header_bytenr(l), nr,
- btrfs_leaf_free_space(root, l));
+ if (!l)
+ return;
+
+ nr = btrfs_header_nritems(l);
+
+ btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
+ btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
for (i = 0 ; i < nr ; i++) {
- item = btrfs_item_nr(l, i);
+ item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
type = btrfs_key_type(&key);
- printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
+ printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
"itemsize %d\n",
- i,
- (unsigned long long)key.objectid, type,
- (unsigned long long)key.offset,
+ i, key.objectid, type, key.offset,
btrfs_item_offset(l, item), btrfs_item_size(l, item));
switch (type) {
case BTRFS_INODE_ITEM_KEY:
ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
printk(KERN_INFO "\t\tinode generation %llu size %llu "
"mode %o\n",
- (unsigned long long)
btrfs_inode_generation(l, ii),
- (unsigned long long)btrfs_inode_size(l, ii),
+ btrfs_inode_size(l, ii),
btrfs_inode_mode(l, ii));
break;
case BTRFS_DIR_ITEM_KEY:
di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(l, di, &found_key);
printk(KERN_INFO "\t\tdir oid %llu type %u\n",
- (unsigned long long)found_key.objectid,
+ found_key.objectid,
btrfs_dir_type(l, di));
break;
case BTRFS_ROOT_ITEM_KEY:
ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
- (unsigned long long)
btrfs_disk_root_bytenr(l, ri),
btrfs_disk_root_refs(l, ri));
break;
case BTRFS_EXTENT_ITEM_KEY:
- print_extent_item(l, i);
+ case BTRFS_METADATA_ITEM_KEY:
+ print_extent_item(l, i, type);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
printk(KERN_INFO "\t\ttree block backref\n");
@@ -236,22 +250,17 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
BTRFS_FILE_EXTENT_INLINE) {
printk(KERN_INFO "\t\tinline extent data "
"size %u\n",
- btrfs_file_extent_inline_len(l, fi));
+ btrfs_file_extent_inline_len(l, i, fi));
break;
}
printk(KERN_INFO "\t\textent data disk bytenr %llu "
"nr %llu\n",
- (unsigned long long)
btrfs_file_extent_disk_bytenr(l, fi),
- (unsigned long long)
btrfs_file_extent_disk_num_bytes(l, fi));
printk(KERN_INFO "\t\textent data offset %llu "
"nr %llu ram %llu\n",
- (unsigned long long)
btrfs_file_extent_offset(l, fi),
- (unsigned long long)
btrfs_file_extent_num_bytes(l, fi),
- (unsigned long long)
btrfs_file_extent_ram_bytes(l, fi));
break;
case BTRFS_EXTENT_REF_V0_KEY:
@@ -265,7 +274,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
bi = btrfs_item_ptr(l, i,
struct btrfs_block_group_item);
printk(KERN_INFO "\t\tblock group used %llu\n",
- (unsigned long long)
btrfs_disk_block_group_used(l, bi));
break;
case BTRFS_CHUNK_ITEM_KEY:
@@ -282,14 +290,22 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
"\t\tchunk objectid %llu chunk offset %llu "
"length %llu\n",
- (unsigned long long)
btrfs_dev_extent_chunk_tree(l, dev_extent),
- (unsigned long long)
btrfs_dev_extent_chunk_objectid(l, dev_extent),
- (unsigned long long)
btrfs_dev_extent_chunk_offset(l, dev_extent),
- (unsigned long long)
btrfs_dev_extent_length(l, dev_extent));
+ break;
+ case BTRFS_DEV_STATS_KEY:
+ printk(KERN_INFO "\t\tdevice stats\n");
+ break;
+ case BTRFS_DEV_REPLACE_KEY:
+ printk(KERN_INFO "\t\tdev replace\n");
+ break;
+ case BTRFS_UUID_KEY_SUBVOL:
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ print_uuid_item(l, btrfs_item_ptr_offset(l, i),
+ btrfs_item_size_nr(l, i));
+ break;
};
}
}
@@ -308,18 +324,14 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
btrfs_print_leaf(root, c);
return;
}
- printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
- (unsigned long long)btrfs_header_bytenr(c),
- level, nr,
- (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+ btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u",
+ btrfs_header_bytenr(c), level, nr,
+ (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
for (i = 0; i < nr; i++) {
btrfs_node_key_to_cpu(c, &key, i);
printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
- i,
- (unsigned long long)key.objectid,
- key.type,
- (unsigned long long)key.offset,
- (unsigned long long)btrfs_node_blockptr(c, i));
+ i, key.objectid, key.type, key.offset,
+ btrfs_node_blockptr(c, i));
}
for (i = 0; i < nr; i++) {
struct extent_buffer *next = read_tree_block(root,
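
print_uuid_item() above treats a UUID-tree item as a packed array of little-endian 64-bit subvolume ids, which is why it insists the item size is a multiple of sizeof(u64). The same walk over a raw byte buffer, as a standalone sketch; the buffer contents are made up for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void dump_uuid_item(const unsigned char *data, size_t item_size)
{
        uint64_t subvol_id;

        if (item_size % sizeof(uint64_t)) {
                fprintf(stderr, "uuid item with illegal size %zu\n", item_size);
                return;
        }
        for (size_t off = 0; off < item_size; off += sizeof(uint64_t)) {
                memcpy(&subvol_id, data + off, sizeof(subvol_id));
                /* assumes a little-endian host; the kernel uses le64_to_cpu() */
                printf("subvol_id %llu\n", (unsigned long long)subvol_id);
        }
}

int main(void)
{
        /* a 16-byte item holding subvolume ids 257 and 258 (little endian) */
        unsigned char item[16] = { 0x01, 0x01, 0, 0, 0, 0, 0, 0,
                                   0x02, 0x01, 0, 0, 0, 0, 0, 0 };

        dump_uuid_item(item, sizeof(item));
        return 0;
}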
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index da75efe534d..7faddfacc5b 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -19,5 +19,5 @@
#ifndef __PRINT_TREE_
#define __PRINT_TREE_
void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
-void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c);
#endif
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
new file mode 100644
index 00000000000..129b1dd2852
--- /dev/null
+++ b/fs/btrfs/props.c
@@ -0,0 +1,427 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/hashtable.h>
+#include "props.h"
+#include "btrfs_inode.h"
+#include "hash.h"
+#include "transaction.h"
+#include "xattr.h"
+
+#define BTRFS_PROP_HANDLERS_HT_BITS 8
+static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
+
+struct prop_handler {
+ struct hlist_node node;
+ const char *xattr_name;
+ int (*validate)(const char *value, size_t len);
+ int (*apply)(struct inode *inode, const char *value, size_t len);
+ const char *(*extract)(struct inode *inode);
+ int inheritable;
+};
+
+static int prop_compression_validate(const char *value, size_t len);
+static int prop_compression_apply(struct inode *inode,
+ const char *value,
+ size_t len);
+static const char *prop_compression_extract(struct inode *inode);
+
+static struct prop_handler prop_handlers[] = {
+ {
+ .xattr_name = XATTR_BTRFS_PREFIX "compression",
+ .validate = prop_compression_validate,
+ .apply = prop_compression_apply,
+ .extract = prop_compression_extract,
+ .inheritable = 1
+ },
+ {
+ .xattr_name = NULL
+ }
+};
+
+void __init btrfs_props_init(void)
+{
+ struct prop_handler *p;
+
+ hash_init(prop_handlers_ht);
+
+ for (p = &prop_handlers[0]; p->xattr_name; p++) {
+ u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
+
+ hash_add(prop_handlers_ht, &p->node, h);
+ }
+}
+
+static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash)
+{
+ struct hlist_head *h;
+
+ h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)];
+ if (hlist_empty(h))
+ return NULL;
+
+ return h;
+}
+
+static const struct prop_handler *
+find_prop_handler(const char *name,
+ const struct hlist_head *handlers)
+{
+ struct prop_handler *h;
+
+ if (!handlers) {
+ u64 hash = btrfs_name_hash(name, strlen(name));
+
+ handlers = find_prop_handlers_by_hash(hash);
+ if (!handlers)
+ return NULL;
+ }
+
+ hlist_for_each_entry(h, handlers, node)
+ if (!strcmp(h->xattr_name, name))
+ return h;
+
+ return NULL;
+}
+
+static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ const char *name,
+ const char *value,
+ size_t value_len,
+ int flags)
+{
+ const struct prop_handler *handler;
+ int ret;
+
+ if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN)
+ return -EINVAL;
+
+ handler = find_prop_handler(name, NULL);
+ if (!handler)
+ return -EINVAL;
+
+ if (value_len == 0) {
+ ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ NULL, 0, flags);
+ if (ret)
+ return ret;
+
+ ret = handler->apply(inode, NULL, 0);
+ ASSERT(ret == 0);
+
+ return ret;
+ }
+
+ ret = handler->validate(value, value_len);
+ if (ret)
+ return ret;
+ ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ value, value_len, flags);
+ if (ret)
+ return ret;
+ ret = handler->apply(inode, value, value_len);
+ if (ret) {
+ __btrfs_setxattr(trans, inode, handler->xattr_name,
+ NULL, 0, flags);
+ return ret;
+ }
+
+ set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+
+ return 0;
+}
+
+int btrfs_set_prop(struct inode *inode,
+ const char *name,
+ const char *value,
+ size_t value_len,
+ int flags)
+{
+ return __btrfs_set_prop(NULL, inode, name, value, value_len, flags);
+}
+
+static int iterate_object_props(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid,
+ void (*iterator)(void *,
+ const struct prop_handler *,
+ const char *,
+ size_t),
+ void *ctx)
+{
+ int ret;
+ char *name_buf = NULL;
+ char *value_buf = NULL;
+ int name_buf_len = 0;
+ int value_buf_len = 0;
+
+ while (1) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct extent_buffer *leaf;
+ u32 total_len, cur, this_len;
+ int slot;
+ const struct hlist_head *handlers;
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid)
+ break;
+ if (key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ handlers = find_prop_handlers_by_hash(key.offset);
+ if (!handlers)
+ goto next_slot;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ cur = 0;
+ total_len = btrfs_item_size_nr(leaf, slot);
+
+ while (cur < total_len) {
+ u32 name_len = btrfs_dir_name_len(leaf, di);
+ u32 data_len = btrfs_dir_data_len(leaf, di);
+ unsigned long name_ptr, data_ptr;
+ const struct prop_handler *handler;
+
+ this_len = sizeof(*di) + name_len + data_len;
+ name_ptr = (unsigned long)(di + 1);
+ data_ptr = name_ptr + name_len;
+
+ if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
+ memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
+ name_ptr,
+ XATTR_BTRFS_PREFIX_LEN))
+ goto next_dir_item;
+
+ if (name_len >= name_buf_len) {
+ kfree(name_buf);
+ name_buf_len = name_len + 1;
+ name_buf = kmalloc(name_buf_len, GFP_NOFS);
+ if (!name_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ read_extent_buffer(leaf, name_buf, name_ptr, name_len);
+ name_buf[name_len] = '\0';
+
+ handler = find_prop_handler(name_buf, handlers);
+ if (!handler)
+ goto next_dir_item;
+
+ if (data_len > value_buf_len) {
+ kfree(value_buf);
+ value_buf_len = data_len;
+ value_buf = kmalloc(data_len, GFP_NOFS);
+ if (!value_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ read_extent_buffer(leaf, value_buf, data_ptr, data_len);
+
+ iterator(ctx, handler, value_buf, data_len);
+next_dir_item:
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *) di + this_len);
+ }
+
+next_slot:
+ path->slots[0]++;
+ }
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ kfree(name_buf);
+ kfree(value_buf);
+
+ return ret;
+}
+
+static void inode_prop_iterator(void *ctx,
+ const struct prop_handler *handler,
+ const char *value,
+ size_t len)
+{
+ struct inode *inode = ctx;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ ret = handler->apply(inode, value, len);
+ if (unlikely(ret))
+ btrfs_warn(root->fs_info,
+ "error applying prop %s to ino %llu (root %llu): %d",
+ handler->xattr_name, btrfs_ino(inode),
+ root->root_key.objectid, ret);
+ else
+ set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+}
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
+
+ return ret;
+}
+
+static int inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *parent)
+{
+ const struct prop_handler *h;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (!test_bit(BTRFS_INODE_HAS_PROPS,
+ &BTRFS_I(parent)->runtime_flags))
+ return 0;
+
+ for (h = &prop_handlers[0]; h->xattr_name; h++) {
+ const char *value;
+ u64 num_bytes;
+
+ if (!h->inheritable)
+ continue;
+
+ value = h->extract(parent);
+ if (!value)
+ continue;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ ret = btrfs_block_rsv_add(root, trans->block_rsv,
+ num_bytes, BTRFS_RESERVE_NO_FLUSH);
+ if (ret)
+ goto out;
+ ret = __btrfs_set_prop(trans, inode, h->xattr_name,
+ value, strlen(value), 0);
+ btrfs_block_rsv_release(root, trans->block_rsv, num_bytes);
+ if (ret)
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *dir)
+{
+ if (!dir)
+ return 0;
+
+ return inherit_props(trans, inode, dir);
+}
+
+int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *parent_root)
+{
+ struct btrfs_key key;
+ struct inode *parent_inode, *child_inode;
+ int ret;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ parent_inode = btrfs_iget(parent_root->fs_info->sb, &key,
+ parent_root, NULL);
+ if (IS_ERR(parent_inode))
+ return PTR_ERR(parent_inode);
+
+ child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ if (IS_ERR(child_inode)) {
+ iput(parent_inode);
+ return PTR_ERR(child_inode);
+ }
+
+ ret = inherit_props(trans, child_inode, parent_inode);
+ iput(child_inode);
+ iput(parent_inode);
+
+ return ret;
+}
+
+static int prop_compression_validate(const char *value, size_t len)
+{
+ if (!strncmp("lzo", value, len))
+ return 0;
+ else if (!strncmp("zlib", value, len))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int prop_compression_apply(struct inode *inode,
+ const char *value,
+ size_t len)
+{
+ int type;
+
+ if (len == 0) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+
+ return 0;
+ }
+
+ if (!strncmp("lzo", value, len))
+ type = BTRFS_COMPRESS_LZO;
+ else if (!strncmp("zlib", value, len))
+ type = BTRFS_COMPRESS_ZLIB;
+ else
+ return -EINVAL;
+
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->force_compress = type;
+
+ return 0;
+}
+
+static const char *prop_compression_extract(struct inode *inode)
+{
+ switch (BTRFS_I(inode)->force_compress) {
+ case BTRFS_COMPRESS_ZLIB:
+ return "zlib";
+ case BTRFS_COMPRESS_LZO:
+ return "lzo";
+ }
+
+ return NULL;
+}
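
The property names added above are ordinary xattrs in the "btrfs." namespace (XATTR_BTRFS_PREFIX), so from user space the compression property that prop_compression_validate()/prop_compression_apply() handle can be set with setxattr(2). A minimal sketch; the path is made up and error handling is reduced to a perror().

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/mnt/btrfs/somefile";    /* hypothetical file on btrfs */
        const char *name = "btrfs.compression";      /* XATTR_BTRFS_PREFIX "compression" */
        const char *value = "lzo";                   /* "zlib" is the other accepted value */

        if (setxattr(path, name, value, strlen(value), 0) != 0) {
                perror("setxattr");
                return 1;
        }
        return 0;
}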
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
new file mode 100644
index 00000000000..100f18829d5
--- /dev/null
+++ b/fs/btrfs/props.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_PROPS_H
+#define __BTRFS_PROPS_H
+
+#include "ctree.h"
+
+void __init btrfs_props_init(void);
+
+int btrfs_set_prop(struct inode *inode,
+ const char *name,
+ const char *value,
+ size_t value_len,
+ int flags);
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *dir);
+
+int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *parent_root);
+
+#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
new file mode 100644
index 00000000000..98cb6b2630f
--- /dev/null
+++ b/fs/btrfs/qgroup.c
@@ -0,0 +1,2651 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/btrfs.h>
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "ulist.h"
+#include "backref.h"
+#include "extent_io.h"
+#include "qgroup.h"
+
+/* TODO XXX FIXME
+ * - subvol delete -> delete when ref goes to 0? delete limits also?
+ * - reorganize keys
+ * - compressed
+ * - sync
+ * - copy also limits on subvol creation
+ * - limit
+ * - caches for ulists
+ * - performance benchmarks
+ * - check all ioctl parameters
+ */
+
+/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+ u64 qgroupid;
+
+ /*
+ * state
+ */
+ u64 rfer; /* referenced */
+ u64 rfer_cmpr; /* referenced compressed */
+ u64 excl; /* exclusive */
+ u64 excl_cmpr; /* exclusive compressed */
+
+ /*
+ * limits
+ */
+ u64 lim_flags; /* which limits are set */
+ u64 max_rfer;
+ u64 max_excl;
+ u64 rsv_rfer;
+ u64 rsv_excl;
+
+ /*
+ * reservation tracking
+ */
+ u64 reserved;
+
+ /*
+ * lists
+ */
+ struct list_head groups; /* groups this group is member of */
+ struct list_head members; /* groups that are members of this group */
+ struct list_head dirty; /* dirty groups */
+ struct rb_node node; /* tree of qgroups */
+
+ /*
+ * temp variables for accounting operations
+ */
+ u64 old_refcnt;
+ u64 new_refcnt;
+};
+
+/*
+ * glue structure to represent the relations between qgroups.
+ */
+struct btrfs_qgroup_list {
+ struct list_head next_group;
+ struct list_head next_member;
+ struct btrfs_qgroup *group;
+ struct btrfs_qgroup *member;
+};
+
+#define ptr_to_u64(x) ((u64)(uintptr_t)x)
+#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
+
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags);
+static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+
+/* must be called with qgroup_ioctl_lock held */
+static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+ u64 qgroupid)
+{
+ struct rb_node *n = fs_info->qgroup_tree.rb_node;
+ struct btrfs_qgroup *qgroup;
+
+ while (n) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ if (qgroup->qgroupid < qgroupid)
+ n = n->rb_left;
+ else if (qgroup->qgroupid > qgroupid)
+ n = n->rb_right;
+ else
+ return qgroup;
+ }
+ return NULL;
+}
+
+/* must be called with qgroup_lock held */
+static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+ u64 qgroupid)
+{
+ struct rb_node **p = &fs_info->qgroup_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_qgroup *qgroup;
+
+ while (*p) {
+ parent = *p;
+ qgroup = rb_entry(parent, struct btrfs_qgroup, node);
+
+ if (qgroup->qgroupid < qgroupid)
+ p = &(*p)->rb_left;
+ else if (qgroup->qgroupid > qgroupid)
+ p = &(*p)->rb_right;
+ else
+ return qgroup;
+ }
+
+ qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
+ if (!qgroup)
+ return ERR_PTR(-ENOMEM);
+
+ qgroup->qgroupid = qgroupid;
+ INIT_LIST_HEAD(&qgroup->groups);
+ INIT_LIST_HEAD(&qgroup->members);
+ INIT_LIST_HEAD(&qgroup->dirty);
+
+ rb_link_node(&qgroup->node, parent, p);
+ rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
+
+ return qgroup;
+}
+
+static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
+{
+ struct btrfs_qgroup_list *list;
+
+ list_del(&qgroup->dirty);
+ while (!list_empty(&qgroup->groups)) {
+ list = list_first_entry(&qgroup->groups,
+ struct btrfs_qgroup_list, next_group);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+
+ while (!list_empty(&qgroup->members)) {
+ list = list_first_entry(&qgroup->members,
+ struct btrfs_qgroup_list, next_member);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+ kfree(qgroup);
+}
+
+/* must be called with qgroup_lock held */
+static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+ struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
+
+ if (!qgroup)
+ return -ENOENT;
+
+ rb_erase(&qgroup->node, &fs_info->qgroup_tree);
+ __del_qgroup_rb(qgroup);
+ return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+ u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+ if (!member || !parent)
+ return -ENOENT;
+
+ list = kzalloc(sizeof(*list), GFP_ATOMIC);
+ if (!list)
+ return -ENOMEM;
+
+ list->group = parent;
+ list->member = member;
+ list_add_tail(&list->next_group, &member->groups);
+ list_add_tail(&list->next_member, &parent->members);
+
+ return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int del_relation_rb(struct btrfs_fs_info *fs_info,
+ u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+ if (!member || !parent)
+ return -ENOENT;
+
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent) {
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl)
+{
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup)
+ return -EINVAL;
+ if (qgroup->rfer != rfer || qgroup->excl != excl)
+ return -EINVAL;
+ return 0;
+}
+#endif
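
struct btrfs_qgroup_list above is a small glue object representing one membership edge: it is threaded onto the member's groups list and onto the parent's members list at the same time, which is what add_relation_rb() and del_relation_rb() above maintain. A standalone sketch of that two-list glue idea, with a hand-rolled singly linked list instead of the kernel's list_head; all names in the sketch are illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct link { struct link *next; };

struct group {
        const char *name;
        struct link parents;    /* glue nodes: groups this group belongs to */
        struct link members;    /* glue nodes: groups that belong to this group */
};

struct relation {
        struct group *parent, *member;
        struct link on_member;  /* threaded onto member->parents */
        struct link on_parent;  /* threaded onto parent->members */
};

static void push(struct link *head, struct link *n)
{
        n->next = head->next;
        head->next = n;
}

static void relate(struct group *parent, struct group *member)
{
        struct relation *r = calloc(1, sizeof(*r));

        r->parent = parent;
        r->member = member;
        push(&member->parents, &r->on_member);   /* one allocation ... */
        push(&parent->members, &r->on_parent);   /* ... on two lists at once */
}

int main(void)
{
        struct group parent = { "parent", { NULL }, { NULL } };
        struct group child  = { "child",  { NULL }, { NULL } };

        relate(&parent, &child);

        /* walk the child's parents through the glue nodes, the same way the
         * qgroup code iterates member->groups via the next_group links */
        for (struct link *n = child.parents.next; n; n = n->next) {
                struct relation *r = (struct relation *)
                        ((char *)n - offsetof(struct relation, on_member));
                printf("%s is a member of %s\n", r->member->name, r->parent->name);
        }
        return 0;
}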
+
+/*
+ * The full config is read in one go; it is only called from open_ctree().
+ * It doesn't use any locking, as at this point we're still single-threaded
+ */
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *l;
+ int slot;
+ int ret = 0;
+ u64 flags = 0;
+ u64 rescan_progress = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* default this to quota off, in case no status key is found */
+ fs_info->qgroup_flags = 0;
+
+ /*
+ * pass 1: read status, all qgroup infos and limits
+ */
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+ ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
+ if (ret)
+ goto out;
+
+ while (1) {
+ struct btrfs_qgroup *qgroup;
+
+ slot = path->slots[0];
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
+ struct btrfs_qgroup_status_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_status_item);
+
+ if (btrfs_qgroup_status_version(l, ptr) !=
+ BTRFS_QGROUP_STATUS_VERSION) {
+ btrfs_err(fs_info,
+ "old qgroup version, quota disabled");
+ goto out;
+ }
+ if (btrfs_qgroup_status_generation(l, ptr) !=
+ fs_info->generation) {
+ flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_err(fs_info,
+ "qgroup generation mismatch, "
+ "marked as inconsistent");
+ }
+ fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
+ ptr);
+ rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
+ goto next1;
+ }
+
+ if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
+ found_key.type != BTRFS_QGROUP_LIMIT_KEY)
+ goto next1;
+
+ qgroup = find_qgroup_rb(fs_info, found_key.offset);
+ if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
+ (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
+ btrfs_err(fs_info, "inconsitent qgroup config");
+ flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ if (!qgroup) {
+ qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out;
+ }
+ }
+ switch (found_key.type) {
+ case BTRFS_QGROUP_INFO_KEY: {
+ struct btrfs_qgroup_info_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_info_item);
+ qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
+ qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
+ qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
+ qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
+ /* generation currently unused */
+ break;
+ }
+ case BTRFS_QGROUP_LIMIT_KEY: {
+ struct btrfs_qgroup_limit_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_limit_item);
+ qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
+ qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
+ qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
+ qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
+ qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
+ break;
+ }
+ }
+next1:
+ ret = btrfs_next_item(quota_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * pass 2: read all qgroup relations
+ */
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
+ if (ret)
+ goto out;
+ while (1) {
+ slot = path->slots[0];
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
+ goto next2;
+
+ if (found_key.objectid > found_key.offset) {
+ /* parent <- member, not needed to build config */
+ /* FIXME should we omit the key completely? */
+ goto next2;
+ }
+
+ ret = add_relation_rb(fs_info, found_key.objectid,
+ found_key.offset);
+ if (ret == -ENOENT) {
+ btrfs_warn(fs_info,
+ "orphan qgroup relation 0x%llx->0x%llx",
+ found_key.objectid, found_key.offset);
+ ret = 0; /* ignore the error */
+ }
+ if (ret)
+ goto out;
+next2:
+ ret = btrfs_next_item(quota_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ }
+out:
+ fs_info->qgroup_flags |= flags;
+ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+ ret >= 0) {
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+ }
+ btrfs_free_path(path);
+
+ if (ret < 0) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ }
+
+ return ret < 0 ? ret : 0;
+}
+
+/*
+ * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
+ * the first two are in single-threaded paths. And for the third one, we have set
+ * quota_root to be null with qgroup_lock held before, so it is safe to clean
+ * up the in-memory structures without qgroup_lock held.
+ */
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+
+ while ((n = rb_first(&fs_info->qgroup_tree))) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ rb_erase(n, &fs_info->qgroup_tree);
+ __del_qgroup_rb(qgroup);
+ }
+ /*
+ * we call btrfs_free_qgroup_config() when unmounting the
+ * filesystem and when disabling quota, so we set qgroup_ulist
+ * to NULL here to avoid a double free.
+ */
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+}
+
+static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root,
+ u64 src, u64 dst)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = src;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = dst;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root,
+ u64 src, u64 dst)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = src;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = dst;
+
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int add_qgroup_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root, u64 qgroupid)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_qgroup_info_item *qgroup_info;
+ struct btrfs_qgroup_limit_item *qgroup_limit;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
+ return 0;
+#endif
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroupid;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*qgroup_info));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_info_item);
+ btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
+ btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*qgroup_limit));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_limit_item);
+ btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int del_qgroup_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root, u64 qgroupid)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroupid;
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 qgroupid,
+ u64 flags, u64 max_rfer, u64 max_excl,
+ u64 rsv_rfer, u64 rsv_excl)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_limit_item *qgroup_limit;
+ int ret;
+ int slot;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ key.offset = qgroupid;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
+ btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
+ btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
+ btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
+ btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
+ btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_qgroup *qgroup)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_info_item *qgroup_info;
+ int ret;
+ int slot;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroup->qgroupid;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
+ btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
+ btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
+ btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
+ btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
+ btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_status_item *ptr;
+ int ret;
+ int slot;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_STATUS_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
+ btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
+ btrfs_set_qgroup_status_rescan(l, ptr,
+ fs_info->qgroup_rescan_progress.objectid);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called with qgroup_lock held
+ */
+static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf = NULL;
+ int ret;
+ int nr = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ key.objectid = 0;
+ key.offset = 0;
+ key.type = 0;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ leaf = path->nodes[0];
+ nr = btrfs_header_nritems(leaf);
+ if (!nr)
+ break;
+		/*
+		 * Delete the leaves one by one since the whole
+		 * tree is going to be deleted.
+		 */
+ path->slots[0] = 0;
+ ret = btrfs_del_items(trans, root, path, 0, nr);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ }
+ ret = 0;
+out:
+ root->fs_info->pending_quota_state = 0;
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_path *path = NULL;
+ struct btrfs_qgroup_status_item *ptr;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_qgroup *qgroup = NULL;
+ int ret = 0;
+ int slot;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (fs_info->quota_root) {
+ fs_info->pending_quota_state = 1;
+ goto out;
+ }
+
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * initially create the quota tree
+ */
+ quota_root = btrfs_create_tree(trans, fs_info,
+ BTRFS_QUOTA_TREE_OBJECTID);
+ if (IS_ERR(quota_root)) {
+ ret = PTR_ERR(quota_root);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out_free_root;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_STATUS_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*ptr));
+ if (ret)
+ goto out_free_path;
+
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_status_item);
+ btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
+ btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
+ fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ key.objectid = 0;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = 0;
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
+ if (ret > 0)
+ goto out_add_root;
+ if (ret < 0)
+ goto out_free_path;
+
+ while (1) {
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.type == BTRFS_ROOT_REF_KEY) {
+ ret = add_qgroup_item(trans, quota_root,
+ found_key.offset);
+ if (ret)
+ goto out_free_path;
+
+ qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out_free_path;
+ }
+ }
+ ret = btrfs_next_item(tree_root, path);
+ if (ret < 0)
+ goto out_free_path;
+ if (ret)
+ break;
+ }
+
+out_add_root:
+ btrfs_release_path(path);
+ ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
+ if (ret)
+ goto out_free_path;
+
+ qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out_free_path;
+ }
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->quota_root = quota_root;
+ fs_info->pending_quota_state = 1;
+ spin_unlock(&fs_info->qgroup_lock);
+out_free_path:
+ btrfs_free_path(path);
+out_free_root:
+ if (ret) {
+ free_extent_buffer(quota_root->node);
+ free_extent_buffer(quota_root->commit_root);
+ kfree(quota_root);
+ }
+out:
+ if (ret) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ }
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *quota_root;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root)
+ goto out;
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ quota_root = fs_info->quota_root;
+ fs_info->quota_root = NULL;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ btrfs_free_qgroup_config(fs_info);
+
+ ret = btrfs_clean_quota_tree(trans, quota_root);
+ if (ret)
+ goto out;
+
+ ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
+ if (ret)
+ goto out;
+
+ list_del(&quota_root->dirty_list);
+
+ btrfs_tree_lock(quota_root->node);
+ clean_tree_block(trans, tree_root, quota_root->node);
+ btrfs_tree_unlock(quota_root->node);
+ btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
+
+ free_extent_buffer(quota_root->node);
+ free_extent_buffer(quota_root->commit_root);
+ kfree(quota_root);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ if (list_empty(&qgroup->dirty))
+ list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
+}
+
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup_list *list;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+ member = find_qgroup_rb(fs_info, src);
+ parent = find_qgroup_rb(fs_info, dst);
+ if (!member || !parent) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+	/* first check whether such a qgroup relation already exists */
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+
+ ret = add_qgroup_relation_item(trans, quota_root, src, dst);
+ if (ret)
+ goto out;
+
+ ret = add_qgroup_relation_item(trans, quota_root, dst, src);
+ if (ret) {
+ del_qgroup_relation_item(trans, quota_root, src, dst);
+ goto out;
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+ ret = add_relation_rb(quota_root->fs_info, src, dst);
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup_list *list;
+ int ret = 0;
+ int err;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ member = find_qgroup_rb(fs_info, src);
+ parent = find_qgroup_rb(fs_info, dst);
+ if (!member || !parent) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+	/* first check whether such a qgroup relation already exists */
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent)
+ goto exist;
+ }
+ ret = -ENOENT;
+ goto out;
+exist:
+ ret = del_qgroup_relation_item(trans, quota_root, src, dst);
+ err = del_qgroup_relation_item(trans, quota_root, dst, src);
+ if (err && !ret)
+ ret = err;
+
+ spin_lock(&fs_info->qgroup_lock);
+ del_relation_rb(fs_info, src, dst);
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (qgroup) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = add_qgroup_item(trans, quota_root, qgroupid);
+ if (ret)
+ goto out;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = add_qgroup_rb(fs_info, qgroupid);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ if (IS_ERR(qgroup))
+ ret = PTR_ERR(qgroup);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ } else {
+ /* check if there are no relations to this qgroup */
+ if (!list_empty(&qgroup->groups) ||
+ !list_empty(&qgroup->members)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+ ret = del_qgroup_item(trans, quota_root, qgroupid);
+
+ spin_lock(&fs_info->qgroup_lock);
+ del_qgroup_rb(quota_root->fs_info, qgroupid);
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
+ limit->flags, limit->max_rfer,
+ limit->max_excl, limit->rsv_rfer,
+ limit->rsv_excl);
+ if (ret) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_info(fs_info, "unable to update quota limit for %llu",
+ qgroupid);
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup->lim_flags = limit->flags;
+ qgroup->max_rfer = limit->max_rfer;
+ qgroup->max_excl = limit->max_excl;
+ qgroup->rsv_rfer = limit->rsv_rfer;
+ qgroup->rsv_excl = limit->rsv_excl;
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+static int comp_oper(struct btrfs_qgroup_operation *oper1,
+ struct btrfs_qgroup_operation *oper2)
+{
+ if (oper1->bytenr < oper2->bytenr)
+ return -1;
+ if (oper1->bytenr > oper2->bytenr)
+ return 1;
+ if (oper1->seq < oper2->seq)
+ return -1;
+ if (oper1->seq > oper2->seq)
+		return 1;
+ if (oper1->ref_root < oper2->ref_root)
+ return -1;
+ if (oper1->ref_root > oper2->ref_root)
+ return 1;
+ if (oper1->type < oper2->type)
+ return -1;
+ if (oper1->type > oper2->type)
+ return 1;
+ return 0;
+}
+
+static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct btrfs_qgroup_operation *cur;
+ int cmp;
+
+ spin_lock(&fs_info->qgroup_op_lock);
+ p = &fs_info->qgroup_op_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
+ cmp = comp_oper(cur, oper);
+ if (cmp < 0) {
+ p = &(*p)->rb_right;
+ } else if (cmp) {
+ p = &(*p)->rb_left;
+ } else {
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&oper->n, parent, p);
+ rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return 0;
+}
+
+/*
+ * Record a quota operation for processing later on.
+ * @trans: the transaction we are adding the delayed op to.
+ * @fs_info: the fs_info for this fs.
+ * @ref_root: the root of the reference we are acting on.
+ * @bytenr: the bytenr we are acting on.
+ * @num_bytes: the number of bytes in the reference.
+ * @type: the type of operation this is.
+ * @mod_seq: whether we need to get a sequence number for looking up roots.
+ *
+ * We just add it to our trans qgroup_ref_list and carry on and process these
+ * operations in order at some later point. If the reference root isn't a fs
+ * root then we don't bother with doing anything.
+ *
+ * MUST BE HOLDING THE REF LOCK.
+ */
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type, int mod_seq)
+{
+ struct btrfs_qgroup_operation *oper;
+ int ret;
+
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ return 0;
+
+ oper = kmalloc(sizeof(*oper), GFP_NOFS);
+ if (!oper)
+ return -ENOMEM;
+
+ oper->ref_root = ref_root;
+ oper->bytenr = bytenr;
+ oper->num_bytes = num_bytes;
+ oper->type = type;
+ oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
+ INIT_LIST_HEAD(&oper->elem.list);
+ oper->elem.seq = 0;
+ ret = insert_qgroup_oper(fs_info, oper);
+ if (ret) {
+ /* Shouldn't happen so have an assert for developers */
+ ASSERT(0);
+ kfree(oper);
+ return ret;
+ }
+ list_add_tail(&oper->list, &trans->qgroup_ref_list);
+
+ if (mod_seq)
+ btrfs_get_tree_mod_seq(fs_info, &oper->elem);
+
+ return 0;
+}
+
+/*
+ * The easy accounting, if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their reference and
+ * exclusive counts adjusted.
+ */
+static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct btrfs_qgroup *qgroup;
+ struct ulist *tmp;
+ struct btrfs_qgroup_list *glist;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int sign = 0;
+ int ret = 0;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ spin_lock(&fs_info->qgroup_lock);
+ if (!fs_info->quota_root)
+ goto out;
+ qgroup = find_qgroup_rb(fs_info, oper->ref_root);
+ if (!qgroup)
+ goto out;
+ switch (oper->type) {
+ case BTRFS_QGROUP_OPER_ADD_EXCL:
+ sign = 1;
+ break;
+ case BTRFS_QGROUP_OPER_SUB_EXCL:
+ sign = -1;
+ break;
+ default:
+ ASSERT(0);
+ }
+ qgroup->rfer += sign * oper->num_bytes;
+ qgroup->rfer_cmpr += sign * oper->num_bytes;
+
+ WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
+ qgroup->excl += sign * oper->num_bytes;
+ qgroup->excl_cmpr += sign * oper->num_bytes;
+
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Get all of the parent groups that contain this qgroup */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+
+ /* Iterate all of the parents and adjust their reference counts */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ qgroup = u64_to_ptr(unode->aux);
+ qgroup->rfer += sign * oper->num_bytes;
+ qgroup->rfer_cmpr += sign * oper->num_bytes;
+ qgroup->excl += sign * oper->num_bytes;
+ if (sign < 0)
+ WARN_ON(qgroup->excl < oper->num_bytes);
+ qgroup->excl_cmpr += sign * oper->num_bytes;
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(tmp);
+ return ret;
+}
+
+/*
+ * Walk all of the roots that pointed to our bytenr and adjust their refcnts
+ * properly.
+ */
+static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, struct ulist *tmp,
+ struct ulist *roots, struct ulist *qgroups,
+ u64 seq, int *old_roots, int rescan)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct ulist_node *tmp_unode;
+ struct ulist_iterator tmp_uiter;
+ struct btrfs_qgroup *qg;
+ int ret;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ /* We don't count our current root here */
+ if (unode->val == root_to_skip)
+ continue;
+ qg = find_qgroup_rb(fs_info, unode->val);
+ if (!qg)
+ continue;
+ /*
+ * We could have a pending removal of this same ref so we may
+ * not have actually found our ref root when doing
+ * btrfs_find_all_roots, so we need to keep track of how many
+ * old roots we find in case we removed ours and added a
+ * different one at the same time. I don't think this could
+ * happen in practice but that sort of thinking leads to pain
+ * and suffering and to the dark side.
+ */
+ (*old_roots)++;
+
+ ulist_reinit(tmp);
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ULIST_ITER_INIT(&tmp_uiter);
+ while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(tmp_unode->aux);
+ /*
+ * We use this sequence number to keep from having to
+ * run the whole list and 0 out the refcnt every time.
+			 * We basically use sequence as the known 0 count and
+			 * then add 1 every time we see a qgroup. This is how we
+ * get how many of the roots actually point up to the
+ * upper level qgroups in order to determine exclusive
+ * counts.
+ *
+ * For rescan we want to set old_refcnt to seq so our
+ * exclusive calculations end up correct.
+ */
+ if (rescan)
+ qg->old_refcnt = seq;
+ else if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
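+
+/*
+ * A worked example of the sequence trick above: suppose seq is 10 and three
+ * roots point at an extent.  Any qgroup whose refcnt is at or below 10
+ * effectively has a count of 0.  The first root that reaches a qgroup sets
+ * its refcnt to seq + 1 == 11 (count 1), the next two bump it to 12 and 13
+ * (counts 2 and 3).  After the extent is processed, fs_info->qgroup_seq is
+ * advanced past 13, so the next accounting run starts from a logical 0 again
+ * without having to walk every qgroup and reset its counters.
+ */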
+
+/*
+ * We need to walk forward in our operation tree and account for any roots that
+ * were deleted after we made this operation.
+ */
+static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct ulist *tmp,
+ struct ulist *qgroups, u64 seq,
+ int *old_roots)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_operation *tmp_oper;
+ struct rb_node *n;
+ int ret;
+
+ ulist_reinit(tmp);
+
+ /*
+ * We only walk forward in the tree since we're only interested in
+ * removals that happened _after_ our operation.
+ */
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ return 0;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ while (tmp_oper->bytenr == oper->bytenr) {
+ /*
+ * If it's not a removal we don't care, additions work out
+ * properly with our refcnt tracking.
+ */
+ if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
+ tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
+ goto next;
+ qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
+ if (!qg)
+ goto next;
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret) {
+ if (ret < 0)
+ return ret;
+ /*
+ * We only want to increase old_roots if this qgroup is
+ * not already in the list of qgroups. If it is already
+ * there then that means it must have been re-added or
+ * the delete will be discarded because we had an
+ * existing ref that we haven't looked up yet. In this
+ * case we don't want to increase old_roots. So if ret
+ * == 1 then we know that this is the first time we've
+ * seen this qgroup and we can bump the old_roots.
+ */
+ (*old_roots)++;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+next:
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&tmp_oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ break;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ }
+
+ /* Ok now process the qgroups we found */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/* Add refcnt for the newly added reference. */
+static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct btrfs_qgroup *qgroup,
+ struct ulist *tmp, struct ulist *qgroups,
+ u64 seq)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ int ret;
+
+ ulist_reinit(tmp);
+ ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ } else {
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ }
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * This adjusts the counters for all referenced qgroups if need be.
+ */
+static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, u64 num_bytes,
+ struct ulist *qgroups, u64 seq,
+ int old_roots, int new_roots, int rescan)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ u64 cur_new_count, cur_old_count;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(qgroups, &uiter))) {
+ bool dirty = false;
+
+ qg = u64_to_ptr(unode->aux);
+ /*
+ * Wasn't referenced before but is now, add to the reference
+ * counters.
+ */
+ if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+ qg->rfer += num_bytes;
+ qg->rfer_cmpr += num_bytes;
+ dirty = true;
+ }
+
+ /*
+ * Was referenced before but isn't now, subtract from the
+ * reference counters.
+ */
+ if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+ qg->rfer -= num_bytes;
+ qg->rfer_cmpr -= num_bytes;
+ dirty = true;
+ }
+
+ if (qg->old_refcnt < seq)
+ cur_old_count = 0;
+ else
+ cur_old_count = qg->old_refcnt - seq;
+ if (qg->new_refcnt < seq)
+ cur_new_count = 0;
+ else
+ cur_new_count = qg->new_refcnt - seq;
+
+ /*
+		 * If our refcount was the same as the number of roots
+		 * previously but our new count isn't the same as the number of
+		 * roots now, then we went from having an exclusive reference
+		 * on this range to not.
+ */
+ if (old_roots && cur_old_count == old_roots &&
+ (cur_new_count != new_roots || new_roots == 0)) {
+ WARN_ON(cur_new_count != new_roots && new_roots == 0);
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+
+ /*
+ * If we didn't reference all the roots before but now we do we
+ * have an exclusive reference to this range.
+ */
+ if ((!old_roots || (old_roots && cur_old_count != old_roots))
+ && cur_new_count == new_roots) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
+
+ if (dirty)
+ qgroup_dirty(fs_info, qg);
+ }
+ return 0;
+}
+
+/*
+ * If we removed a data extent and there were other references for that bytenr
+ * then we need to lookup all referenced roots to make sure we still don't
+ * reference this bytenr. If we do then we can just discard this operation.
+ */
+static int check_existing_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret = 0;
+
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+ oper->elem.seq, &roots);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ if (unode->val == oper->ref_root) {
+ ret = 1;
+ break;
+ }
+ }
+ ulist_free(roots);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+
+ return ret;
+}
+
+/*
+ * If we share a reference across multiple roots then we may need to adjust
+ * various qgroups' referenced and exclusive counters. The basic premise is this:
+ *
+ * 1) We have seq to represent a 0 count. Instead of looping through all of the
+ * qgroups and resetting their refcount to 0 we just constantly bump this
+ * sequence number to act as the base reference count. This means that if
+ * anybody is equal to or below this sequence they were never referenced. We
+ * jack this sequence up by the number of roots we found each time in order to
+ * make sure we don't have any overlap.
+ *
+ * 2) We first search all the roots that reference the area _except_ the root
+ * we're acting on currently. This makes up the old_refcnt of all the qgroups
+ * before.
+ *
+ * 3) We walk all of the qgroups referenced by the root we are currently acting
+ * on, and will either adjust old_refcnt in the case of a removal or the
+ * new_refcnt in the case of an addition.
+ *
+ * 4) Finally we walk all the qgroups that are referenced by this range
+ * including the root we are acting on currently. We will adjust the counters
+ * based on the number of roots we had and will have after this operation.
+ *
+ * Take this example as an illustration
+ *
+ * [qgroup 1/0]
+ * / | \
+ * [qg 0/0] [qg 0/1] [qg 0/2]
+ * \ | /
+ * [ extent ]
+ *
+ * Say we are adding a reference that is covered by qg 0/0. The first step
+ * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
+ * old_roots being 2. Because it is adding new_roots will be 1. We then go
+ * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
+ * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
+ * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
+ * reference and thus must add the size to the referenced bytes. Everything
+ * else is the same so nothing else changes.
+ */
+static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist *qgroups, *tmp;
+ struct btrfs_qgroup *qgroup;
+ struct seq_list elem = {};
+ u64 seq;
+ int old_roots = 0;
+ int new_roots = 0;
+ int ret = 0;
+
+ if (oper->elem.seq) {
+ ret = check_existing_refs(trans, fs_info, oper);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ return -ENOMEM;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp) {
+ ulist_free(qgroups);
+ return -ENOMEM;
+ }
+
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
+ &roots);
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ if (ret < 0) {
+ ulist_free(qgroups);
+ ulist_free(tmp);
+ return ret;
+ }
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, oper->ref_root);
+ if (!qgroup)
+ goto out;
+ seq = fs_info->qgroup_seq;
+
+ /*
+ * So roots is the list of all the roots currently pointing at the
+ * bytenr, including the ref we are adding if we are adding, or not if
+ * we are removing a ref. So we pass in the ref_root to skip that root
+	 * in our calculations. We set old_refcnt and new_refcnt because who the
+	 * hell knows what everything looked like before, and it doesn't matter
+ * except...
+ */
+ ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
+ seq, &old_roots, 0);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Now adjust the refcounts of the qgroups that care about this
+ * reference, either the old_count in the case of removal or new_count
+ * in the case of an addition.
+ */
+ ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
+ seq);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * ...in the case of removals. If we had a removal before we got around
+ * to processing this operation then we need to find that guy and count
+ * his references as if they really existed so we don't end up screwing
+ * up the exclusive counts. Then whenever we go to process the delete
+ * everything will be grand and we can account for whatever exclusive
+ * changes need to be made there. We also have to pass in old_roots so
+ * we have an accurate count of the roots as it pertains to this
+ * operations view of the world.
+ */
+ ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
+ &old_roots);
+ if (ret < 0)
+ goto out;
+
+ /*
+	 * If we are adding our root we need to adjust up the number of roots,
+	 * otherwise old_roots is the number of roots we want.
+ */
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ new_roots = old_roots + 1;
+ } else {
+ new_roots = old_roots;
+ old_roots++;
+ }
+ fs_info->qgroup_seq += old_roots + 1;
+
+ /*
+ * And now the magic happens, bless Arne for having a pretty elegant
+ * solution for this.
+ */
+ qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
+ qgroups, seq, old_roots, new_roots, 0);
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(qgroups);
+ ulist_free(roots);
+ ulist_free(tmp);
+ return ret;
+}
+
+/*
+ * btrfs_qgroup_account is called for every ref that is added to or deleted
+ * from the fs. First, all roots referencing the extent are searched, and
+ * then the space is accounted accordingly among the different roots. The
+ * accounting algorithm works in 3 steps documented inline.
+ */
+static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ int ret = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ BUG_ON(!fs_info->quota_root);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ ASSERT(is_fstree(oper->ref_root));
+
+ switch (oper->type) {
+ case BTRFS_QGROUP_OPER_ADD_EXCL:
+ case BTRFS_QGROUP_OPER_SUB_EXCL:
+ ret = qgroup_excl_accounting(fs_info, oper);
+ break;
+ case BTRFS_QGROUP_OPER_ADD_SHARED:
+ case BTRFS_QGROUP_OPER_SUB_SHARED:
+ ret = qgroup_shared_accounting(trans, fs_info, oper);
+ break;
+ default:
+ ASSERT(0);
+ }
+ return ret;
+}
+
+/*
+ * Needs to be called every time we run delayed refs, even if there is an
+ * error, in order to clean up outstanding operations.
+ */
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup_operation *oper;
+ int ret = 0;
+
+ while (!list_empty(&trans->qgroup_ref_list)) {
+ oper = list_first_entry(&trans->qgroup_ref_list,
+ struct btrfs_qgroup_operation, list);
+ list_del_init(&oper->list);
+ if (!ret || !trans->aborted)
+ ret = btrfs_qgroup_account(trans, fs_info, oper);
+ spin_lock(&fs_info->qgroup_op_lock);
+ rb_erase(&oper->n, &fs_info->qgroup_op_tree);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+ kfree(oper);
+ }
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed qgroups to disk.
+ */
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ int ret = 0;
+ int start_rescan_worker = 0;
+
+ if (!quota_root)
+ goto out;
+
+ if (!fs_info->quota_enabled && fs_info->pending_quota_state)
+ start_rescan_worker = 1;
+
+ fs_info->quota_enabled = fs_info->pending_quota_state;
+
+ spin_lock(&fs_info->qgroup_lock);
+ while (!list_empty(&fs_info->dirty_qgroups)) {
+ struct btrfs_qgroup *qgroup;
+ qgroup = list_first_entry(&fs_info->dirty_qgroups,
+ struct btrfs_qgroup, dirty);
+ list_del_init(&qgroup->dirty);
+ spin_unlock(&fs_info->qgroup_lock);
+ ret = update_qgroup_info_item(trans, quota_root, qgroup);
+ if (ret)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ spin_lock(&fs_info->qgroup_lock);
+ }
+ if (fs_info->quota_enabled)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
+ else
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ ret = update_qgroup_status_item(trans, fs_info, quota_root);
+ if (ret)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+
+ if (!ret && start_rescan_worker) {
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (!ret) {
+ qgroup_rescan_zero_tracking(fs_info);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ }
+ ret = 0;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * copy the accounting information between qgroups. This is necessary when a
+ * snapshot or a subvolume is created.
+ */
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ int ret = 0;
+ int i;
+ u64 *i_qgroups;
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_qgroup *srcgroup;
+ struct btrfs_qgroup *dstgroup;
+ u32 level_size = 0;
+ u64 nums;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_enabled)
+ goto out;
+
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (inherit) {
+ i_qgroups = (u64 *)(inherit + 1);
+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+ 2 * inherit->num_excl_copies;
+ for (i = 0; i < nums; ++i) {
+ srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
+ if (!srcgroup) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ++i_qgroups;
+ }
+ }
+
+ /*
+ * create a tracking group for the subvol itself
+ */
+ ret = add_qgroup_item(trans, quota_root, objectid);
+ if (ret)
+ goto out;
+
+ if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+ ret = update_qgroup_limit_item(trans, quota_root, objectid,
+ inherit->lim.flags,
+ inherit->lim.max_rfer,
+ inherit->lim.max_excl,
+ inherit->lim.rsv_rfer,
+ inherit->lim.rsv_excl);
+ if (ret)
+ goto out;
+ }
+
+ if (srcid) {
+ struct btrfs_root *srcroot;
+ struct btrfs_key srckey;
+ int srcroot_level;
+
+ srckey.objectid = srcid;
+ srckey.type = BTRFS_ROOT_ITEM_KEY;
+ srckey.offset = (u64)-1;
+ srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
+ if (IS_ERR(srcroot)) {
+ ret = PTR_ERR(srcroot);
+ goto out;
+ }
+
+ rcu_read_lock();
+ srcroot_level = btrfs_header_level(srcroot->node);
+ level_size = btrfs_level_size(srcroot, srcroot_level);
+ rcu_read_unlock();
+ }
+
+ /*
+ * add qgroup to all inherited groups
+ */
+ if (inherit) {
+ i_qgroups = (u64 *)(inherit + 1);
+ for (i = 0; i < inherit->num_qgroups; ++i) {
+ ret = add_qgroup_relation_item(trans, quota_root,
+ objectid, *i_qgroups);
+ if (ret)
+ goto out;
+ ret = add_qgroup_relation_item(trans, quota_root,
+ *i_qgroups, objectid);
+ if (ret)
+ goto out;
+ ++i_qgroups;
+ }
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+
+ dstgroup = add_qgroup_rb(fs_info, objectid);
+ if (IS_ERR(dstgroup)) {
+ ret = PTR_ERR(dstgroup);
+ goto unlock;
+ }
+
+ if (srcid) {
+ srcgroup = find_qgroup_rb(fs_info, srcid);
+ if (!srcgroup)
+ goto unlock;
+
+ /*
+ * We call inherit after we clone the root in order to make sure
+ * our counts don't go crazy, so at this point the only
+ * difference between the two roots should be the root node.
+ */
+ dstgroup->rfer = srcgroup->rfer;
+ dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
+ dstgroup->excl = level_size;
+ dstgroup->excl_cmpr = level_size;
+ srcgroup->excl = level_size;
+ srcgroup->excl_cmpr = level_size;
+ qgroup_dirty(fs_info, dstgroup);
+ qgroup_dirty(fs_info, srcgroup);
+ }
+
+ if (!inherit)
+ goto unlock;
+
+ i_qgroups = (u64 *)(inherit + 1);
+ for (i = 0; i < inherit->num_qgroups; ++i) {
+ ret = add_relation_rb(quota_root->fs_info, objectid,
+ *i_qgroups);
+ if (ret)
+ goto unlock;
+ ++i_qgroups;
+ }
+
+ for (i = 0; i < inherit->num_ref_copies; ++i) {
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *dst;
+
+ src = find_qgroup_rb(fs_info, i_qgroups[0]);
+ dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+ if (!src || !dst) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ dst->rfer = src->rfer - level_size;
+ dst->rfer_cmpr = src->rfer_cmpr - level_size;
+ i_qgroups += 2;
+ }
+ for (i = 0; i < inherit->num_excl_copies; ++i) {
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *dst;
+
+ src = find_qgroup_rb(fs_info, i_qgroups[0]);
+ dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+ if (!src || !dst) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ dst->excl = src->excl + level_size;
+ dst->excl_cmpr = src->excl_cmpr + level_size;
+ i_qgroups += 2;
+ }
+
+unlock:
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+/*
+ * reserve some space for a qgroup and all its parents. The reservation takes
+ * place with start_transaction or dealloc_reserve, similar to ENOSPC
+ * accounting. If not enough space is available, EDQUOT is returned.
+ * We assume that the requested space is new for all qgroups.
+ */
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 ref_root = root->root_key.objectid;
+ int ret = 0;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+
+ if (!is_fstree(ref_root))
+ return 0;
+
+ if (num_bytes == 0)
+ return 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ goto out;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ /*
+	 * in the first step, we check whether any limits would be exceeded
+	 * for any of the affected qgroups
+ */
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ (uintptr_t)qgroup, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+
+ if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
+ qg->reserved + (s64)qg->rfer + num_bytes >
+ qg->max_rfer) {
+ ret = -EDQUOT;
+ goto out;
+ }
+
+ if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
+ qg->reserved + (s64)qg->excl + num_bytes >
+ qg->max_excl) {
+ ret = -EDQUOT;
+ goto out;
+ }
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+ /*
+ * no limits exceeded, now record the reservation into all qgroups
+ */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+
+ qg = u64_to_ptr(unode->aux);
+
+ qg->reserved += num_bytes;
+ }
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ return ret;
+}
+
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ u64 ref_root = root->root_key.objectid;
+ int ret = 0;
+
+ if (!is_fstree(ref_root))
+ return;
+
+ if (num_bytes == 0)
+ return;
+
+ spin_lock(&fs_info->qgroup_lock);
+
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ goto out;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ (uintptr_t)qgroup, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+
+ qg->reserved -= num_bytes;
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+}
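+
+/*
+ * Illustrative pairing of the two helpers above (a sketch only, not taken
+ * from any particular caller; do_the_allocation() is a placeholder).
+ * btrfs_qgroup_reserve() returns -EDQUOT when a qgroup limit would be
+ * exceeded, and the caller is expected to give the reservation back with
+ * btrfs_qgroup_free() if it ends up not using the space:
+ *
+ *	ret = btrfs_qgroup_reserve(root, num_bytes);
+ *	if (ret)
+ *		return ret;
+ *	ret = do_the_allocation();
+ *	if (ret)
+ *		btrfs_qgroup_free(root, num_bytes);
+ */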
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
+{
+ if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
+ return;
+ btrfs_err(trans->root->fs_info,
+ "qgroups not uptodate in trans handle %p: list is%s empty, "
+ "seq is %#x.%x",
+ trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
+ (u32)(trans->delayed_ref_elem.seq >> 32),
+ (u32)trans->delayed_ref_elem.seq);
+ BUG();
+}
+
+/*
+ * returns < 0 on error, 0 when more leaves are to be scanned.
+ * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
+ */
+static int
+qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+ struct btrfs_trans_handle *trans, struct ulist *qgroups,
+ struct ulist *tmp, struct extent_buffer *scratch_leaf)
+{
+ struct btrfs_key found;
+ struct ulist *roots = NULL;
+ struct seq_list tree_mod_seq_elem = {};
+ u64 num_bytes;
+ u64 seq;
+ int new_roots;
+ int slot;
+ int ret;
+
+ path->leave_spinning = 1;
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ ret = btrfs_search_slot_for_read(fs_info->extent_root,
+ &fs_info->qgroup_rescan_progress,
+ path, 1, 0);
+
+ pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n",
+ fs_info->qgroup_rescan_progress.objectid,
+ fs_info->qgroup_rescan_progress.type,
+ fs_info->qgroup_rescan_progress.offset, ret);
+
+ if (ret) {
+ /*
+ * The rescan is about to end, we will not be scanning any
+ * further blocks. We cannot unset the RESCAN flag here, because
+ * we want to commit the transaction if everything went well.
+ * To make the live accounting work in this phase, we set our
+ * scan progress pointer such that every real extent objectid
+ * will be smaller.
+ */
+ fs_info->qgroup_rescan_progress.objectid = (u64)-1;
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return ret;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found,
+ btrfs_header_nritems(path->nodes[0]) - 1);
+ fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
+
+ btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
+ btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
+ if (found.type != BTRFS_EXTENT_ITEM_KEY &&
+ found.type != BTRFS_METADATA_ITEM_KEY)
+ continue;
+ if (found.type == BTRFS_METADATA_ITEM_KEY)
+ num_bytes = fs_info->extent_root->leafsize;
+ else
+ num_bytes = found.offset;
+
+ ulist_reinit(qgroups);
+ ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+ &roots);
+ if (ret < 0)
+ goto out;
+ spin_lock(&fs_info->qgroup_lock);
+ seq = fs_info->qgroup_seq;
+ fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+
+ new_roots = 0;
+ ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
+ seq, &new_roots, 1);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
+ }
+
+ ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
+ seq, 0, new_roots, 1);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ }
+out:
+ btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+
+ return ret;
+}
+
+static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
+ qgroup_rescan_work);
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans = NULL;
+ struct ulist *tmp = NULL, *qgroups = NULL;
+ struct extent_buffer *scratch_leaf = NULL;
+ int err = -ENOMEM;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ goto out;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ goto out;
+ scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
+ if (!scratch_leaf)
+ goto out;
+
+ err = 0;
+ while (!err) {
+ trans = btrfs_start_transaction(fs_info->fs_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ break;
+ }
+ if (!fs_info->quota_enabled) {
+ err = -EINTR;
+ } else {
+ err = qgroup_rescan_leaf(fs_info, path, trans,
+ qgroups, tmp, scratch_leaf);
+ }
+ if (err > 0)
+ btrfs_commit_transaction(trans, fs_info->fs_root);
+ else
+ btrfs_end_transaction(trans, fs_info->fs_root);
+ }
+
+out:
+ kfree(scratch_leaf);
+ ulist_free(qgroups);
+ ulist_free(tmp);
+ btrfs_free_path(path);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+
+ if (err == 2 &&
+ fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ } else if (err < 0) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (err >= 0) {
+ btrfs_info(fs_info, "qgroup scan completed%s",
+ err == 2 ? " (inconsistency flag cleared)" : "");
+ } else {
+ btrfs_err(fs_info, "qgroup scan failed with %d", err);
+ }
+
+ complete_all(&fs_info->qgroup_rescan_completion);
+}
+
+/*
+ * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
+ * memory required for the rescan context.
+ */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags)
+{
+ int ret = 0;
+
+ if (!init_flags &&
+ (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
+ !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ spin_lock(&fs_info->qgroup_lock);
+
+ if (init_flags) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ ret = -EINPROGRESS;
+ else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ ret = -EINVAL;
+
+ if (ret) {
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto err;
+ }
+
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ }
+
+ memset(&fs_info->qgroup_rescan_progress, 0,
+ sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ init_completion(&fs_info->qgroup_rescan_completion);
+
+ memset(&fs_info->qgroup_rescan_work, 0,
+ sizeof(fs_info->qgroup_rescan_work));
+ btrfs_init_work(&fs_info->qgroup_rescan_work,
+ btrfs_qgroup_rescan_worker, NULL, NULL);
+
+ if (ret) {
+err:
+ btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void
+qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+
+ spin_lock(&fs_info->qgroup_lock);
+ /* clear all current qgroup tracking information */
+ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ qgroup->rfer = 0;
+ qgroup->rfer_cmpr = 0;
+ qgroup->excl = 0;
+ qgroup->excl_cmpr = 0;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+}
+
+int
+btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+ struct btrfs_trans_handle *trans;
+
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * We have set the rescan_progress to 0, which means no more
+ * delayed refs will be accounted by btrfs_qgroup_account_ref.
+ * However, btrfs_qgroup_account_ref may be right after its call
+ * to btrfs_find_all_roots, in which case it would still do the
+ * accounting.
+ * To solve this, we're committing the transaction, which will
+ * ensure we run all delayed refs and only after that, we are
+ * going to clear all tracking information for a clean start.
+ */
+
+ trans = btrfs_join_transaction(fs_info->fs_root);
+ if (IS_ERR(trans)) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+ if (ret) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return ret;
+ }
+
+ qgroup_rescan_zero_tracking(fs_info);
+
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+
+ return 0;
+}
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+{
+ int running;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ spin_lock(&fs_info->qgroup_lock);
+ running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (running)
+ ret = wait_for_completion_interruptible(
+ &fs_info->qgroup_rescan_completion);
+
+ return ret;
+}
+
+/*
+ * this is only called from open_ctree where we're still single threaded, thus
+ * locking is omitted here.
+ */
+void
+btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
new file mode 100644
index 00000000000..5952ff1fbd7
--- /dev/null
+++ b/fs/btrfs/qgroup.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_QGROUP__
+#define __BTRFS_QGROUP__
+
+/*
+ * A description of the operations. All of these operations only happen when we
+ * are adding the 1st reference for that subvolume in the case of adding space,
+ * or on the last reference delete in the case of subtraction. The only
+ * exception is the last one, which is added for confusion.
+ *
+ * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
+ * one pointing at the bytes we are adding. This is called on the first
+ * allocation.
+ *
+ * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
+ * shared between subvols. This is called on the creation of a ref that already
+ * has refs from a different subvolume, so basically reflink.
+ *
+ * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
+ * one referencing the range.
+ *
+ * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares
+ * refs with other subvolumes.
+ */
+enum btrfs_qgroup_operation_type {
+ BTRFS_QGROUP_OPER_ADD_EXCL,
+ BTRFS_QGROUP_OPER_ADD_SHARED,
+ BTRFS_QGROUP_OPER_SUB_EXCL,
+ BTRFS_QGROUP_OPER_SUB_SHARED,
+};
+
+struct btrfs_qgroup_operation {
+ u64 ref_root;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 seq;
+ enum btrfs_qgroup_operation_type type;
+ struct seq_list elem;
+ struct rb_node n;
+ struct list_head list;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type,
+ int mod_seq);
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
+#endif
+
+#endif /* __BTRFS_QGROUP__ */
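
To make the operation types above concrete, the sketch below shows how a caller
might pass them to btrfs_qgroup_record_ref(), declared above. The helper name,
root ids and byte range are placeholders and not part of this patch:

/*
 * Illustrative only: record an exclusive first allocation, then the shared
 * reference created when a second subvolume starts pointing at the same
 * extent (the reflink case described above).
 */
static int example_record_qgroup_refs(struct btrfs_trans_handle *trans,
				      struct btrfs_fs_info *fs_info,
				      u64 owner_root, u64 other_root,
				      u64 bytenr, u64 num_bytes)
{
	int ret;

	/* first allocation: owner_root is the only one pointing at the bytes */
	ret = btrfs_qgroup_record_ref(trans, fs_info, owner_root, bytenr,
				      num_bytes, BTRFS_QGROUP_OPER_ADD_EXCL, 0);
	if (ret)
		return ret;

	/* other_root now shares the same bytes, e.g. via reflink */
	return btrfs_qgroup_record_ref(trans, fs_info, other_root, bytenr,
				       num_bytes, BTRFS_QGROUP_OPER_ADD_SHARED, 0);
}
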
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 00000000000..4a88f073fdd
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2097 @@
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/hash.h>
+#include <linux/list_sort.h>
+#include <linux/raid/xor.h>
+#include <linux/vmalloc.h>
+#include <asm/div64.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+/* set when additional merges to this rbio are not allowed */
+#define RBIO_RMW_LOCKED_BIT 1
+
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT 2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT 3
+
+
+#define RBIO_CACHE_SIZE 1024
+
+struct btrfs_raid_bio {
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_bio *bbio;
+
+ /*
+ * logical block numbers for the start of each stripe
+ * The last one or two are p/q. These are sorted,
+ * so raid_map[0] is the start of our full stripe
+ */
+ u64 *raid_map;
+
+ /* while we're doing rmw on a stripe
+ * we put it into a hash table so we can
+ * lock the stripe and merge more rbios
+ * into it.
+ */
+ struct list_head hash_list;
+
+ /*
+ * LRU list for the stripe cache
+ */
+ struct list_head stripe_cache;
+
+ /*
+ * for scheduling work in the helper threads
+ */
+ struct btrfs_work work;
+
+ /*
+ * bio list and bio_list_lock are used
+ * to add more bios into the stripe
+ * in hopes of avoiding the full rmw
+ */
+ struct bio_list bio_list;
+ spinlock_t bio_list_lock;
+
+ /* also protected by the bio_list_lock, the
+ * plug list is used by the plugging code
+ * to collect partial bios while plugged. The
+ * stripe locking code also uses it to hand off
+ * the stripe lock to the next pending IO
+ */
+ struct list_head plug_list;
+
+ /*
+ * flags that tell us if it is safe to
+ * merge with this bio
+ */
+ unsigned long flags;
+
+ /* size of each individual stripe on disk */
+ int stripe_len;
+
+ /* number of data stripes (no p/q) */
+ int nr_data;
+
+ /*
+ * set if we're doing a parity rebuild
+ * for a read from higher up, which is handled
+ * differently from a parity rebuild as part of
+ * rmw
+ */
+ int read_rebuild;
+
+ /* first bad stripe */
+ int faila;
+
+ /* second bad stripe (for raid6 use) */
+ int failb;
+
+ /*
+ * number of pages needed to represent the full
+ * stripe
+ */
+ int nr_pages;
+
+ /*
+ * size of all the bios in the bio_list. This
+ * helps us decide if the rbio maps to a full
+ * stripe or not
+ */
+ int bio_list_bytes;
+
+ atomic_t refs;
+
+ /*
+ * these are two arrays of pointers. We allocate the
+ * rbio big enough to hold them both and setup their
+ * locations when the rbio is allocated
+ */
+
+ /* pointers to pages that we allocated for
+ * reading/writing stripes directly from the disk (including P/Q)
+ */
+ struct page **stripe_pages;
+
+ /*
+ * pointers to the pages in the bio_list. Stored
+ * here for faster lookup
+ */
+ struct page **bio_pages;
+};
+
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
+static void rmw_work(struct btrfs_work *work);
+static void read_rebuild_work(struct btrfs_work *work);
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
+static void async_read_rebuild(struct btrfs_raid_bio *rbio);
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
+static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void index_rbio_pages(struct btrfs_raid_bio *rbio);
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+
+/*
+ * the stripe hash table is used for locking, and to collect
+ * bios in hopes of making a full stripe
+ */
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
+{
+ struct btrfs_stripe_hash_table *table;
+ struct btrfs_stripe_hash_table *x;
+ struct btrfs_stripe_hash *cur;
+ struct btrfs_stripe_hash *h;
+ int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
+ int i;
+ int table_size;
+
+ if (info->stripe_hash_table)
+ return 0;
+
+ /*
+ * The table is large, starting with order 4 and can go as high as
+ * order 7 in case lock debugging is turned on.
+ *
+ * Try harder to allocate and fall back to vmalloc to lower the chance
+ * of a failing mount.
+ */
+ table_size = sizeof(*table) + sizeof(*h) * num_entries;
+ table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!table) {
+ table = vzalloc(table_size);
+ if (!table)
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&table->cache_lock);
+ INIT_LIST_HEAD(&table->stripe_cache);
+
+ h = table->table;
+
+ for (i = 0; i < num_entries; i++) {
+ cur = h + i;
+ INIT_LIST_HEAD(&cur->hash_list);
+ spin_lock_init(&cur->lock);
+ init_waitqueue_head(&cur->wait);
+ }
+
+ x = cmpxchg(&info->stripe_hash_table, NULL, table);
+ if (x) {
+ if (is_vmalloc_addr(x))
+ vfree(x);
+ else
+ kfree(x);
+ }
+ return 0;
+}
+
+/*
+ * caching an rbio means to copy anything from the
+ * bio_pages array into the stripe_pages array. We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ char *s;
+ char *d;
+ int ret;
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ return;
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (!rbio->bio_pages[i])
+ continue;
+
+ s = kmap(rbio->bio_pages[i]);
+ d = kmap(rbio->stripe_pages[i]);
+
+ memcpy(d, s, PAGE_CACHE_SIZE);
+
+ kunmap(rbio->bio_pages[i]);
+ kunmap(rbio->stripe_pages[i]);
+ SetPageUptodate(rbio->stripe_pages[i]);
+ }
+ set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
+ * we hash on the first logical address of the stripe
+ */
+static int rbio_bucket(struct btrfs_raid_bio *rbio)
+{
+ u64 num = rbio->raid_map[0];
+
+ /*
+ * we shift down quite a bit. We're using byte
+ * addressing, and most of the lower bits are zeros.
+ * This tends to upset hash_64, and it consistently
+ * returns just one or two different values.
+ *
+ * shifting off the lower bits fixes things.
+ */
+ return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
+}
+
+/*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+ int i;
+ struct page *s;
+ struct page *d;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+ return;
+
+ for (i = 0; i < dest->nr_pages; i++) {
+ s = src->stripe_pages[i];
+ if (!s || !PageUptodate(s)) {
+ continue;
+ }
+
+ d = dest->stripe_pages[i];
+ if (d)
+ __free_page(d);
+
+ dest->stripe_pages[i] = s;
+ src->stripe_pages[i] = NULL;
+ }
+}
+
+/*
+ * merging means we take the bio_list from the victim and
+ * splice it into the destination. The victim should
+ * be discarded afterwards.
+ *
+ * must be called with dest->bio_list_lock held
+ */
+static void merge_rbio(struct btrfs_raid_bio *dest,
+ struct btrfs_raid_bio *victim)
+{
+ bio_list_merge(&dest->bio_list, &victim->bio_list);
+ dest->bio_list_bytes += victim->bio_list_bytes;
+ bio_list_init(&victim->bio_list);
+}
+
+/*
+ * used to prune items that are in the cache. The caller
+ * must hold the hash table lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ int bucket = rbio_bucket(rbio);
+ struct btrfs_stripe_hash_table *table;
+ struct btrfs_stripe_hash *h;
+ int freeit = 0;
+
+ /*
+ * check the bit again under the hash table lock.
+ */
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+ h = table->table + bucket;
+
+ /* hold the lock for the bucket because we may be
+ * removing it from the hash table
+ */
+ spin_lock(&h->lock);
+
+ /*
+ * hold the lock for the bio list because we need
+ * to make sure the bio list is empty
+ */
+ spin_lock(&rbio->bio_list_lock);
+
+ if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ list_del_init(&rbio->stripe_cache);
+ table->cache_size -= 1;
+ freeit = 1;
+
+ /* if the bio list isn't empty, this rbio is
+ * still involved in an IO. We take it out
+ * of the cache list, and drop the ref that
+ * was held for the list.
+ *
+ * If the bio_list was empty, we also remove
+ * the rbio from the hash_table, and drop
+ * the corresponding ref
+ */
+ if (bio_list_empty(&rbio->bio_list)) {
+ if (!list_empty(&rbio->hash_list)) {
+ list_del_init(&rbio->hash_list);
+ atomic_dec(&rbio->refs);
+ BUG_ON(!list_empty(&rbio->plug_list));
+ }
+ }
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock(&h->lock);
+
+ if (freeit)
+ __free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ __remove_rbio_from_cache(rbio);
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+ struct btrfs_raid_bio *rbio;
+
+ table = info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ while (!list_empty(&table->stripe_cache)) {
+ rbio = list_entry(table->stripe_cache.next,
+ struct btrfs_raid_bio,
+ stripe_cache);
+ __remove_rbio_from_cache(rbio);
+ }
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
+ */
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
+{
+ if (!info->stripe_hash_table)
+ return;
+ btrfs_clear_rbio_cache(info);
+ if (is_vmalloc_addr(info->stripe_hash_table))
+ vfree(info->stripe_hash_table);
+ else
+ kfree(info->stripe_hash_table);
+ info->stripe_hash_table = NULL;
+}
+
+/*
+ * insert an rbio into the stripe cache. It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ /* bump our ref if we were not in the list before */
+ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+ atomic_inc(&rbio->refs);
+
+ if (!list_empty(&rbio->stripe_cache)) {
+ list_move(&rbio->stripe_cache, &table->stripe_cache);
+ } else {
+ list_add(&rbio->stripe_cache, &table->stripe_cache);
+ table->cache_size += 1;
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+
+ if (table->cache_size > RBIO_CACHE_SIZE) {
+ struct btrfs_raid_bio *found;
+
+ found = list_entry(table->stripe_cache.prev,
+ struct btrfs_raid_bio,
+ stripe_cache);
+
+ if (found != rbio)
+ __remove_rbio_from_cache(found);
+ }
+
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+ return;
+}
+
+/*
+ * helper function to run the xor_blocks api. It is only
+ * able to do MAX_XOR_BLOCKS at a time, so we need to
+ * loop through.
+ */
+static void run_xor(void **pages, int src_cnt, ssize_t len)
+{
+ int src_off = 0;
+ int xor_src_cnt = 0;
+ void *dest = pages[src_cnt];
+
+ while (src_cnt > 0) {
+ xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+ xor_blocks(xor_src_cnt, len, dest, pages + src_off);
+
+ src_cnt -= xor_src_cnt;
+ src_off += xor_src_cnt;
+ }
+}
+
+/*
+ * returns true if the bio list inside this rbio
+ * covers an entire stripe (no rmw required).
+ * Must be called with the bio list lock held, or
+ * at a time when you know it is impossible to add
+ * new bios into the list
+ */
+static int __rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+ unsigned long size = rbio->bio_list_bytes;
+ int ret = 1;
+
+ if (size != rbio->nr_data * rbio->stripe_len)
+ ret = 0;
+
+ BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+ return ret;
+}
+
+static int rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&rbio->bio_list_lock, flags);
+ ret = __rbio_is_full(rbio);
+ spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+ return ret;
+}
+
+/*
+ * returns 1 if it is safe to merge two rbios together.
+ * The merging is safe if the two rbios correspond to
+ * the same stripe and if they are both going in the same
+ * direction (read vs write), and if neither one is
+ * locked for final IO
+ *
+ * The caller is responsible for locking such that
+ * rmw_locked is safe to test
+ */
+static int rbio_can_merge(struct btrfs_raid_bio *last,
+ struct btrfs_raid_bio *cur)
+{
+ if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
+ test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
+ return 0;
+
+ /*
+ * we can't merge with cached rbios, since the
+ * idea is that when we merge, the destination
+ * rbio is going to run our IO for us. We can
+ * steal from cached rbios though; other functions
+ * handle that.
+ */
+ if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+ test_bit(RBIO_CACHE_BIT, &cur->flags))
+ return 0;
+
+ if (last->raid_map[0] !=
+ cur->raid_map[0])
+ return 0;
+
+ /* reads can't merge with writes */
+ if (last->read_rebuild !=
+ cur->read_rebuild) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * helper to index into the pstripe
+ */
+static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+ index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * helper to index into the qstripe, returns null
+ * if there is no qstripe
+ */
+static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+ if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+ return NULL;
+
+ index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
+ PAGE_CACHE_SHIFT;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * The first stripe in the table for a logical address
+ * has the lock. rbios are added in one of three ways:
+ *
+ * 1) Nobody has the stripe locked yet. The rbio is given
+ * the lock and 0 is returned. The caller must start the IO
+ * themselves.
+ *
+ * 2) Someone has the stripe locked, but we're able to merge
+ * with the lock owner. The rbio is freed and the IO will
+ * start automatically along with the existing rbio. 1 is returned.
+ *
+ * 3) Someone has the stripe locked, but we're not able to merge.
+ * The rbio is added to the lock owner's plug list, or merged into
+ * an rbio already on the plug list. When the lock owner unlocks,
+ * the next rbio on the list is run and the IO is started automatically.
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission. If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+ int bucket = rbio_bucket(rbio);
+ struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *pending;
+ unsigned long flags;
+ DEFINE_WAIT(wait);
+ struct btrfs_raid_bio *freeit = NULL;
+ struct btrfs_raid_bio *cache_drop = NULL;
+ int ret = 0;
+ int walk = 0;
+
+ spin_lock_irqsave(&h->lock, flags);
+ list_for_each_entry(cur, &h->hash_list, hash_list) {
+ walk++;
+ if (cur->raid_map[0] == rbio->raid_map[0]) {
+ spin_lock(&cur->bio_list_lock);
+
+ /* can we steal this cached rbio's pages? */
+ if (bio_list_empty(&cur->bio_list) &&
+ list_empty(&cur->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+ !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+ list_del_init(&cur->hash_list);
+ atomic_dec(&cur->refs);
+
+ steal_rbio(cur, rbio);
+ cache_drop = cur;
+ spin_unlock(&cur->bio_list_lock);
+
+ goto lockit;
+ }
+
+ /* can we merge into the lock owner? */
+ if (rbio_can_merge(cur, rbio)) {
+ merge_rbio(cur, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+
+
+ /*
+ * we couldn't merge with the running
+ * rbio, see if we can merge with the
+ * pending ones. We don't have to
+ * check for rmw_locked because there
+ * is no way they are inside finish_rmw
+ * right now
+ */
+ list_for_each_entry(pending, &cur->plug_list,
+ plug_list) {
+ if (rbio_can_merge(pending, rbio)) {
+ merge_rbio(pending, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+ }
+
+ /* no merging, put us on the tail of the plug list,
+ * our rbio will be started when the currently
+ * running rbio unlocks
+ */
+ list_add_tail(&rbio->plug_list, &cur->plug_list);
+ spin_unlock(&cur->bio_list_lock);
+ ret = 1;
+ goto out;
+ }
+ }
+lockit:
+ atomic_inc(&rbio->refs);
+ list_add(&rbio->hash_list, &h->hash_list);
+out:
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (cache_drop)
+ remove_rbio_from_cache(cache_drop);
+ if (freeit)
+ __free_raid_bio(freeit);
+ return ret;
+}
+
+/*
+ * called as rmw or parity rebuild is completed. If the plug list has more
+ * rbios waiting for this stripe, the next one on the list will be started
+ */
+static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bucket;
+ struct btrfs_stripe_hash *h;
+ unsigned long flags;
+ int keep_cache = 0;
+
+ bucket = rbio_bucket(rbio);
+ h = rbio->fs_info->stripe_hash_table->table + bucket;
+
+ if (list_empty(&rbio->plug_list))
+ cache_rbio(rbio);
+
+ spin_lock_irqsave(&h->lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ if (!list_empty(&rbio->hash_list)) {
+ /*
+ * if we're still cached and there is no other IO
+ * to perform, just leave this rbio here for others
+ * to steal from later
+ */
+ if (list_empty(&rbio->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ keep_cache = 1;
+ clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ BUG_ON(!bio_list_empty(&rbio->bio_list));
+ goto done;
+ }
+
+ list_del_init(&rbio->hash_list);
+ atomic_dec(&rbio->refs);
+
+ /*
+ * we use the plug list to hold all the rbios
+ * waiting for the chance to lock this stripe.
+ * hand the lock over to one of them.
+ */
+ if (!list_empty(&rbio->plug_list)) {
+ struct btrfs_raid_bio *next;
+ struct list_head *head = rbio->plug_list.next;
+
+ next = list_entry(head, struct btrfs_raid_bio,
+ plug_list);
+
+ list_del_init(&rbio->plug_list);
+
+ list_add(&next->hash_list, &h->hash_list);
+ atomic_inc(&next->refs);
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (next->read_rebuild)
+ async_read_rebuild(next);
+ else {
+ steal_rbio(rbio, next);
+ async_rmw_stripe(next);
+ }
+
+ goto done_nolock;
+ } else if (waitqueue_active(&h->wait)) {
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+ wake_up(&h->wait);
+ goto done_nolock;
+ }
+ }
+done:
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+done_nolock:
+ if (!keep_cache)
+ remove_rbio_from_cache(rbio);
+}
+
+static void __free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ WARN_ON(atomic_read(&rbio->refs) < 0);
+ if (!atomic_dec_and_test(&rbio->refs))
+ return;
+
+ WARN_ON(!list_empty(&rbio->stripe_cache));
+ WARN_ON(!list_empty(&rbio->hash_list));
+ WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i]) {
+ __free_page(rbio->stripe_pages[i]);
+ rbio->stripe_pages[i] = NULL;
+ }
+ }
+ kfree(rbio->raid_map);
+ kfree(rbio->bbio);
+ kfree(rbio);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ unlock_stripe(rbio);
+ __free_raid_bio(rbio);
+}
+
+/*
+ * this frees the rbio and runs through all the bios in the
+ * bio_list and calls end_io on them
+ */
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+{
+ struct bio *cur = bio_list_get(&rbio->bio_list);
+ struct bio *next;
+ free_raid_bio(rbio);
+
+ while (cur) {
+ next = cur->bi_next;
+ cur->bi_next = NULL;
+ if (uptodate)
+ set_bit(BIO_UPTODATE, &cur->bi_flags);
+ bio_endio(cur, err);
+ cur = next;
+ }
+}
+
+/*
+ * end io function used by finish_rmw. When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ return;
+
+ err = 0;
+
+ /* OK, we have written all the stripes we need to. */
+ if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ err = -EIO;
+
+ rbio_orig_end_io(rbio, err, 0);
+ return;
+}
+
+/*
+ * the read/modify/write code wants to use the original bio for
+ * any pages it included, and then use the rbio for everything
+ * else. This function decides if a given index (stripe number)
+ * and page number in that stripe fall inside the original bio
+ * or the rbio.
+ *
+ * if you set bio_list_only, you'll get a NULL back for any ranges
+ * that are outside the bio_list
+ *
+ * This doesn't take any refs on anything, you get a bare page pointer
+ * and the caller must bump refs as required.
+ *
+ * You must call index_rbio_pages once before you can trust
+ * the answers from this function.
+ */
+static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
+ int index, int pagenr, int bio_list_only)
+{
+ int chunk_page;
+ struct page *p = NULL;
+
+ chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ p = rbio->bio_pages[chunk_page];
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ if (p || bio_list_only)
+ return p;
+
+ return rbio->stripe_pages[chunk_page];
+}
+
+/*
+ * number of pages we need for the entire stripe across all the
+ * drives
+ */
+static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
+{
+ unsigned long nr = stripe_len * nr_stripes;
+ return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+/*
+ * allocation and initial setup for the btrfs_raid_bio. Note that
+ * this does not allocate any pages for rbio->stripe_pages.
+ */
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len)
+{
+ struct btrfs_raid_bio *rbio;
+ int nr_data = 0;
+ int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+ void *p;
+
+ rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+ GFP_NOFS);
+ if (!rbio) {
+ kfree(raid_map);
+ kfree(bbio);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ bio_list_init(&rbio->bio_list);
+ INIT_LIST_HEAD(&rbio->plug_list);
+ spin_lock_init(&rbio->bio_list_lock);
+ INIT_LIST_HEAD(&rbio->stripe_cache);
+ INIT_LIST_HEAD(&rbio->hash_list);
+ rbio->bbio = bbio;
+ rbio->raid_map = raid_map;
+ rbio->fs_info = root->fs_info;
+ rbio->stripe_len = stripe_len;
+ rbio->nr_pages = num_pages;
+ rbio->faila = -1;
+ rbio->failb = -1;
+ atomic_set(&rbio->refs, 1);
+
+ /*
+ * the stripe_pages and bio_pages arrays point to the extra
+ * memory we allocated past the end of the rbio
+ */
+ p = rbio + 1;
+ rbio->stripe_pages = p;
+ rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+
+ if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+ nr_data = bbio->num_stripes - 2;
+ else
+ nr_data = bbio->num_stripes - 1;
+
+ rbio->nr_data = nr_data;
+ return rbio;
+}
+
+/* allocate pages for all the stripes in the bio, including parity */
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i])
+ continue;
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[i] = page;
+ ClearPageUptodate(page);
+ }
+ return 0;
+}
+
+/* allocate pages for just the p/q stripes */
+static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ struct page *page;
+
+ i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+
+ for (; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i])
+ continue;
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[i] = page;
+ }
+ return 0;
+}
+
+/*
+ * add a single page from a specific stripe into our list of bios for IO.
+ * This will try to merge into existing bios if possible, and returns
+ * zero if all went well.
+ */
+static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct page *page,
+ int stripe_nr,
+ unsigned long page_index,
+ unsigned long bio_max_len)
+{
+ struct bio *last = bio_list->tail;
+ u64 last_end = 0;
+ int ret;
+ struct bio *bio;
+ struct btrfs_bio_stripe *stripe;
+ u64 disk_start;
+
+ stripe = &rbio->bbio->stripes[stripe_nr];
+ disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+
+ /* if the device is missing, just fail this stripe */
+ if (!stripe->dev->bdev)
+ return fail_rbio_index(rbio, stripe_nr);
+
+ /* see if we can add this page onto our existing bio */
+ if (last) {
+ last_end = (u64)last->bi_iter.bi_sector << 9;
+ last_end += last->bi_iter.bi_size;
+
+ /*
+ * we can't merge these if they are from different
+ * devices or if they are not contiguous
+ */
+ if (last_end == disk_start && stripe->dev->bdev &&
+ test_bit(BIO_UPTODATE, &last->bi_flags) &&
+ last->bi_bdev == stripe->dev->bdev) {
+ ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
+ if (ret == PAGE_CACHE_SIZE)
+ return 0;
+ }
+ }
+
+ /* put a new bio on the list */
+ bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_iter.bi_size = 0;
+ bio->bi_bdev = stripe->dev->bdev;
+ bio->bi_iter.bi_sector = disk_start >> 9;
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ bio_list_add(bio_list, bio);
+ return 0;
+}
+
+/*
+ * while we're doing the read/modify/write cycle, we could
+ * have errors in reading pages off the disk. This checks
+ * for errors and if we're not able to read the page it'll
+ * trigger parity reconstruction. The rmw will be finished
+ * after we've reconstructed the failed stripes
+ */
+static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
+{
+ if (rbio->faila >= 0 || rbio->failb >= 0) {
+ BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+ __raid56_parity_recover(rbio);
+ } else {
+ finish_rmw(rbio);
+ }
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
+{
+ int index;
+ index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
+ index += page;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * helper function to walk our bio list and populate the bio_pages array with
+ * the result. This seems expensive, but it is faster than constantly
+ * searching through the bio list as we setup the IO in finish_rmw or stripe
+ * reconstruction.
+ *
+ * This must be called before you trust the answers from page_in_rbio
+ */
+static void index_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ struct bio *bio;
+ u64 start;
+ unsigned long stripe_offset;
+ unsigned long page_index;
+ struct page *p;
+ int i;
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ bio_list_for_each(bio, &rbio->bio_list) {
+ start = (u64)bio->bi_iter.bi_sector << 9;
+ stripe_offset = start - rbio->raid_map[0];
+ page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ p = bio->bi_io_vec[i].bv_page;
+ rbio->bio_pages[page_index + i] = p;
+ }
+ }
+ spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+/*
+ * this is called from one of two situations. We either
+ * have a full stripe from the higher layers, or we've read all
+ * the missing bits off disk.
+ *
+ * This will calculate the parity and then send down any
+ * changed blocks.
+ */
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_bio *bbio = rbio->bbio;
+ void *pointers[bbio->num_stripes];
+ int stripe_len = rbio->stripe_len;
+ int nr_data = rbio->nr_data;
+ int stripe;
+ int pagenr;
+ int p_stripe = -1;
+ int q_stripe = -1;
+ struct bio_list bio_list;
+ struct bio *bio;
+ int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ if (bbio->num_stripes - rbio->nr_data == 1) {
+ p_stripe = bbio->num_stripes - 1;
+ } else if (bbio->num_stripes - rbio->nr_data == 2) {
+ p_stripe = bbio->num_stripes - 2;
+ q_stripe = bbio->num_stripes - 1;
+ } else {
+ BUG();
+ }
+
+ /* at this point we either have a full stripe,
+ * or we've read the full stripe from the drive.
+ * recalculate the parity and write the new results.
+ *
+ * We're not allowed to add any new bios to the
+ * bio list here, anyone else that wants to
+ * change this stripe needs to do their own rmw.
+ */
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ atomic_set(&rbio->bbio->error, 0);
+
+ /*
+ * now that we've set rmw_locked, run through the
+ * bio list one last time and map the page pointers
+ *
+ * We don't cache full rbios because we're assuming
+ * the higher layers are unlikely to use this area of
+ * the disk again soon. If they do use it again,
+ * hopefully they will send another full bio.
+ */
+ index_rbio_pages(rbio);
+ if (!rbio_is_full(rbio))
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ struct page *p;
+ /* first collect one page from each data stripe */
+ for (stripe = 0; stripe < nr_data; stripe++) {
+ p = page_in_rbio(rbio, stripe, pagenr, 0);
+ pointers[stripe] = kmap(p);
+ }
+
+ /* then add the parity stripe */
+ p = rbio_pstripe_page(rbio, pagenr);
+ SetPageUptodate(p);
+ pointers[stripe++] = kmap(p);
+
+ if (q_stripe != -1) {
+
+ /*
+ * raid6, add the qstripe and call the
+ * library function to fill in our p/q
+ */
+ p = rbio_qstripe_page(rbio, pagenr);
+ SetPageUptodate(p);
+ pointers[stripe++] = kmap(p);
+
+ raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+ run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+ }
+
+
+ for (stripe = 0; stripe < bbio->num_stripes; stripe++)
+ kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ }
+
+ /*
+ * time to start writing. Make bios for everything from the
+ * higher layers (the bio_list in our rbio) and our p/q. Ignore
+ * everything else.
+ */
+ for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+ for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ struct page *page;
+ if (stripe < rbio->nr_data) {
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (!page)
+ continue;
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+
+ ret = rbio_add_io_page(rbio, &bio_list,
+ page, stripe, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+ atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
+ BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
+
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_write_end_io;
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(WRITE, bio);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * helper to find the stripe number for a given bio. Used to figure out which
+ * stripe has failed. This expects the bio to correspond to a physical disk,
+ * so it looks up based on physical sector numbers.
+ */
+static int find_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ u64 physical = bio->bi_iter.bi_sector;
+ u64 stripe_start;
+ int i;
+ struct btrfs_bio_stripe *stripe;
+
+ physical <<= 9;
+
+ for (i = 0; i < rbio->bbio->num_stripes; i++) {
+ stripe = &rbio->bbio->stripes[i];
+ stripe_start = stripe->physical;
+ if (physical >= stripe_start &&
+ physical < stripe_start + rbio->stripe_len) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+/*
+ * helper to find the stripe number for a given
+ * bio (before mapping). Used to figure out which stripe has
+ * failed. This looks up based on logical block numbers.
+ */
+static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ u64 logical = bio->bi_iter.bi_sector;
+ u64 stripe_start;
+ int i;
+
+ logical <<= 9;
+
+ for (i = 0; i < rbio->nr_data; i++) {
+ stripe_start = rbio->raid_map[i];
+ if (logical >= stripe_start &&
+ logical < stripe_start + rbio->stripe_len) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+/*
+ * returns -EIO if we had too many failures
+ */
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&rbio->bio_list_lock, flags);
+
+ /* we already know this stripe is bad, move on */
+ if (rbio->faila == failed || rbio->failb == failed)
+ goto out;
+
+ if (rbio->faila == -1) {
+ /* first failure on this rbio */
+ rbio->faila = failed;
+ atomic_inc(&rbio->bbio->error);
+ } else if (rbio->failb == -1) {
+ /* second failure on this rbio */
+ rbio->failb = failed;
+ atomic_inc(&rbio->bbio->error);
+ } else {
+ ret = -EIO;
+ }
+out:
+ spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+
+ return ret;
+}
+
+/*
+ * helper to fail a stripe based on a physical disk
+ * bio.
+ */
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ int failed = find_bio_stripe(rbio, bio);
+
+ if (failed < 0)
+ return -EIO;
+
+ return fail_rbio_index(rbio, failed);
+}
+
+/*
+ * this sets each page in the bio uptodate. It should only be used on private
+ * rbio pages, nothing that comes in from the higher layers
+ */
+static void set_bio_pages_uptodate(struct bio *bio)
+{
+ int i;
+ struct page *p;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ p = bio->bi_io_vec[i].bv_page;
+ SetPageUptodate(p);
+ }
+}
+
+/*
+ * end io for the read phase of the rmw cycle. All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid_rmw_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+ else
+ set_bio_pages_uptodate(bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ return;
+
+ err = 0;
+ if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ goto cleanup;
+
+ /*
+ * this will normally call finish_rmw to start our write
+ * but if there are any failed stripes we'll reconstruct
+ * from parity first
+ */
+ validate_rbio_for_rmw(rbio);
+ return;
+
+cleanup:
+
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+ btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
+
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
+}
+
+static void async_read_rebuild(struct btrfs_raid_bio *rbio)
+{
+ btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
+
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
+}
+
+/*
+ * the stripe must be locked by the caller. It will
+ * unlock after all the writes are done
+ */
+static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct btrfs_bio *bbio = rbio->bbio;
+ struct bio_list bio_list;
+ int ret;
+ int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int pagenr;
+ int stripe;
+ struct bio *bio;
+
+ bio_list_init(&bio_list);
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ index_rbio_pages(rbio);
+
+ atomic_set(&rbio->bbio->error, 0);
+ /*
+ * build a list of bios to read all the missing parts of this
+ * stripe
+ */
+ for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ struct page *page;
+ /*
+ * we want to find all the pages missing from
+ * the rbio and read them from the disk. If
+ * page_in_rbio finds a page in the bio list
+ * we don't need to read it off the stripe.
+ */
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (page)
+ continue;
+
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ /*
+ * the bio cache may have handed us an uptodate
+ * page. If so, be happy and use it
+ */
+ if (PageUptodate(page))
+ continue;
+
+ ret = rbio_add_io_page(rbio, &bio_list, page,
+ stripe, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * this can happen if others have merged with
+ * us, it means there is nothing left to read.
+ * But if there are missing devices it may not be
+ * safe to do the full stripe write yet.
+ */
+ goto finish;
+ }
+
+ /*
+ * the bbio may be freed once we submit the last bio. Make sure
+ * not to touch it after that
+ */
+ atomic_set(&bbio->stripes_pending, bios_to_read);
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_rmw_end_io;
+
+ btrfs_bio_wq_end_io(rbio->fs_info, bio,
+ BTRFS_WQ_ENDIO_RAID56);
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(READ, bio);
+ }
+ /* the actual write will happen once the reads are done */
+ return 0;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+ return -EIO;
+
+finish:
+ validate_rbio_for_rmw(rbio);
+ return 0;
+}
+
+/*
+ * if the upper layers pass in a full stripe, we thank them by only allocating
+ * enough pages to hold the parity, and sending it all down quickly.
+ */
+static int full_stripe_write(struct btrfs_raid_bio *rbio)
+{
+ int ret;
+
+ ret = alloc_rbio_parity_pages(rbio);
+ if (ret) {
+ __free_raid_bio(rbio);
+ return ret;
+ }
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0)
+ finish_rmw(rbio);
+ return 0;
+}
+
+/*
+ * partial stripe writes get handed over to async helpers.
+ * We're really hoping to merge a few more writes into this
+ * rbio before calculating new parity
+ */
+static int partial_stripe_write(struct btrfs_raid_bio *rbio)
+{
+ int ret;
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0)
+ async_rmw_stripe(rbio);
+ return 0;
+}
+
+/*
+ * sometimes while we were reading from the drive to
+ * recalculate parity, enough new bios come in to create
+ * a full stripe. So we do a check here to see if we can
+ * go directly to finish_rmw
+ */
+static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
+{
+ /* head off into rmw land if we don't have a full stripe */
+ if (!rbio_is_full(rbio))
+ return partial_stripe_write(rbio);
+ return full_stripe_write(rbio);
+}
+
+/*
+ * We use plugging callbacks to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list. When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+ struct blk_plug_cb cb;
+ struct btrfs_fs_info *info;
+ struct list_head rbio_list;
+ struct btrfs_work work;
+};
+
+/*
+ * rbios on the plug list are sorted for easier merging.
+ */
+static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+ plug_list);
+ struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+ plug_list);
+ u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
+ u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
+
+ if (a_sector < b_sector)
+ return -1;
+ if (a_sector > b_sector)
+ return 1;
+ return 0;
+}
+
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *last = NULL;
+
+ /*
+ * sort our plug list then try to merge
+ * everything we can in hopes of creating full
+ * stripes.
+ */
+ list_sort(NULL, &plug->rbio_list, plug_cmp);
+ while (!list_empty(&plug->rbio_list)) {
+ cur = list_entry(plug->rbio_list.next,
+ struct btrfs_raid_bio, plug_list);
+ list_del_init(&cur->plug_list);
+
+ if (rbio_is_full(cur)) {
+ /* we have a full stripe, send it down */
+ full_stripe_write(cur);
+ continue;
+ }
+ if (last) {
+ if (rbio_can_merge(last, cur)) {
+ merge_rbio(last, cur);
+ __free_raid_bio(cur);
+ continue;
+
+ }
+ __raid56_parity_write(last);
+ }
+ last = cur;
+ }
+ if (last) {
+ __raid56_parity_write(last);
+ }
+ kfree(plug);
+}
+
+/*
+ * if the unplug comes from schedule, we have to push the
+ * work off to a helper thread
+ */
+static void unplug_work(struct btrfs_work *work)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(work, struct btrfs_plug_cb, work);
+ run_plug(plug);
+}
+
+static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+
+ if (from_schedule) {
+ btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+ btrfs_queue_work(plug->info->rmw_workers,
+ &plug->work);
+ return;
+ }
+ run_plug(plug);
+}
+
+/*
+ * our main entry point for writes from the rest of the FS.
+ */
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len)
+{
+ struct btrfs_raid_bio *rbio;
+ struct btrfs_plug_cb *plug = NULL;
+ struct blk_plug_cb *cb;
+
+ rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ if (IS_ERR(rbio))
+ return PTR_ERR(rbio);
+ bio_list_add(&rbio->bio_list, bio);
+ rbio->bio_list_bytes = bio->bi_iter.bi_size;
+
+ /*
+ * don't plug on full rbios, just get them out the door
+ * as quickly as we can
+ */
+ if (rbio_is_full(rbio))
+ return full_stripe_write(rbio);
+
+ cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
+ sizeof(*plug));
+ if (cb) {
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+ if (!plug->info) {
+ plug->info = root->fs_info;
+ INIT_LIST_HEAD(&plug->rbio_list);
+ }
+ list_add_tail(&rbio->plug_list, &plug->rbio_list);
+ } else {
+ return __raid56_parity_write(rbio);
+ }
+ return 0;
+}
+
+/*
+ * all parity reconstruction happens here. We've read in everything
+ * we can find from the drives and this does the heavy lifting of
+ * sorting the good from the bad.
+ */
+static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+{
+ int pagenr, stripe;
+ void **pointers;
+ int faila = -1, failb = -1;
+ int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int err;
+ int i;
+
+ pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
+ GFP_NOFS);
+ if (!pointers) {
+ err = -ENOMEM;
+ goto cleanup_io;
+ }
+
+ faila = rbio->faila;
+ failb = rbio->failb;
+
+ if (rbio->read_rebuild) {
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+ }
+
+ index_rbio_pages(rbio);
+
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ /* setup our array of pointers with pages
+ * from each stripe
+ */
+ for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+ /*
+ * if we're rebuilding a read, we have to use
+ * pages from the bio list
+ */
+ if (rbio->read_rebuild &&
+ (stripe == faila || stripe == failb)) {
+ page = page_in_rbio(rbio, stripe, pagenr, 0);
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+ pointers[stripe] = kmap(page);
+ }
+
+ /* all raid6 handling here */
+ if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
+ RAID6_Q_STRIPE) {
+
+ /*
+ * single failure, rebuild from parity raid5
+ * style
+ */
+ if (failb < 0) {
+ if (faila == rbio->nr_data) {
+ /*
+ * Just the P stripe has failed, without
+ * a bad data or Q stripe.
+ * TODO, we should redo the xor here.
+ */
+ err = -EIO;
+ goto cleanup;
+ }
+ /*
+ * a single failure in raid6 is rebuilt
+ * in the pstripe code below
+ */
+ goto pstripe;
+ }
+
+ /* make sure our ps and qs are in order */
+ if (faila > failb) {
+ int tmp = failb;
+ failb = faila;
+ faila = tmp;
+ }
+
+ /* if the q stripe is failed, do a pstripe reconstruction
+ * from the xors.
+ * If both the q stripe and the P stripe are failed, we're
+ * here due to a crc mismatch and we can't give them the
+ * data they want
+ */
+ if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+ err = -EIO;
+ goto cleanup;
+ }
+ /*
+ * otherwise we have one bad data stripe and
+ * a good P stripe. raid5!
+ */
+ goto pstripe;
+ }
+
+ if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+ raid6_datap_recov(rbio->bbio->num_stripes,
+ PAGE_SIZE, faila, pointers);
+ } else {
+ raid6_2data_recov(rbio->bbio->num_stripes,
+ PAGE_SIZE, faila, failb,
+ pointers);
+ }
+ } else {
+ void *p;
+
+ /* rebuild from P stripe here (raid5 or raid6) */
+ BUG_ON(failb != -1);
+pstripe:
+ /* Copy parity block into failed block to start with */
+ memcpy(pointers[faila],
+ pointers[rbio->nr_data],
+ PAGE_CACHE_SIZE);
+
+ /* rearrange the pointer array */
+ p = pointers[faila];
+ for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
+ pointers[stripe] = pointers[stripe + 1];
+ pointers[rbio->nr_data - 1] = p;
+
+ /* xor in the rest */
+ run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+ }
+ /* if we're doing this rebuild as part of an rmw, go through
+ * and set all of our private rbio pages in the
+ * failed stripes as uptodate. This way finish_rmw will
+ * know they can be trusted. If this was a read reconstruction,
+ * other endio functions will fiddle the uptodate bits
+ */
+ if (!rbio->read_rebuild) {
+ for (i = 0; i < nr_pages; i++) {
+ if (faila != -1) {
+ page = rbio_stripe_page(rbio, faila, i);
+ SetPageUptodate(page);
+ }
+ if (failb != -1) {
+ page = rbio_stripe_page(rbio, failb, i);
+ SetPageUptodate(page);
+ }
+ }
+ }
+ for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+ /*
+ * if we're rebuilding a read, we have to use
+ * pages from the bio list
+ */
+ if (rbio->read_rebuild &&
+ (stripe == faila || stripe == failb)) {
+ page = page_in_rbio(rbio, stripe, pagenr, 0);
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+ kunmap(page);
+ }
+ }
+
+ err = 0;
+cleanup:
+ kfree(pointers);
+
+cleanup_io:
+
+ if (rbio->read_rebuild) {
+ if (err == 0)
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ rbio_orig_end_io(rbio, err, err == 0);
+ } else if (err == 0) {
+ rbio->faila = -1;
+ rbio->failb = -1;
+ finish_rmw(rbio);
+ } else {
+ rbio_orig_end_io(rbio, err, 0);
+ }
+}
+
+/*
+ * This is called only for stripes we've read from disk to
+ * reconstruct the parity.
+ */
+static void raid_recover_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ /*
+ * we only read stripe pages off the disk, set them
+ * up to date if there were no errors
+ */
+ if (err)
+ fail_bio_stripe(rbio, bio);
+ else
+ set_bio_pages_uptodate(bio);
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+ return;
+
+ if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+ rbio_orig_end_io(rbio, -EIO, 0);
+ else
+ __raid_recover_end_io(rbio);
+}
+
+/*
+ * reads everything we need off the disk to reconstruct
+ * the parity. endio handlers trigger final reconstruction
+ * when the IO is done.
+ *
+ * This is used both for reads from the higher layers and for
+ * parity construction required to finish a rmw cycle.
+ */
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct btrfs_bio *bbio = rbio->bbio;
+ struct bio_list bio_list;
+ int ret;
+ int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int pagenr;
+ int stripe;
+ struct bio *bio;
+
+ bio_list_init(&bio_list);
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ atomic_set(&rbio->bbio->error, 0);
+
+ /*
+ * read everything that hasn't failed. Thanks to the
+ * stripe cache, it is possible that some or all of these
+ * pages are going to be uptodate.
+ */
+ for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+ if (rbio->faila == stripe || rbio->failb == stripe) {
+ atomic_inc(&rbio->bbio->error);
+ continue;
+ }
+
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ struct page *p;
+
+ /*
+ * the rmw code may have already read this
+ * page in
+ */
+ p = rbio_stripe_page(rbio, stripe, pagenr);
+ if (PageUptodate(p))
+ continue;
+
+ ret = rbio_add_io_page(rbio, &bio_list,
+ rbio_stripe_page(rbio, stripe, pagenr),
+ stripe, pagenr, rbio->stripe_len);
+ if (ret < 0)
+ goto cleanup;
+ }
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * we might have no bios to read just because the pages
+ * were up to date, or we might have no bios to read because
+ * the devices were gone.
+ */
+ if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+ __raid_recover_end_io(rbio);
+ goto out;
+ } else {
+ goto cleanup;
+ }
+ }
+
+ /*
+ * the bbio may be freed once we submit the last bio. Make sure
+ * not to touch it after that
+ */
+ atomic_set(&bbio->stripes_pending, bios_to_read);
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_recover_end_io;
+
+ btrfs_bio_wq_end_io(rbio->fs_info, bio,
+ BTRFS_WQ_ENDIO_RAID56);
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(READ, bio);
+ }
+out:
+ return 0;
+
+cleanup:
+ if (rbio->read_rebuild)
+ rbio_orig_end_io(rbio, -EIO, 0);
+ return -EIO;
+}
+
+/*
+ * the main entry point for reads from the higher layers. This
+ * is really only called when the normal read path had a failure,
+ * so we assume the bio they send down corresponds to a failed part
+ * of the drive.
+ */
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len, int mirror_num)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ if (IS_ERR(rbio))
+ return PTR_ERR(rbio);
+
+ rbio->read_rebuild = 1;
+ bio_list_add(&rbio->bio_list, bio);
+ rbio->bio_list_bytes = bio->bi_iter.bi_size;
+
+ rbio->faila = find_logical_bio_stripe(rbio, bio);
+ if (rbio->faila == -1) {
+ BUG();
+ kfree(raid_map);
+ kfree(bbio);
+ kfree(rbio);
+ return -EIO;
+ }
+
+ /*
+ * reconstruct from the q stripe if they are
+ * asking for mirror 3
+ */
+ if (mirror_num == 3)
+ rbio->failb = bbio->num_stripes - 2;
+
+ ret = lock_stripe_add(rbio);
+
+ /*
+ * __raid56_parity_recover will end the bio with
+ * any errors it hits. We don't want to return
+ * its error value up the stack because our caller
+ * will end up calling bio_endio with any nonzero
+ * return
+ */
+ if (ret == 0)
+ __raid56_parity_recover(rbio);
+ /*
+ * our rbio has been added to the list of
+ * rbios that will be handled after the
+ * current lock owner is done
+ */
+ return 0;
+
+}
+
+static void rmw_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ raid56_rmw_stripe(rbio);
+}
+
+static void read_rebuild_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ __raid56_parity_recover(rbio);
+}
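
The write path in this file ultimately rests on the classic RAID5 rule that
finish_rmw() applies via run_xor(): the P stripe is the byte-wise XOR of the
data stripes, so any single missing data page can be rebuilt by XOR-ing P with
the surviving data. A stand-alone sketch of that invariant in plain user-space
C (names are illustrative, not kernel code):

#include <stddef.h>
#include <string.h>

/* XOR src into dest, byte by byte */
static void xor_into(unsigned char *dest, const unsigned char *src, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dest[i] ^= src[i];
}

/* P = data[0] ^ data[1] ^ ... ^ data[nr_data - 1] */
static void compute_parity(unsigned char *p, unsigned char * const *data,
			   int nr_data, size_t stripe_len)
{
	int i;

	memset(p, 0, stripe_len);
	for (i = 0; i < nr_data; i++)
		xor_into(p, data[i], stripe_len);
}

/* rebuild one lost data stripe from P and the remaining data stripes */
static void rebuild_data(unsigned char *lost, const unsigned char *p,
			 unsigned char * const *data, int nr_data,
			 int lost_idx, size_t stripe_len)
{
	int i;

	memcpy(lost, p, stripe_len);
	for (i = 0; i < nr_data; i++)
		if (i != lost_idx)
			xor_into(lost, data[i], stripe_len);
}
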
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 00000000000..ea5d73bfdfb
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_RAID56__
+#define __BTRFS_RAID56__
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+ if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ return 1;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ return 2;
+ else
+ return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+ return map->num_stripes - nr_parity_stripes(map);
+}
+#define RAID5_P_STRIPE ((u64)-2)
+#define RAID6_Q_STRIPE ((u64)-1)
+
+#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
+ ((x) == RAID6_Q_STRIPE))
+
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len, int mirror_num);
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 *raid_map,
+ u64 stripe_len);
+
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
+#endif
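
RAID5_P_STRIPE and RAID6_Q_STRIPE are sentinel logical addresses stored in the
trailing slots of the raid_map array; raid56.c relies on them (for example in
alloc_rbio() and the recovery path) to tell parity slots from data slots. A
sketch of that assumed layout, using an illustrative helper that is not part
of this patch:

/* data stripes occupy the leading slots; P, then Q for raid6, come last */
static void example_mark_parity(u64 *raid_map, int num_stripes, int is_raid6)
{
	if (is_raid6) {
		raid_map[num_stripes - 2] = RAID5_P_STRIPE;
		raid_map[num_stripes - 1] = RAID6_Q_STRIPE;
	} else {
		raid_map[num_stripes - 1] = RAID5_P_STRIPE;
	}
}
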
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
new file mode 100644
index 00000000000..9e111e4576d
--- /dev/null
+++ b/fs/btrfs/rcu-string.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+struct rcu_string {
+ struct rcu_head rcu;
+ char str[0];
+};
+
+static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
+{
+ size_t len = strlen(src) + 1;
+ struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
+ (len * sizeof(char)), mask);
+ if (!ret)
+ return ret;
+ strncpy(ret->str, src, len);
+ return ret;
+}
+
+static inline void rcu_string_free(struct rcu_string *str)
+{
+ if (str)
+ kfree_rcu(str, rcu);
+}
+
+#define printk_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define printk_ratelimited_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk_ratelimited(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define rcu_str_deref(rcu_str) ({ \
+ struct rcu_string *__str = rcu_dereference(rcu_str); \
+ __str->str; \
+})
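
Since rcu-string.h above is the whole helper, a short hedged sketch of how a caller might pair it with the stock RCU primitives may help. The demo_dev structure and function names below are invented for illustration; only rcu_string_strdup(), rcu_string_free(), rcu_str_deref() and printk_in_rcu() come from this header, the rest are standard kernel RCU calls.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/errno.h>
#include "rcu-string.h"

/* illustrative only: a device-like object whose name readers may print
 * while a writer replaces it */
struct demo_dev {
	struct rcu_string __rcu *name;
};

/* writer side: callers are assumed to serialize against each other */
static int demo_set_name(struct demo_dev *dev, const char *new_name)
{
	struct rcu_string *name, *old;

	name = rcu_string_strdup(new_name, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	old = rcu_dereference_protected(dev->name, 1);
	rcu_assign_pointer(dev->name, name);
	rcu_string_free(old);	/* old string freed after a grace period */
	return 0;
}

/* reader side: rcu_str_deref() is only valid under rcu_read_lock(),
 * which printk_in_rcu() takes for us */
static void demo_print_name(struct demo_dev *dev)
{
	printk_in_rcu(KERN_INFO "demo device name: %s\n",
		      rcu_str_deref(dev->name));
}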
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 00000000000..09230cf3a24
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,994 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "dev-replace.h"
+
+#undef DEBUG
+
+/*
+ * This is the implementation for the generic read ahead framework.
+ *
+ * To trigger a readahead, btrfs_reada_add must be called. It will start
+ * a read ahead for the given range [start, end) on tree root. The returned
+ * handle can either be used to wait on the readahead to finish
+ * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
+ *
+ * The read ahead works as follows:
+ * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
+ * reada_start_machine will then search for extents to prefetch and trigger
+ * some reads. When a read finishes for a node, all contained node/leaf
+ * pointers that lie in the given range will also be enqueued. The reads will
+ * be triggered in sequential order, thus giving a big win over a naive
+ * enumeration. It will also make use of multi-device layouts. Each disk
+ * will have its own read pointer and all disks will be utilized in parallel.
+ * Also, no two disks will read both sides of a mirror simultaneously, as this
+ * would waste seeking capacity. Instead both disks will read different parts
+ * of the filesystem.
+ * Any number of readaheads can be started in parallel. The read order will be
+ * determined globally, i.e. 2 parallel readaheads will normally finish faster
+ * than the 2 started one after another.
+ */
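
A hedged sketch of a caller of this interface may make the flow concrete. btrfs_reada_add(), btrfs_reada_wait() and btrfs_reada_detach() are the functions implemented later in this file; the helper name, the use of the extent root and the whole-tree key range are invented for the example, and real callers pick their own range.

/*
 * Illustrative caller only: the helper name and the choice of range are
 * made up; the three btrfs_reada_*() calls are the interface implemented
 * later in this file.
 */
static void demo_prefetch_tree(struct btrfs_fs_info *fs_info)
{
	struct reada_control *rc;
	struct btrfs_key start = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key end = {
		.objectid = (u64)-1,
		.type = (u8)-1,
		.offset = (u64)-1,
	};

	/* queue readahead of the whole extent tree */
	rc = btrfs_reada_add(fs_info->extent_root, &start, &end);
	if (IS_ERR(rc))
		return;	/* readahead is only an optimization, skip it */

	/* either block until everything queued has been read ... */
	btrfs_reada_wait(rc);

	/*
	 * ... or instead let it complete in the background and drop our
	 * reference right away:
	 *	btrfs_reada_detach(rc);
	 */
}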
+
+#define MAX_IN_FLIGHT 6
+
+struct reada_extctl {
+ struct list_head list;
+ struct reada_control *rc;
+ u64 generation;
+};
+
+struct reada_extent {
+ u64 logical;
+ struct btrfs_key top;
+ u32 blocksize;
+ int err;
+ struct list_head extctl;
+ int refcnt;
+ spinlock_t lock;
+ struct reada_zone *zones[BTRFS_MAX_MIRRORS];
+ int nzones;
+ struct btrfs_device *scheduled_for;
+};
+
+struct reada_zone {
+ u64 start;
+ u64 end;
+ u64 elems;
+ struct list_head list;
+ spinlock_t lock;
+ int locked;
+ struct btrfs_device *device;
+ struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
+ * self */
+ int ndevs;
+ struct kref refcnt;
+};
+
+struct reada_machine_work {
+ struct btrfs_work work;
+ struct btrfs_fs_info *fs_info;
+};
+
+static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
+static void reada_control_release(struct kref *kref);
+static void reada_zone_release(struct kref *kref);
+static void reada_start_machine(struct btrfs_fs_info *fs_info);
+static void __reada_start_machine(struct btrfs_fs_info *fs_info);
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+ struct btrfs_key *top, int level, u64 generation);
+
+/* recurses */
+/* in case of err, eb might be NULL */
+static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err)
+{
+ int level = 0;
+ int nritems;
+ int i;
+ u64 bytenr;
+ u64 generation;
+ struct reada_extent *re;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head list;
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ struct btrfs_device *for_dev;
+
+ if (eb)
+ level = btrfs_header_level(eb);
+
+ /* find extent */
+ spin_lock(&fs_info->reada_lock);
+ re = radix_tree_lookup(&fs_info->reada_tree, index);
+ if (re)
+ re->refcnt++;
+ spin_unlock(&fs_info->reada_lock);
+
+ if (!re)
+ return -1;
+
+ spin_lock(&re->lock);
+ /*
+ * just take the full list from the extent. afterwards we
+ * don't need the lock anymore
+ */
+ list_replace_init(&re->extctl, &list);
+ for_dev = re->scheduled_for;
+ re->scheduled_for = NULL;
+ spin_unlock(&re->lock);
+
+ if (err == 0) {
+ nritems = level ? btrfs_header_nritems(eb) : 0;
+ generation = btrfs_header_generation(eb);
+ /*
+ * FIXME: currently we just set nritems to 0 if this is a leaf,
+		 * effectively ignoring the content. As a next step we could
+		 * trigger more readahead depending on the content, e.g.
+ * fetch the checksums for the extents in the leaf.
+ */
+ } else {
+ /*
+ * this is the error case, the extent buffer has not been
+ * read correctly. We won't access anything from it and
+		 * just clean up our data structures. Effectively this will
+ * cut the branch below this node from read ahead.
+ */
+ nritems = 0;
+ generation = 0;
+ }
+
+ for (i = 0; i < nritems; i++) {
+ struct reada_extctl *rec;
+ u64 n_gen;
+ struct btrfs_key key;
+ struct btrfs_key next_key;
+
+ btrfs_node_key_to_cpu(eb, &key, i);
+ if (i + 1 < nritems)
+ btrfs_node_key_to_cpu(eb, &next_key, i + 1);
+ else
+ next_key = re->top;
+ bytenr = btrfs_node_blockptr(eb, i);
+ n_gen = btrfs_node_ptr_generation(eb, i);
+
+ list_for_each_entry(rec, &list, list) {
+ struct reada_control *rc = rec->rc;
+
+ /*
+ * if the generation doesn't match, just ignore this
+ * extctl. This will probably cut off a branch from
+ * prefetch. Alternatively one could start a new (sub-)
+ * prefetch for this branch, starting again from root.
+ * FIXME: move the generation check out of this loop
+ */
+#ifdef DEBUG
+ if (rec->generation != generation) {
+ btrfs_debug(root->fs_info,
+ "generation mismatch for (%llu,%d,%llu) %llu != %llu",
+ key.objectid, key.type, key.offset,
+ rec->generation, generation);
+ }
+#endif
+ if (rec->generation == generation &&
+ btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
+ btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
+ reada_add_block(rc, bytenr, &next_key,
+ level - 1, n_gen);
+ }
+ }
+ /*
+ * free extctl records
+ */
+ while (!list_empty(&list)) {
+ struct reada_control *rc;
+ struct reada_extctl *rec;
+
+ rec = list_first_entry(&list, struct reada_extctl, list);
+ list_del(&rec->list);
+ rc = rec->rc;
+ kfree(rec);
+
+ kref_get(&rc->refcnt);
+ if (atomic_dec_and_test(&rc->elems)) {
+ kref_put(&rc->refcnt, reada_control_release);
+ wake_up(&rc->wait);
+ }
+ kref_put(&rc->refcnt, reada_control_release);
+
+ reada_extent_put(fs_info, re); /* one ref for each entry */
+ }
+ reada_extent_put(fs_info, re); /* our ref */
+ if (for_dev)
+ atomic_dec(&for_dev->reada_in_flight);
+
+ return 0;
+}
+
+/*
+ * start is passed separately in case eb is NULL, which may be the case with
+ * failed I/O
+ */
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err)
+{
+ int ret;
+
+ ret = __readahead_hook(root, eb, start, err);
+
+ reada_start_machine(root->fs_info);
+
+ return ret;
+}
+
+static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev, u64 logical,
+ struct btrfs_bio *bbio)
+{
+ int ret;
+ struct reada_zone *zone;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 start;
+ u64 end;
+ int i;
+
+ zone = NULL;
+ spin_lock(&fs_info->reada_lock);
+ ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+ logical >> PAGE_CACHE_SHIFT, 1);
+ if (ret == 1)
+ kref_get(&zone->refcnt);
+ spin_unlock(&fs_info->reada_lock);
+
+ if (ret == 1) {
+ if (logical >= zone->start && logical < zone->end)
+ return zone;
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+ if (!cache)
+ return NULL;
+
+ start = cache->key.objectid;
+ end = start + cache->key.offset - 1;
+ btrfs_put_block_group(cache);
+
+ zone = kzalloc(sizeof(*zone), GFP_NOFS);
+ if (!zone)
+ return NULL;
+
+ zone->start = start;
+ zone->end = end;
+ INIT_LIST_HEAD(&zone->list);
+ spin_lock_init(&zone->lock);
+ zone->locked = 0;
+ kref_init(&zone->refcnt);
+ zone->elems = 0;
+ zone->device = dev; /* our device always sits at index 0 */
+ for (i = 0; i < bbio->num_stripes; ++i) {
+ /* bounds have already been checked */
+ zone->devs[i] = bbio->stripes[i].dev;
+ }
+ zone->ndevs = bbio->num_stripes;
+
+ spin_lock(&fs_info->reada_lock);
+ ret = radix_tree_insert(&dev->reada_zones,
+ (unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
+ zone);
+
+ if (ret == -EEXIST) {
+ kfree(zone);
+ ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+ logical >> PAGE_CACHE_SHIFT, 1);
+ if (ret == 1)
+ kref_get(&zone->refcnt);
+ }
+ spin_unlock(&fs_info->reada_lock);
+
+ return zone;
+}
+
+static struct reada_extent *reada_find_extent(struct btrfs_root *root,
+ u64 logical,
+ struct btrfs_key *top, int level)
+{
+ int ret;
+ struct reada_extent *re = NULL;
+ struct reada_extent *re_exist = NULL;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_bio *bbio = NULL;
+ struct btrfs_device *dev;
+ struct btrfs_device *prev_dev;
+ u32 blocksize;
+ u64 length;
+ int nzones = 0;
+ int i;
+ unsigned long index = logical >> PAGE_CACHE_SHIFT;
+ int dev_replace_is_ongoing;
+
+ spin_lock(&fs_info->reada_lock);
+ re = radix_tree_lookup(&fs_info->reada_tree, index);
+ if (re)
+ re->refcnt++;
+ spin_unlock(&fs_info->reada_lock);
+
+ if (re)
+ return re;
+
+ re = kzalloc(sizeof(*re), GFP_NOFS);
+ if (!re)
+ return NULL;
+
+ blocksize = btrfs_level_size(root, level);
+ re->logical = logical;
+ re->blocksize = blocksize;
+ re->top = *top;
+ INIT_LIST_HEAD(&re->extctl);
+ spin_lock_init(&re->lock);
+ re->refcnt = 1;
+
+ /*
+ * map block
+ */
+ length = blocksize;
+ ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+ &bbio, 0);
+ if (ret || !bbio || length < blocksize)
+ goto error;
+
+ if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
+ btrfs_err(root->fs_info,
+ "readahead: more than %d copies not supported",
+ BTRFS_MAX_MIRRORS);
+ goto error;
+ }
+
+ for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
+ struct reada_zone *zone;
+
+ dev = bbio->stripes[nzones].dev;
+ zone = reada_find_zone(fs_info, dev, logical, bbio);
+ if (!zone)
+ break;
+
+ re->zones[nzones] = zone;
+ spin_lock(&zone->lock);
+ if (!zone->elems)
+ kref_get(&zone->refcnt);
+ ++zone->elems;
+ spin_unlock(&zone->lock);
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+ re->nzones = nzones;
+ if (nzones == 0) {
+ /* not a single zone found, error and out */
+ goto error;
+ }
+
+ /* insert extent in reada_tree + all per-device trees, all or nothing */
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ spin_lock(&fs_info->reada_lock);
+ ret = radix_tree_insert(&fs_info->reada_tree, index, re);
+ if (ret == -EEXIST) {
+ re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
+ BUG_ON(!re_exist);
+ re_exist->refcnt++;
+ spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ goto error;
+ }
+ if (ret) {
+ spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ goto error;
+ }
+ prev_dev = NULL;
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
+ &fs_info->dev_replace);
+ for (i = 0; i < nzones; ++i) {
+ dev = bbio->stripes[i].dev;
+ if (dev == prev_dev) {
+ /*
+ * in case of DUP, just add the first zone. As both
+ * are on the same device, there's nothing to gain
+ * from adding both.
+ * Also, it wouldn't work, as the tree is per device
+ * and adding would fail with EEXIST
+ */
+ continue;
+ }
+ if (!dev->bdev) {
+ /*
+ * cannot read ahead on missing device, but for RAID5/6,
+			 * REQ_GET_READ_MIRRORS returns 1. So don't skip a
+			 * missing device in that case.
+ */
+ if (nzones > 1)
+ continue;
+ }
+ if (dev_replace_is_ongoing &&
+ dev == fs_info->dev_replace.tgtdev) {
+ /*
+ * as this device is selected for reading only as
+ * a last resort, skip it for read ahead.
+ */
+ continue;
+ }
+ prev_dev = dev;
+ ret = radix_tree_insert(&dev->reada_extents, index, re);
+ if (ret) {
+ while (--i >= 0) {
+ dev = bbio->stripes[i].dev;
+ BUG_ON(dev == NULL);
+ /* ignore whether the entry was inserted */
+ radix_tree_delete(&dev->reada_extents, index);
+ }
+ BUG_ON(fs_info == NULL);
+ radix_tree_delete(&fs_info->reada_tree, index);
+ spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ goto error;
+ }
+ }
+ spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+ kfree(bbio);
+ return re;
+
+error:
+ while (nzones) {
+ struct reada_zone *zone;
+
+ --nzones;
+ zone = re->zones[nzones];
+ kref_get(&zone->refcnt);
+ spin_lock(&zone->lock);
+ --zone->elems;
+ if (zone->elems == 0) {
+ /*
+ * no fs_info->reada_lock needed, as this can't be
+ * the last ref
+ */
+ kref_put(&zone->refcnt, reada_zone_release);
+ }
+ spin_unlock(&zone->lock);
+
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+ kfree(bbio);
+ kfree(re);
+ return re_exist;
+}
+
+static void reada_extent_put(struct btrfs_fs_info *fs_info,
+ struct reada_extent *re)
+{
+ int i;
+ unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
+
+ spin_lock(&fs_info->reada_lock);
+ if (--re->refcnt) {
+ spin_unlock(&fs_info->reada_lock);
+ return;
+ }
+
+ radix_tree_delete(&fs_info->reada_tree, index);
+ for (i = 0; i < re->nzones; ++i) {
+ struct reada_zone *zone = re->zones[i];
+
+ radix_tree_delete(&zone->device->reada_extents, index);
+ }
+
+ spin_unlock(&fs_info->reada_lock);
+
+ for (i = 0; i < re->nzones; ++i) {
+ struct reada_zone *zone = re->zones[i];
+
+ kref_get(&zone->refcnt);
+ spin_lock(&zone->lock);
+ --zone->elems;
+ if (zone->elems == 0) {
+ /* no fs_info->reada_lock needed, as this can't be
+ * the last ref */
+ kref_put(&zone->refcnt, reada_zone_release);
+ }
+ spin_unlock(&zone->lock);
+
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+ if (re->scheduled_for)
+ atomic_dec(&re->scheduled_for->reada_in_flight);
+
+ kfree(re);
+}
+
+static void reada_zone_release(struct kref *kref)
+{
+ struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+
+ radix_tree_delete(&zone->device->reada_zones,
+ zone->end >> PAGE_CACHE_SHIFT);
+
+ kfree(zone);
+}
+
+static void reada_control_release(struct kref *kref)
+{
+ struct reada_control *rc = container_of(kref, struct reada_control,
+ refcnt);
+
+ kfree(rc);
+}
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+ struct btrfs_key *top, int level, u64 generation)
+{
+ struct btrfs_root *root = rc->root;
+ struct reada_extent *re;
+ struct reada_extctl *rec;
+
+ re = reada_find_extent(root, logical, top, level); /* takes one ref */
+ if (!re)
+ return -1;
+
+ rec = kzalloc(sizeof(*rec), GFP_NOFS);
+ if (!rec) {
+ reada_extent_put(root->fs_info, re);
+ return -1;
+ }
+
+ rec->rc = rc;
+ rec->generation = generation;
+ atomic_inc(&rc->elems);
+
+ spin_lock(&re->lock);
+ list_add_tail(&rec->list, &re->extctl);
+ spin_unlock(&re->lock);
+
+ /* leave the ref on the extent */
+
+ return 0;
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
+{
+ int i;
+ unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < zone->ndevs; ++i) {
+ struct reada_zone *peer;
+ peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
+ if (peer && peer->device != zone->device)
+ peer->locked = lock;
+ }
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static int reada_pick_zone(struct btrfs_device *dev)
+{
+ struct reada_zone *top_zone = NULL;
+ struct reada_zone *top_locked_zone = NULL;
+ u64 top_elems = 0;
+ u64 top_locked_elems = 0;
+ unsigned long index = 0;
+ int ret;
+
+ if (dev->reada_curr_zone) {
+ reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
+ kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
+ dev->reada_curr_zone = NULL;
+ }
+ /* pick the zone with the most elements */
+ while (1) {
+ struct reada_zone *zone;
+
+ ret = radix_tree_gang_lookup(&dev->reada_zones,
+ (void **)&zone, index, 1);
+ if (ret == 0)
+ break;
+ index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ if (zone->locked) {
+ if (zone->elems > top_locked_elems) {
+ top_locked_elems = zone->elems;
+ top_locked_zone = zone;
+ }
+ } else {
+ if (zone->elems > top_elems) {
+ top_elems = zone->elems;
+ top_zone = zone;
+ }
+ }
+ }
+ if (top_zone)
+ dev->reada_curr_zone = top_zone;
+ else if (top_locked_zone)
+ dev->reada_curr_zone = top_locked_zone;
+ else
+ return 0;
+
+ dev->reada_next = dev->reada_curr_zone->start;
+ kref_get(&dev->reada_curr_zone->refcnt);
+ reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
+
+ return 1;
+}
+
+static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev)
+{
+ struct reada_extent *re = NULL;
+ int mirror_num = 0;
+ struct extent_buffer *eb = NULL;
+ u64 logical;
+ u32 blocksize;
+ int ret;
+ int i;
+ int need_kick = 0;
+
+ spin_lock(&fs_info->reada_lock);
+ if (dev->reada_curr_zone == NULL) {
+ ret = reada_pick_zone(dev);
+ if (!ret) {
+ spin_unlock(&fs_info->reada_lock);
+ return 0;
+ }
+ }
+ /*
+ * FIXME currently we issue the reads one extent at a time. If we have
+	 * a contiguous block of extents, we could also coalesce them or use
+	 * plugging to speed things up.
+ */
+ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+ dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+ if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
+ ret = reada_pick_zone(dev);
+ if (!ret) {
+ spin_unlock(&fs_info->reada_lock);
+ return 0;
+ }
+ re = NULL;
+ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+ dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+ }
+ if (ret == 0) {
+ spin_unlock(&fs_info->reada_lock);
+ return 0;
+ }
+ dev->reada_next = re->logical + re->blocksize;
+ re->refcnt++;
+
+ spin_unlock(&fs_info->reada_lock);
+
+ /*
+ * find mirror num
+ */
+ for (i = 0; i < re->nzones; ++i) {
+ if (re->zones[i]->device == dev) {
+ mirror_num = i + 1;
+ break;
+ }
+ }
+ logical = re->logical;
+ blocksize = re->blocksize;
+
+ spin_lock(&re->lock);
+ if (re->scheduled_for == NULL) {
+ re->scheduled_for = dev;
+ need_kick = 1;
+ }
+ spin_unlock(&re->lock);
+
+ reada_extent_put(fs_info, re);
+
+ if (!need_kick)
+ return 0;
+
+ atomic_inc(&dev->reada_in_flight);
+ ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
+ mirror_num, &eb);
+ if (ret)
+ __readahead_hook(fs_info->extent_root, NULL, logical, ret);
+ else if (eb)
+ __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
+
+ if (eb)
+ free_extent_buffer(eb);
+
+ return 1;
+
+}
+
+static void reada_start_machine_worker(struct btrfs_work *work)
+{
+ struct reada_machine_work *rmw;
+ struct btrfs_fs_info *fs_info;
+ int old_ioprio;
+
+ rmw = container_of(work, struct reada_machine_work, work);
+ fs_info = rmw->fs_info;
+
+ kfree(rmw);
+
+ old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+ task_nice_ioprio(current));
+ set_task_ioprio(current, BTRFS_IOPRIO_READA);
+ __reada_start_machine(fs_info);
+ set_task_ioprio(current, old_ioprio);
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u64 enqueued;
+ u64 total = 0;
+ int i;
+
+ do {
+ enqueued = 0;
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (atomic_read(&device->reada_in_flight) <
+ MAX_IN_FLIGHT)
+ enqueued += reada_start_machine_dev(fs_info,
+ device);
+ }
+ total += enqueued;
+ } while (enqueued && total < 10000);
+
+ if (enqueued == 0)
+ return;
+
+ /*
+ * If everything is already in the cache, this is effectively single
+ * threaded. To a) not hold the caller for too long and b) to utilize
+ * more cores, we broke the loop above after 10000 iterations and now
+ * enqueue to workers to finish it. This will distribute the load to
+ * the cores.
+ */
+ for (i = 0; i < 2; ++i)
+ reada_start_machine(fs_info);
+}
+
+static void reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct reada_machine_work *rmw;
+
+ rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
+ if (!rmw) {
+ /* FIXME we cannot handle this properly right now */
+ BUG();
+ }
+ btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
+ rmw->fs_info = fs_info;
+
+ btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
+}
+
+#ifdef DEBUG
+static void dump_devs(struct btrfs_fs_info *fs_info, int all)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ unsigned long index;
+ int ret;
+ int i;
+ int j;
+ int cnt;
+
+ spin_lock(&fs_info->reada_lock);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
+ atomic_read(&device->reada_in_flight));
+ index = 0;
+ while (1) {
+ struct reada_zone *zone;
+ ret = radix_tree_gang_lookup(&device->reada_zones,
+ (void **)&zone, index, 1);
+ if (ret == 0)
+ break;
+ printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
+ "%d devs", zone->start, zone->end, zone->elems,
+ zone->locked);
+ for (j = 0; j < zone->ndevs; ++j) {
+ printk(KERN_CONT " %lld",
+ zone->devs[j]->devid);
+ }
+ if (device->reada_curr_zone == zone)
+ printk(KERN_CONT " curr off %llu",
+ device->reada_next - zone->start);
+ printk(KERN_CONT "\n");
+ index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ }
+ cnt = 0;
+ index = 0;
+ while (all) {
+ struct reada_extent *re = NULL;
+
+ ret = radix_tree_gang_lookup(&device->reada_extents,
+ (void **)&re, index, 1);
+ if (ret == 0)
+ break;
+ printk(KERN_DEBUG
+ " re: logical %llu size %u empty %d for %lld",
+ re->logical, re->blocksize,
+ list_empty(&re->extctl), re->scheduled_for ?
+ re->scheduled_for->devid : -1);
+
+ for (i = 0; i < re->nzones; ++i) {
+ printk(KERN_CONT " zone %llu-%llu devs",
+ re->zones[i]->start,
+ re->zones[i]->end);
+ for (j = 0; j < re->zones[i]->ndevs; ++j) {
+ printk(KERN_CONT " %lld",
+ re->zones[i]->devs[j]->devid);
+ }
+ }
+ printk(KERN_CONT "\n");
+ index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ if (++cnt > 15)
+ break;
+ }
+ }
+
+ index = 0;
+ cnt = 0;
+ while (all) {
+ struct reada_extent *re = NULL;
+
+ ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
+ index, 1);
+ if (ret == 0)
+ break;
+ if (!re->scheduled_for) {
+ index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ continue;
+ }
+ printk(KERN_DEBUG
+ "re: logical %llu size %u list empty %d for %lld",
+ re->logical, re->blocksize, list_empty(&re->extctl),
+ re->scheduled_for ? re->scheduled_for->devid : -1);
+		for (i = 0; i < re->nzones; ++i) {
+			printk(KERN_CONT " zone %llu-%llu devs",
+				re->zones[i]->start,
+				re->zones[i]->end);
+			for (j = 0; j < re->zones[i]->ndevs; ++j) {
+				printk(KERN_CONT " %lld",
+				       re->zones[i]->devs[j]->devid);
+			}
+		}
+ printk(KERN_CONT "\n");
+ index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ }
+ spin_unlock(&fs_info->reada_lock);
+}
+#endif
+
+/*
+ * interface
+ */
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+ struct btrfs_key *key_start, struct btrfs_key *key_end)
+{
+ struct reada_control *rc;
+ u64 start;
+ u64 generation;
+ int level;
+ struct extent_buffer *node;
+ static struct btrfs_key max_key = {
+ .objectid = (u64)-1,
+ .type = (u8)-1,
+ .offset = (u64)-1
+ };
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc)
+ return ERR_PTR(-ENOMEM);
+
+ rc->root = root;
+ rc->key_start = *key_start;
+ rc->key_end = *key_end;
+ atomic_set(&rc->elems, 0);
+ init_waitqueue_head(&rc->wait);
+ kref_init(&rc->refcnt);
+ kref_get(&rc->refcnt); /* one ref for having elements */
+
+ node = btrfs_root_node(root);
+ start = node->start;
+ level = btrfs_header_level(node);
+ generation = btrfs_header_generation(node);
+ free_extent_buffer(node);
+
+ if (reada_add_block(rc, start, &max_key, level, generation)) {
+ kfree(rc);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ reada_start_machine(root->fs_info);
+
+ return rc;
+}
+
+#ifdef DEBUG
+int btrfs_reada_wait(void *handle)
+{
+ struct reada_control *rc = handle;
+
+ while (atomic_read(&rc->elems)) {
+ wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
+ 5 * HZ);
+ dump_devs(rc->root->fs_info,
+ atomic_read(&rc->elems) < 10 ? 1 : 0);
+ }
+
+ dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
+
+ kref_put(&rc->refcnt, reada_control_release);
+
+ return 0;
+}
+#else
+int btrfs_reada_wait(void *handle)
+{
+ struct reada_control *rc = handle;
+
+ while (atomic_read(&rc->elems)) {
+ wait_event(rc->wait, atomic_read(&rc->elems) == 0);
+ }
+
+ kref_put(&rc->refcnt, reada_control_release);
+
+ return 0;
+}
+#endif
+
+void btrfs_reada_detach(void *handle)
+{
+ struct reada_control *rc = handle;
+
+ kref_put(&rc->refcnt, reada_control_release);
+}
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
deleted file mode 100644
index a97314cf6bd..00000000000
--- a/fs/btrfs/ref-cache.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/sort.h>
-#include "ctree.h"
-#include "ref-cache.h"
-#include "transaction.h"
-
-/*
- * leaf refs are used to cache the information about which extents
- * a given leaf has references on. This allows us to process that leaf
- * in btrfs_drop_snapshot without needing to read it back from disk.
- */
-
-/*
- * kmalloc a leaf reference struct and update the counters for the
- * total ref cache size
- */
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
- int nr_extents)
-{
- struct btrfs_leaf_ref *ref;
- size_t size = btrfs_leaf_ref_size(nr_extents);
-
- ref = kmalloc(size, GFP_NOFS);
- if (ref) {
- spin_lock(&root->fs_info->ref_cache_lock);
- root->fs_info->total_ref_cache_size += size;
- spin_unlock(&root->fs_info->ref_cache_lock);
-
- memset(ref, 0, sizeof(*ref));
- atomic_set(&ref->usage, 1);
- INIT_LIST_HEAD(&ref->list);
- }
- return ref;
-}
-
-/*
- * free a leaf reference struct and update the counters for the
- * total ref cache size
- */
-void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
-{
- if (!ref)
- return;
- WARN_ON(atomic_read(&ref->usage) == 0);
- if (atomic_dec_and_test(&ref->usage)) {
- size_t size = btrfs_leaf_ref_size(ref->nritems);
-
- BUG_ON(ref->in_tree);
- kfree(ref);
-
- spin_lock(&root->fs_info->ref_cache_lock);
- root->fs_info->total_ref_cache_size -= size;
- spin_unlock(&root->fs_info->ref_cache_lock);
- }
-}
-
-static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct btrfs_leaf_ref *entry;
-
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
-
- if (bytenr < entry->bytenr)
- p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
- p = &(*p)->rb_right;
- else
- return parent;
- }
-
- entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
-static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
-{
- struct rb_node *n = root->rb_node;
- struct btrfs_leaf_ref *entry;
-
- while (n) {
- entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
- WARN_ON(!entry->in_tree);
-
- if (bytenr < entry->bytenr)
- n = n->rb_left;
- else if (bytenr > entry->bytenr)
- n = n->rb_right;
- else
- return n;
- }
- return NULL;
-}
-
-int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
- int shared)
-{
- struct btrfs_leaf_ref *ref = NULL;
- struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-
- if (shared)
- tree = &root->fs_info->shared_ref_tree;
- if (!tree)
- return 0;
-
- spin_lock(&tree->lock);
- while (!list_empty(&tree->list)) {
- ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
- BUG_ON(ref->tree != tree);
- if (ref->root_gen > max_root_gen)
- break;
- if (!xchg(&ref->in_tree, 0)) {
- cond_resched_lock(&tree->lock);
- continue;
- }
-
- rb_erase(&ref->rb_node, &tree->root);
- list_del_init(&ref->list);
-
- spin_unlock(&tree->lock);
- btrfs_free_leaf_ref(root, ref);
- cond_resched();
- spin_lock(&tree->lock);
- }
- spin_unlock(&tree->lock);
- return 0;
-}
-
-/*
- * find the leaf ref for a given extent. This returns the ref struct with
- * a usage reference incremented
- */
-struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
- u64 bytenr)
-{
- struct rb_node *rb;
- struct btrfs_leaf_ref *ref = NULL;
- struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-again:
- if (tree) {
- spin_lock(&tree->lock);
- rb = tree_search(&tree->root, bytenr);
- if (rb)
- ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
- if (ref)
- atomic_inc(&ref->usage);
- spin_unlock(&tree->lock);
- if (ref)
- return ref;
- }
- if (tree != &root->fs_info->shared_ref_tree) {
- tree = &root->fs_info->shared_ref_tree;
- goto again;
- }
- return NULL;
-}
-
-/*
- * add a fully filled in leaf ref struct
- * remove all the refs older than a given root generation
- */
-int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
- int shared)
-{
- int ret = 0;
- struct rb_node *rb;
- struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-
- if (shared)
- tree = &root->fs_info->shared_ref_tree;
-
- spin_lock(&tree->lock);
- rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
- if (rb) {
- ret = -EEXIST;
- } else {
- atomic_inc(&ref->usage);
- ref->tree = tree;
- ref->in_tree = 1;
- list_add_tail(&ref->list, &tree->list);
- }
- spin_unlock(&tree->lock);
- return ret;
-}
-
-/*
- * remove a single leaf ref from the tree. This drops the ref held by the tree
- * only
- */
-int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
-{
- struct btrfs_leaf_ref_tree *tree;
-
- if (!xchg(&ref->in_tree, 0))
- return 0;
-
- tree = ref->tree;
- spin_lock(&tree->lock);
-
- rb_erase(&ref->rb_node, &tree->root);
- list_del_init(&ref->list);
-
- spin_unlock(&tree->lock);
-
- btrfs_free_leaf_ref(root, ref);
- return 0;
-}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
deleted file mode 100644
index e2a55cb2072..00000000000
--- a/fs/btrfs/ref-cache.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-#ifndef __REFCACHE__
-#define __REFCACHE__
-
-struct btrfs_extent_info {
- /* bytenr and num_bytes find the extent in the extent allocation tree */
- u64 bytenr;
- u64 num_bytes;
-
- /* objectid and offset find the back reference for the file */
- u64 objectid;
- u64 offset;
-};
-
-struct btrfs_leaf_ref {
- struct rb_node rb_node;
- struct btrfs_leaf_ref_tree *tree;
- int in_tree;
- atomic_t usage;
-
- u64 root_gen;
- u64 bytenr;
- u64 owner;
- u64 generation;
- int nritems;
-
- struct list_head list;
- struct btrfs_extent_info extents[];
-};
-
-static inline size_t btrfs_leaf_ref_size(int nr_extents)
-{
- return sizeof(struct btrfs_leaf_ref) +
- sizeof(struct btrfs_extent_info) * nr_extents;
-}
-
-static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
-{
- tree->root = RB_ROOT;
- INIT_LIST_HEAD(&tree->list);
- spin_lock_init(&tree->lock);
-}
-
-static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
-{
- return RB_EMPTY_ROOT(&tree->root);
-}
-
-void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
- int nr_extents);
-void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
- u64 bytenr);
-int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
- int shared);
-int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
- int shared);
-int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 31ade5802ae..65245a07275 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -30,6 +30,7 @@
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
+#include "inode-map.h"
/*
* backref_node, mapping_node and tree_block start with this
@@ -93,6 +94,7 @@ struct backref_edge {
#define LOWER 0
#define UPPER 1
+#define RELOCATION_RESERVED_NODES 256
struct backref_cache {
/* red black tree of all backref nodes in the cache */
@@ -175,6 +177,8 @@ struct reloc_control {
u64 merging_rsv_size;
/* size of relocated tree nodes */
u64 nodes_relocated;
+ /* reserved size for block group relocation*/
+ u64 reserved_bytes;
u64 search_start;
u64 extents_found;
@@ -183,7 +187,6 @@ struct reloc_control {
unsigned int create_reloc_tree:1;
unsigned int merge_reloc_tree:1;
unsigned int found_file_extent:1;
- unsigned int commit_transaction:1;
};
/* stages of data relocation */
@@ -325,6 +328,18 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
return NULL;
}
+static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
+{
+
+ struct btrfs_fs_info *fs_info = NULL;
+ struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ if (bnode->root)
+ fs_info = bnode->root->fs_info;
+ btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
+ "found at offset %llu", bytenr);
+}
+
/*
* walk up backref nodes until reach node presents tree root
*/
@@ -451,7 +466,8 @@ static void update_backref_node(struct backref_cache *cache,
rb_erase(&node->rb_node, &cache->rb_root);
node->bytenr = bytenr;
rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, bytenr);
}
/*
@@ -507,11 +523,12 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
return 1;
}
+
static int should_ignore_root(struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
reloc_root = root->reloc_root;
@@ -529,7 +546,6 @@ static int should_ignore_root(struct btrfs_root *root)
*/
return 1;
}
-
/*
* find reloc tree by address of tree root
*/
@@ -557,7 +573,9 @@ static int is_cowonly_root(u64 root_objectid)
root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
root_objectid == BTRFS_DEV_TREE_OBJECTID ||
root_objectid == BTRFS_TREE_LOG_OBJECTID ||
- root_objectid == BTRFS_CSUM_TREE_OBJECTID)
+ root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
+ root_objectid == BTRFS_UUID_TREE_OBJECTID ||
+ root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
return 1;
return 0;
}
@@ -574,7 +592,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
else
key.offset = (u64)-1;
- return btrfs_read_fs_root_no_name(fs_info, &key);
+ return btrfs_get_fs_root(fs_info, &key, false);
}
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -592,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc,
root = read_fs_root(rc->extent_root->fs_info, root_objectid);
BUG_ON(IS_ERR(root));
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
generation != btrfs_root_generation(&root->root_item))
return NULL;
@@ -604,10 +622,13 @@ static noinline_for_stack
int find_inline_backref(struct extent_buffer *leaf, int slot,
unsigned long *ptr, unsigned long *end)
{
+ struct btrfs_key key;
struct btrfs_extent_item *ei;
struct btrfs_tree_block_info *bi;
u32 item_size;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
item_size = btrfs_item_size_nr(leaf, slot);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
if (item_size < sizeof(*ei)) {
@@ -619,13 +640,23 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
WARN_ON(!(btrfs_extent_flags(leaf, ei) &
BTRFS_EXTENT_FLAG_TREE_BLOCK));
- if (item_size <= sizeof(*ei) + sizeof(*bi)) {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ item_size <= sizeof(*ei) + sizeof(*bi)) {
WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
return 1;
}
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ item_size <= sizeof(*ei)) {
+ WARN_ON(item_size < sizeof(*ei));
+ return 1;
+ }
- bi = (struct btrfs_tree_block_info *)(ei + 1);
- *ptr = (unsigned long)(bi + 1);
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ *ptr = (unsigned long)(bi + 1);
+ } else {
+ *ptr = (unsigned long)(ei + 1);
+ }
*end = (unsigned long)ei + item_size;
return 0;
}
@@ -669,6 +700,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
int cowonly;
int ret;
int err = 0;
+ bool need_check = true;
path1 = btrfs_alloc_path();
path2 = btrfs_alloc_path();
@@ -676,6 +708,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
err = -ENOMEM;
goto out;
}
+ path1->reada = 1;
+ path2->reada = 2;
node = alloc_backref_node(cache);
if (!node) {
@@ -691,7 +725,7 @@ again:
end = 0;
ptr = 0;
key.objectid = cur->bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.type = BTRFS_METADATA_ITEM_KEY;
key.offset = (u64)-1;
path1->search_commit_root = 1;
@@ -709,7 +743,7 @@ again:
WARN_ON(cur->checked);
if (!list_empty(&cur->upper)) {
/*
- * the backref was added previously when processsing
+ * the backref was added previously when processing
* backref of type BTRFS_TREE_BLOCK_REF_KEY
*/
BUG_ON(!list_is_singular(&cur->upper));
@@ -749,7 +783,8 @@ again:
break;
}
- if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
ret = find_inline_backref(eb, path1->slots[0],
&ptr, &end);
if (ret)
@@ -852,7 +887,7 @@ again:
goto out;
}
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
cur->cowonly = 1;
if (btrfs_root_level(&root->root_item) == cur->level) {
@@ -889,6 +924,7 @@ again:
cur->bytenr);
lower = cur;
+ need_check = true;
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
BUG_ON(btrfs_root_bytenr(&root->root_item) !=
@@ -918,7 +954,8 @@ again:
upper->bytenr = eb->start;
upper->owner = btrfs_header_owner(eb);
upper->level = lower->level + 1;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
upper->cowonly = 1;
/*
@@ -932,14 +969,12 @@ again:
/*
* add the block to pending list if we
- * need check its backrefs. only block
- * at 'cur->level + 1' is added to the
- * tail of pending list. this guarantees
- * we check backrefs from lower level
- * blocks to upper level blocks.
+ * need check its backrefs, we only do this once
+ * while walking up a tree as we will catch
+ * anything else later on.
*/
- if (!upper->checked &&
- level == cur->level + 1) {
+ if (!upper->checked && need_check) {
+ need_check = false;
list_add_tail(&edge->list[UPPER],
&list);
} else
@@ -961,7 +996,7 @@ again:
lower = upper;
upper = NULL;
}
- btrfs_release_path(root, path2);
+ btrfs_release_path(path2);
next:
if (ptr < end) {
ptr += btrfs_extent_inline_ref_size(key.type);
@@ -974,7 +1009,7 @@ next:
if (ptr >= end)
path1->slots[0]++;
}
- btrfs_release_path(rc->extent_root, path1);
+ btrfs_release_path(path1);
cur->checked = 1;
WARN_ON(exist);
@@ -996,7 +1031,8 @@ next:
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, node->bytenr,
&node->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, node->bytenr);
list_add_tail(&node->lower, &cache->leaves);
}
@@ -1031,7 +1067,9 @@ next:
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST,
+ upper->bytenr);
}
list_add_tail(&edge->list[UPPER], &upper->lower);
@@ -1171,11 +1209,14 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
list_add_tail(&new_edge->list[UPPER],
&new_node->lower);
}
+ } else {
+ list_add_tail(&new_node->lower, &cache->leaves);
}
rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
&new_node->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
if (!new_node->lowest) {
list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
@@ -1198,14 +1239,15 @@ fail:
/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
-static int __add_reloc_root(struct btrfs_root *root)
+static int __must_check __add_reloc_root(struct btrfs_root *root)
{
struct rb_node *rb_node;
struct mapping_node *node;
struct reloc_control *rc = root->fs_info->reloc_ctl;
node = kmalloc(sizeof(*node), GFP_NOFS);
- BUG_ON(!node);
+ if (!node)
+ return -ENOMEM;
node->bytenr = root->node->start;
node->data = root;
@@ -1214,17 +1256,23 @@ static int __add_reloc_root(struct btrfs_root *root)
rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
- BUG_ON(rb_node);
+ if (rb_node) {
+ btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
+ "for start=%llu while inserting into relocation "
+ "tree", node->bytenr);
+ kfree(node);
+ return -EEXIST;
+ }
list_add_tail(&root->root_list, &rc->reloc_roots);
return 0;
}
/*
- * helper to update/delete the 'address of tree root -> reloc tree'
+ * helper to delete the 'address of tree root -> reloc tree'
* mapping
*/
-static int __update_reloc_root(struct btrfs_root *root, int del)
+static void __del_reloc_root(struct btrfs_root *root)
{
struct rb_node *rb_node;
struct mapping_node *node = NULL;
@@ -1232,26 +1280,53 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->commit_root->start);
+ root->node->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
}
spin_unlock(&rc->reloc_root_tree.lock);
+ if (!node)
+ return;
BUG_ON((struct btrfs_root *)node->data != root);
- if (!del) {
- spin_lock(&rc->reloc_root_tree.lock);
- node->bytenr = root->node->start;
- rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
- spin_unlock(&rc->reloc_root_tree.lock);
- BUG_ON(rb_node);
- } else {
- list_del_init(&root->root_list);
- kfree(node);
+ spin_lock(&root->fs_info->trans_lock);
+ list_del_init(&root->root_list);
+ spin_unlock(&root->fs_info->trans_lock);
+ kfree(node);
+}
+
+/*
+ * helper to update the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node = NULL;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+ root->node->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
}
+ spin_unlock(&rc->reloc_root_tree.lock);
+
+ if (!node)
+ return 0;
+ BUG_ON((struct btrfs_root *)node->data != root);
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ node->bytenr = new_bytenr;
+ rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, node->bytenr);
return 0;
}
@@ -1262,6 +1337,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
+ u64 last_snap = 0;
int ret;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1277,6 +1353,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(ret);
+ last_snap = btrfs_root_last_snapshot(&root->root_item);
btrfs_set_root_last_snapshot(&root->root_item,
trans->transid - 1);
} else {
@@ -1302,6 +1379,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
root_item->drop_level = 0;
+ /*
+ * abuse rtransid, it is safe because it is impossible to
+ * receive data into a relocation tree.
+ */
+ btrfs_set_root_rtransid(root_item, last_snap);
+ btrfs_set_root_otransid(root_item, trans->transid);
}
btrfs_tree_unlock(eb);
@@ -1312,8 +1395,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
BUG_ON(ret);
kfree(root_item);
- reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &root_key);
+ reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
BUG_ON(IS_ERR(reloc_root));
reloc_root->last_trans = trans->transid;
return reloc_root;
@@ -1328,7 +1410,9 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
{
struct btrfs_root *reloc_root;
struct reloc_control *rc = root->fs_info->reloc_ctl;
+ struct btrfs_block_rsv *rsv;
int clear_rsv = 0;
+ int ret;
if (root->reloc_root) {
reloc_root = root->reloc_root;
@@ -1340,15 +1424,17 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
return 0;
- if (!trans->block_rsv) {
+ if (!trans->reloc_reserved) {
+ rsv = trans->block_rsv;
trans->block_rsv = rc->block_rsv;
clear_rsv = 1;
}
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
if (clear_rsv)
- trans->block_rsv = NULL;
+ trans->block_rsv = rsv;
- __add_reloc_root(reloc_root);
+ ret = __add_reloc_root(reloc_root);
+ BUG_ON(ret < 0);
root->reloc_root = reloc_root;
return 0;
}
@@ -1361,11 +1447,10 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
{
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
- int del = 0;
int ret;
if (!root->reloc_root)
- return 0;
+ goto out;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -1373,11 +1458,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
if (root->fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
root->reloc_root = NULL;
- del = 1;
+ __del_reloc_root(reloc_root);
}
- __update_reloc_root(reloc_root, del);
-
if (reloc_root->commit_root != reloc_root->node) {
btrfs_set_root_node(root_item, reloc_root->node);
free_extent_buffer(reloc_root->commit_root);
@@ -1387,6 +1470,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_update_root(trans, root->fs_info->tree_root,
&reloc_root->root_key, root_item);
BUG_ON(ret);
+
+out:
return 0;
}
@@ -1409,9 +1494,9 @@ again:
prev = node;
entry = rb_entry(node, struct btrfs_inode, rb_node);
- if (objectid < entry->vfs_inode.i_ino)
+ if (objectid < btrfs_ino(&entry->vfs_inode))
node = node->rb_left;
- else if (objectid > entry->vfs_inode.i_ino)
+ else if (objectid > btrfs_ino(&entry->vfs_inode))
node = node->rb_right;
else
break;
@@ -1419,7 +1504,7 @@ again:
if (!node) {
while (prev) {
entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= entry->vfs_inode.i_ino) {
+ if (objectid <= btrfs_ino(&entry->vfs_inode)) {
node = prev;
break;
}
@@ -1434,7 +1519,7 @@ again:
return inode;
}
- objectid = entry->vfs_inode.i_ino + 1;
+ objectid = btrfs_ino(&entry->vfs_inode) + 1;
if (cond_resched_lock(&root->inode_lock))
goto again;
@@ -1470,7 +1555,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
return -ENOMEM;
bytenr -= BTRFS_I(reloc_inode)->index_cnt;
- ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
bytenr, 0);
if (ret < 0)
goto out;
@@ -1489,7 +1574,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
btrfs_file_extent_other_encoding(leaf, fi));
if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
- ret = 1;
+ ret = -EINVAL;
goto out;
}
@@ -1520,7 +1605,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
u64 end;
u32 nritems;
u32 i;
- int ret;
+ int ret = 0;
int first = 1;
int dirty = 0;
@@ -1558,11 +1643,11 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (first) {
inode = find_next_inode(root, key.objectid);
first = 0;
- } else if (inode && inode->i_ino < key.objectid) {
+ } else if (inode && btrfs_ino(inode) < key.objectid) {
btrfs_add_delayed_iput(inode);
inode = find_next_inode(root, key.objectid);
}
- if (inode && inode->i_ino == key.objectid) {
+ if (inode && btrfs_ino(inode) == key.objectid) {
end = key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
WARN_ON(!IS_ALIGNED(key.offset,
@@ -1570,25 +1655,26 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
WARN_ON(!IS_ALIGNED(end, root->sectorsize));
end--;
ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end,
- GFP_NOFS);
+ key.offset, end);
if (!ret)
continue;
btrfs_drop_extent_cache(inode, key.offset, end,
1);
unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end, GFP_NOFS);
+ key.offset, end);
}
}
ret = get_new_location(rc->data_inode, &new_bytenr,
bytenr, num_bytes);
- if (ret > 0) {
- WARN_ON(1);
- continue;
+ if (ret) {
+ /*
+ * Don't have to abort since we've not changed anything
+ * in the file extent yet.
+ */
+ break;
}
- BUG_ON(ret < 0);
btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
dirty = 1;
@@ -1597,19 +1683,25 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
num_bytes, parent,
btrfs_header_owner(leaf),
- key.objectid, key.offset);
- BUG_ON(ret);
+ key.objectid, key.offset, 1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ break;
+ }
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
parent, btrfs_header_owner(leaf),
- key.objectid, key.offset);
- BUG_ON(ret);
+ key.objectid, key.offset, 1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ break;
+ }
}
if (dirty)
btrfs_mark_buffer_dirty(leaf);
if (inode)
btrfs_add_delayed_iput(inode);
- return 0;
+ return ret;
}
static noinline_for_stack
@@ -1709,8 +1801,7 @@ again:
new_ptr_gen = 0;
}
- if (new_bytenr > 0 && new_bytenr == old_bytenr) {
- WARN_ON(1);
+ if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
ret = level;
break;
}
@@ -1724,6 +1815,11 @@ again:
eb = read_tree_block(dest, old_bytenr, blocksize,
old_ptr_gen);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ ret = (!eb) ? -ENOMEM : -EIO;
+ free_extent_buffer(eb);
+ break;
+ }
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -1748,7 +1844,7 @@ again:
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
- btrfs_release_path(src, path);
+ btrfs_release_path(path);
path->lowest_level = level;
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
@@ -1770,21 +1866,23 @@ again:
ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0);
+ src->root_key.objectid, level - 1, 0,
+ 1);
BUG_ON(ret);
ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0);
+ 0, 1);
BUG_ON(ret);
ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0);
+ src->root_key.objectid, level - 1, 0,
+ 1);
BUG_ON(ret);
ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0);
+ 0, 1);
BUG_ON(ret);
btrfs_unlock_up_safe(path, 0);
@@ -1874,6 +1972,10 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
blocksize = btrfs_level_size(root, i - 1);
eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
BUG_ON(btrfs_header_level(eb) != i - 1);
path->nodes[i - 1] = eb;
path->slots[i - 1] = 0;
@@ -1892,6 +1994,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
struct inode *inode = NULL;
u64 objectid;
u64 start, end;
+ u64 ino;
objectid = min_key->objectid;
while (1) {
@@ -1904,17 +2007,18 @@ static int invalidate_extent_cache(struct btrfs_root *root,
inode = find_next_inode(root, objectid);
if (!inode)
break;
+ ino = btrfs_ino(inode);
- if (inode->i_ino > max_key->objectid) {
+ if (ino > max_key->objectid) {
iput(inode);
break;
}
- objectid = inode->i_ino + 1;
+ objectid = ino + 1;
if (!S_ISREG(inode->i_mode))
continue;
- if (unlikely(min_key->objectid == inode->i_ino)) {
+ if (unlikely(min_key->objectid == ino)) {
if (min_key->type > BTRFS_EXTENT_DATA_KEY)
continue;
if (min_key->type < BTRFS_EXTENT_DATA_KEY)
@@ -1927,7 +2031,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
start = 0;
}
- if (unlikely(max_key->objectid == inode->i_ino)) {
+ if (unlikely(max_key->objectid == ino)) {
if (max_key->type < BTRFS_EXTENT_DATA_KEY)
continue;
if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
@@ -1944,9 +2048,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for readpage to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
btrfs_drop_extent_cache(inode, start, end, 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
}
return 0;
}
@@ -1979,12 +2083,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
LIST_HEAD(inode_list);
struct btrfs_key key;
struct btrfs_key next_key;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
- unsigned long nr;
int level;
int max_level;
int replaced = 0;
@@ -1995,6 +2098,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = 1;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -2028,18 +2132,19 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
memset(&next_key, 0, sizeof(next_key));
while (1) {
- trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
- trans->block_rsv = rc->block_rsv;
-
- ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
- min_reserved, 0);
+ ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
- BUG_ON(ret != -EAGAIN);
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
- continue;
+ err = ret;
+ goto out;
+ }
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
}
+ trans->block_rsv = rc->block_rsv;
replaced = 0;
max_level = level;
@@ -2084,10 +2189,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path->slots[level]);
root_item->drop_level = level;
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
+ trans = NULL;
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
@@ -2114,10 +2219,10 @@ out:
btrfs_update_reloc_root(trans, root);
}
- nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ if (trans)
+ btrfs_end_transaction_throttle(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
@@ -2135,20 +2240,21 @@ int prepare_to_merge(struct reloc_control *rc, int err)
u64 num_bytes = 0;
int ret;
- mutex_lock(&root->fs_info->trans_mutex);
+ mutex_lock(&root->fs_info->reloc_mutex);
rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
rc->merging_rsv_size += rc->nodes_relocated * 2;
- mutex_unlock(&root->fs_info->trans_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
- ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
- num_bytes);
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret)
err = ret;
}
- trans = btrfs_join_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
if (!err)
btrfs_block_rsv_release(rc->extent_root,
@@ -2198,18 +2304,40 @@ again:
}
static noinline_for_stack
+void free_reloc_roots(struct list_head *list)
+{
+ struct btrfs_root *reloc_root;
+
+ while (!list_empty(list)) {
+ reloc_root = list_entry(list->next, struct btrfs_root,
+ root_list);
+ __del_reloc_root(reloc_root);
+ }
+}
+
+static noinline_for_stack
int merge_reloc_roots(struct reloc_control *rc)
{
struct btrfs_root *root;
struct btrfs_root *reloc_root;
+ u64 last_snap;
+ u64 otransid;
+ u64 objectid;
LIST_HEAD(reloc_roots);
int found = 0;
- int ret;
+ int ret = 0;
again:
root = rc->extent_root;
- mutex_lock(&root->fs_info->trans_mutex);
+
+ /*
+ * this serializes us with btrfs_record_root_in_transaction;
+ * we have to make sure nobody is in the middle of
+ * adding their roots to the list while we are
+ * doing this splice
+ */
+ mutex_lock(&root->fs_info->reloc_mutex);
list_splice_init(&rc->reloc_roots, &reloc_roots);
- mutex_unlock(&root->fs_info->trans_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
while (!list_empty(&reloc_roots)) {
found = 1;
@@ -2223,19 +2351,53 @@ again:
BUG_ON(root->reloc_root != reloc_root);
ret = merge_reloc_root(rc, root);
- BUG_ON(ret);
+ if (ret) {
+ if (list_empty(&reloc_root->root_list))
+ list_add_tail(&reloc_root->root_list,
+ &reloc_roots);
+ goto out;
+ }
} else {
list_del_init(&reloc_root->root_list);
}
- btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
+
+ /*
+ * we keep the old last snapshot transid in rtransid when we
+ * created the relocation tree.
+ */
+ last_snap = btrfs_root_rtransid(&reloc_root->root_item);
+ otransid = btrfs_root_otransid(&reloc_root->root_item);
+ objectid = reloc_root->root_key.offset;
+
+ ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
+ if (ret < 0) {
+ if (list_empty(&reloc_root->root_list))
+ list_add_tail(&reloc_root->root_list,
+ &reloc_roots);
+ goto out;
+ }
}
if (found) {
found = 0;
goto again;
}
+out:
+ if (ret) {
+ btrfs_std_error(root->fs_info, ret);
+ if (!list_empty(&reloc_roots))
+ free_reloc_roots(&reloc_roots);
+
+ /* new reloc root may be added */
+ mutex_lock(&root->fs_info->reloc_mutex);
+ list_splice_init(&rc->reloc_roots, &reloc_roots);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ if (!list_empty(&reloc_roots))
+ free_reloc_roots(&reloc_roots);
+ }
+
BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
- return 0;
+ return ret;
}
static void free_block_list(struct rb_root *blocks)
@@ -2268,7 +2430,7 @@ static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct backref_node *node,
- struct backref_edge *edges[], int *nr)
+ struct backref_edge *edges[])
{
struct backref_node *next;
struct btrfs_root *root;
@@ -2280,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
next = walk_up_backref(next, edges, &index);
root = next->root;
BUG_ON(!root);
- BUG_ON(!root->ref_cows);
+ BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
@@ -2310,7 +2472,6 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
if (!root)
return NULL;
- *nr = index;
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
while (1) {
@@ -2345,8 +2506,8 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
root = next->root;
BUG_ON(!root);
- /* no other choice for non-refernce counted tree */
- if (!root->ref_cows)
+ /* no other choice for non-reference counted tree */
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return root;
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
@@ -2406,27 +2567,36 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
struct btrfs_root *root = rc->extent_root;
u64 num_bytes;
int ret;
+ u64 tmp;
num_bytes = calcu_metadata_size(rc, node, 1) * 2;
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
+ rc->reserved_bytes += num_bytes;
+ ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
- if (ret == -EAGAIN)
- rc->commit_transaction = 1;
+ if (ret == -EAGAIN) {
+ tmp = rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+ while (tmp <= rc->reserved_bytes)
+ tmp <<= 1;
+ /*
+ * only one thread can access block_rsv at this point,
+ * so we don't need to hold a lock to protect block_rsv.
+ * we expand the reservation size here to allow enough
+ * space for relocation and we will return earlier in
+ * the enospc case.
+ */
+ rc->block_rsv->size = tmp + rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+ }
return ret;
}
return 0;
}
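When btrfs_block_rsv_refill comes back with -EAGAIN, the hunk above grows rc->block_rsv->size by doubling a base of RELOCATION_RESERVED_NODES tree nodes until it covers what has already been reserved, plus one extra batch as headroom. A standalone sketch of that growth curve, using illustrative node-size and node-count values rather than the kernel's constants:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins; the kernel derives these from the extent root's
 * nodesize and its RELOCATION_RESERVED_NODES constant. */
#define NODESIZE                  16384ULL
#define RELOCATION_RESERVED_NODES 64ULL

/* Double the base reservation until it covers the bytes already handed
 * out, then keep one extra batch of reserved nodes on top. */
static uint64_t expand_block_rsv(uint64_t reserved_bytes)
{
	uint64_t tmp = NODESIZE * RELOCATION_RESERVED_NODES;

	while (tmp <= reserved_bytes)
		tmp <<= 1;

	return tmp + NODESIZE * RELOCATION_RESERVED_NODES;
}

int main(void)
{
	printf("new rsv size: %llu bytes\n",
	       (unsigned long long)expand_block_rsv(3ULL << 20));
	return 0;
}

With 16 KiB nodes and a 64-node batch, 3 MiB already reserved grows the reserve to 5 MiB: the 1 MiB base doubles to 4 MiB and one extra 1 MiB batch is kept on top.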
-static void release_metadata_space(struct reloc_control *rc,
- struct backref_node *node)
-{
- u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
- btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
-}
-
/*
* relocate a block tree, and then update pointers in upper level
* blocks that reference the block to point to the new location.
@@ -2448,7 +2618,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
u32 blocksize;
u64 bytenr;
u64 generation;
- int nr;
int slot;
int ret;
int err = 0;
@@ -2461,7 +2630,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
cond_resched();
upper = edge->node[UPPER];
- root = select_reloc_root(trans, rc, upper, edges, &nr);
+ root = select_reloc_root(trans, rc, upper, edges);
BUG_ON(!root);
if (upper->eb && !upper->locked) {
@@ -2495,7 +2664,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
path->locks[upper->level] = 0;
slot = path->slots[upper->level];
- btrfs_release_path(NULL, path);
+ btrfs_release_path(path);
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
&slot);
@@ -2513,6 +2682,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
blocksize = btrfs_level_size(root, node->level);
generation = btrfs_node_ptr_generation(upper->eb, slot);
eb = read_tree_block(root, bytenr, blocksize, generation);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ err = -EIO;
+ goto next;
+ }
btrfs_tree_lock(eb);
btrfs_set_lock_blocking(eb);
@@ -2537,7 +2711,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
node->eb->start, blocksize,
upper->eb->start,
btrfs_header_owner(upper->eb),
- node->level, 0);
+ node->level, 0, 1);
BUG_ON(ret);
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2670,6 +2844,10 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
block->key.objectid, block->key.offset);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
WARN_ON(btrfs_header_level(eb) != block->level);
if (block->level == 0)
btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -2684,8 +2862,13 @@ static int reada_tree_block(struct reloc_control *rc,
struct tree_block *block)
{
BUG_ON(block->key_ready);
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid, block->key.offset);
+ if (block->key.type == BTRFS_METADATA_ITEM_KEY)
+ readahead_tree_block(rc->extent_root, block->bytenr,
+ block->key.objectid,
+ rc->extent_root->leafsize);
+ else
+ readahead_tree_block(rc->extent_root, block->bytenr,
+ block->key.objectid, block->key.offset);
return 0;
}
@@ -2699,7 +2882,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_root *root;
- int release = 0;
int ret = 0;
if (!node)
@@ -2712,15 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
- if (!root || root->ref_cows) {
+ if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = reserve_metadata_space(trans, rc, node);
if (ret)
goto out;
- release = 1;
}
if (root) {
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
BUG_ON(node->new_bytenr);
BUG_ON(!list_empty(&node->list));
btrfs_record_root_in_trans(trans, root);
@@ -2731,7 +2912,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
} else {
path->lowest_level = node->level;
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (ret > 0)
ret = 0;
}
@@ -2741,11 +2922,8 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
ret = do_relocation(trans, rc, node, key, path, 1);
}
out:
- if (ret || node->level == 0 || node->cowonly) {
- if (release)
- release_metadata_space(rc, node);
+ if (ret || node->level == 0 || node->cowonly)
remove_backref_node(&rc->backref_cache, node);
- }
return ret;
}
@@ -2764,8 +2942,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
int err = 0;
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ err = -ENOMEM;
+ goto out_free_blocks;
+ }
rb_node = rb_first(blocks);
while (rb_node) {
@@ -2778,8 +2958,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
rb_node = rb_first(blocks);
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
- if (!block->key_ready)
- get_tree_block_key(rc, block);
+ if (!block->key_ready) {
+ err = get_tree_block_key(rc, block);
+ if (err)
+ goto out_free_path;
+ }
rb_node = rb_next(rb_node);
}
@@ -2804,10 +2987,12 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
rb_node = rb_next(rb_node);
}
out:
- free_block_list(blocks);
err = finish_pending_nodes(trans, rc, path, err);
+out_free_path:
btrfs_free_path(path);
+out_free_blocks:
+ free_block_list(blocks);
return err;
}
@@ -2838,12 +3023,12 @@ int prealloc_file_extent_cluster(struct inode *inode,
else
end = cluster->end - offset;
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
if (ret)
break;
nr++;
@@ -2864,7 +3049,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
struct extent_map *em;
int ret = 0;
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em)
return -ENOMEM;
@@ -2875,10 +3060,10 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
while (1) {
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
@@ -2886,7 +3071,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
}
btrfs_drop_extent_cache(inode, start, end, 0);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
return ret;
}
@@ -2900,6 +3085,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
unsigned long last_index;
struct page *page;
struct file_ra_state *ra;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int nr = 0;
int ret = 0;
@@ -2933,7 +3119,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
page_cache_sync_readahead(inode->i_mapping,
ra, NULL, index,
last_index + 1 - index);
- page = grab_cache_page(inode->i_mapping, index);
+ page = find_or_create_page(inode->i_mapping, index,
+ mask);
if (!page) {
btrfs_delalloc_release_metadata(inode,
PAGE_CACHE_SIZE);
@@ -2961,11 +3148,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
}
}
- page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(&BTRFS_I(inode)->io_tree,
- page_start, page_end, GFP_NOFS);
+ lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
set_page_extent_mapped(page);
@@ -2981,7 +3167,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
set_page_dirty(page);
unlock_extent(&BTRFS_I(inode)->io_tree,
- page_start, page_end, GFP_NOFS);
+ page_start, page_end);
unlock_page(page);
page_cache_release(page);
@@ -3083,17 +3269,22 @@ static int add_tree_block(struct reloc_control *rc,
struct rb_node *rb_node;
u32 item_size;
int level = -1;
- int generation;
+ u64 generation;
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
- if (item_size >= sizeof(*ei) + sizeof(*bi)) {
+ if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
+ item_size >= sizeof(*ei) + sizeof(*bi)) {
ei = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_extent_item);
- bi = (struct btrfs_tree_block_info *)(ei + 1);
+ if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ level = btrfs_tree_block_level(eb, bi);
+ } else {
+ level = (int)extent_key->offset;
+ }
generation = btrfs_extent_generation(eb, ei);
- level = btrfs_tree_block_level(eb, bi);
} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
u64 ref_owner;
@@ -3113,7 +3304,7 @@ static int add_tree_block(struct reloc_control *rc,
#endif
}
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
BUG_ON(level == -1);
@@ -3122,13 +3313,14 @@ static int add_tree_block(struct reloc_control *rc,
return -ENOMEM;
block->bytenr = extent_key->objectid;
- block->key.objectid = extent_key->offset;
+ block->key.objectid = rc->extent_root->leafsize;
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, block->bytenr);
return 0;
}
@@ -3143,6 +3335,8 @@ static int __add_tree_block(struct reloc_control *rc,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
+ SKINNY_METADATA);
if (tree_block_processed(bytenr, blocksize, rc))
return 0;
@@ -3153,19 +3347,42 @@ static int __add_tree_block(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-
+again:
key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = blocksize;
+ if (skinny) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = blocksize;
+ }
path->search_commit_root = 1;
path->skip_locking = 1;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
+
+ if (ret > 0 && skinny) {
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ (key.type == BTRFS_METADATA_ITEM_KEY ||
+ (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == blocksize)))
+ ret = 0;
+ }
+
+ if (ret) {
+ skinny = false;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
BUG_ON(ret);
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ret = add_tree_block(rc, &key, path, blocks);
out:
btrfs_free_path(path);
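__add_tree_block above now probes with a skinny BTRFS_METADATA_ITEM_KEY first and falls back to the classic BTRFS_EXTENT_ITEM_KEY search when that misses. A minimal sketch of how the two search keys are built, with a mocked-up key struct (the key-type constants are shown with their usual on-disk values; everything else is illustrative, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mock of the (objectid, type, offset) key triple. */
struct key {
	uint64_t objectid;
	uint8_t  type;
	uint64_t offset;
};

#define EXTENT_ITEM_KEY   168	/* classic extent item */
#define METADATA_ITEM_KEY 169	/* skinny metadata item */

/* Skinny lookups search from offset -1 and walk back one slot; classic
 * lookups encode the block size in the offset, as in the hunk above. */
static struct key build_search_key(uint64_t bytenr, uint32_t blocksize,
				   bool skinny)
{
	struct key k = { .objectid = bytenr };

	if (skinny) {
		k.type = METADATA_ITEM_KEY;
		k.offset = (uint64_t)-1;
	} else {
		k.type = EXTENT_ITEM_KEY;
		k.offset = blocksize;
	}
	return k;
}

int main(void)
{
	struct key k = build_search_key(1048576, 16384, true);

	printf("%llu %u %llu\n", (unsigned long long)k.objectid, k.type,
	       (unsigned long long)k.offset);
	return 0;
}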
@@ -3186,7 +3403,8 @@ static int block_use_full_backref(struct reloc_control *rc,
return 1;
ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
- eb->start, eb->len, NULL, &flags);
+ eb->start, btrfs_header_level(eb), 1,
+ NULL, &flags);
BUG_ON(ret);
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
@@ -3200,10 +3418,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
struct inode *inode, u64 ino)
{
struct btrfs_key key;
- struct btrfs_path *path;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
- unsigned long nr;
int ret = 0;
if (inode)
@@ -3214,32 +3430,28 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, root, NULL);
- if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
- if (inode && !IS_ERR(inode))
+ if (IS_ERR(inode) || is_bad_inode(inode)) {
+ if (!IS_ERR(inode))
iput(inode);
return -ENOENT;
}
truncate:
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
+ ret = btrfs_check_trunc_cache_free_space(root,
+ &fs_info->global_block_rsv);
+ if (ret)
goto out;
- }
- trans = btrfs_join_transaction(root, 0);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
- btrfs_free_path(path);
ret = PTR_ERR(trans);
goto out;
}
- ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, inode);
- btrfs_free_path(path);
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
out:
iput(inode);
return ret;
@@ -3291,6 +3503,7 @@ static int find_data_references(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = 1;
root = read_fs_root(rc->extent_root->fs_info, ref_root);
if (IS_ERR(root)) {
@@ -3299,8 +3512,11 @@ static int find_data_references(struct reloc_control *rc,
}
key.objectid = ref_objectid;
- key.offset = ref_offset;
key.type = BTRFS_EXTENT_DATA_KEY;
+ if (ref_offset > ((u64)-1 << 32))
+ key.offset = 0;
+ else
+ key.offset = ref_offset;
path->search_commit_root = 1;
path->skip_locking = 1;
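The new clamp above refuses to seed the EXTENT_DATA search with an implausibly large offset: anything above (u64)-1 << 32, which is what a wrapped "file offset minus extent offset" backref value looks like, restarts the scan from offset 0 instead. A tiny, self-contained version of the boundary check (illustrative, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Offsets in the top 4 GiB of the u64 range fall back to scanning from 0,
 * mirroring the guard in find_data_references above. */
static uint64_t clamp_ref_offset(uint64_t ref_offset)
{
	if (ref_offset > ((uint64_t)-1 << 32))
		return 0;
	return ref_offset;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)clamp_ref_offset(1ULL << 20));
	printf("%llu\n", (unsigned long long)clamp_ref_offset(~0ULL - 8));
	return 0;
}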
@@ -3335,10 +3551,8 @@ static int find_data_references(struct reloc_control *rc,
err = ret;
goto out;
}
- if (ret > 0) {
- WARN_ON(1);
+ if (WARN_ON(ret > 0))
goto out;
- }
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -3358,11 +3572,9 @@ static int find_data_references(struct reloc_control *rc,
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ref_objectid ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
- WARN_ON(1);
+ if (WARN_ON(key.objectid != ref_objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY))
break;
- }
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -3396,7 +3608,9 @@ static int find_data_references(struct reloc_control *rc,
block->key_ready = 1;
rb_node = tree_insert(blocks, block->bytenr,
&block->rb_node);
- BUG_ON(rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST,
+ block->bytenr);
}
if (counted)
added = 1;
@@ -3412,7 +3626,7 @@ out:
}
/*
- * hepler to find all tree blocks that reference a given data extent
+ * helper to find all tree blocks that reference a given data extent
*/
static noinline_for_stack
int add_data_references(struct reloc_control *rc,
@@ -3427,7 +3641,7 @@ int add_data_references(struct reloc_control *rc,
unsigned long ptr;
unsigned long end;
u32 blocksize = btrfs_level_size(rc->extent_root, 0);
- int ret;
+ int ret = 0;
int err = 0;
eb = path->nodes[0];
@@ -3454,6 +3668,10 @@ int add_data_references(struct reloc_control *rc,
} else {
BUG();
}
+ if (ret) {
+ err = ret;
+ goto out;
+ }
ptr += btrfs_extent_inline_ref_size(key.type);
}
WARN_ON(ptr > end);
@@ -3499,14 +3717,15 @@ int add_data_references(struct reloc_control *rc,
}
path->slots[0]++;
}
- btrfs_release_path(rc->extent_root, path);
+out:
+ btrfs_release_path(path);
if (err)
free_block_list(blocks);
return err;
}
/*
- * hepler to find next unprocessed extent
+ * helper to find next unprocessed extent
*/
static noinline_for_stack
int find_next_extent(struct btrfs_trans_handle *trans,
@@ -3551,43 +3770,62 @@ next:
break;
}
- if (key.type != BTRFS_EXTENT_ITEM_KEY ||
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
key.objectid + key.offset <= rc->search_start) {
path->slots[0]++;
goto next;
}
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ key.objectid + rc->extent_root->leafsize <=
+ rc->search_start) {
+ path->slots[0]++;
+ goto next;
+ }
+
ret = find_first_extent_bit(&rc->processed_blocks,
key.objectid, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret == 0 && start <= key.objectid) {
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
rc->search_start = end + 1;
} else {
- rc->search_start = key.objectid + key.offset;
+ if (key.type == BTRFS_EXTENT_ITEM_KEY)
+ rc->search_start = key.objectid + key.offset;
+ else
+ rc->search_start = key.objectid +
+ rc->extent_root->leafsize;
memcpy(extent_key, &key, sizeof(key));
return 0;
}
}
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
return ret;
}
static void set_reloc_control(struct reloc_control *rc)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- mutex_lock(&fs_info->trans_mutex);
+
+ mutex_lock(&fs_info->reloc_mutex);
fs_info->reloc_ctl = rc;
- mutex_unlock(&fs_info->trans_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
}
static void unset_reloc_control(struct reloc_control *rc)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- mutex_lock(&fs_info->trans_mutex);
+
+ mutex_lock(&fs_info->reloc_mutex);
fs_info->reloc_ctl = NULL;
- mutex_unlock(&fs_info->trans_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
}
static int check_extent_flags(u64 flags)
@@ -3608,36 +3846,34 @@ static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
struct btrfs_trans_handle *trans;
- int ret;
- rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+ rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+ BTRFS_BLOCK_RSV_TEMP);
if (!rc->block_rsv)
return -ENOMEM;
- /*
- * reserve some space for creating reloc trees.
- * btrfs_init_reloc_root will use them when there
- * is no reservation in transaction handle.
- */
- ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
- rc->extent_root->nodesize * 256);
- if (ret)
- return ret;
-
- rc->block_rsv->refill_used = 1;
- btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
-
memset(&rc->cluster, 0, sizeof(rc->cluster));
rc->search_start = rc->block_group->key.objectid;
rc->extents_found = 0;
rc->nodes_relocated = 0;
rc->merging_rsv_size = 0;
+ rc->reserved_bytes = 0;
+ rc->block_rsv->size = rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
rc->create_reloc_tree = 1;
set_reloc_control(rc);
- trans = btrfs_join_transaction(rc->extent_root, 1);
- BUG_ON(IS_ERR(trans));
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans)) {
+ unset_reloc_control(rc);
+ /*
+ * extent tree is not a ref_cow tree and has no reloc_root to
+ * cleanup. And callers are responsible to free the above
+ * block rsv.
+ */
+ return PTR_ERR(trans);
+ }
btrfs_commit_transaction(trans, rc->extent_root);
return 0;
}
@@ -3649,7 +3885,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
- unsigned long nr;
u64 flags;
u32 item_size;
int ret;
@@ -3659,6 +3894,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = 1;
ret = prepare_to_relocate(rc);
if (ret) {
@@ -3667,9 +3903,21 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
while (1) {
+ rc->reserved_bytes = 0;
+ ret = btrfs_block_rsv_refill(rc->extent_root,
+ rc->block_rsv, rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret) {
+ err = ret;
+ break;
+ }
progress++;
trans = btrfs_start_transaction(rc->extent_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
restart:
if (update_backref_cache(trans, &rc->backref_cache)) {
btrfs_end_transaction(trans, rc->extent_root);
@@ -3707,7 +3955,7 @@ restart:
flags = BTRFS_EXTENT_FLAG_DATA;
if (path_change) {
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3730,7 +3978,7 @@ restart:
(flags & BTRFS_EXTENT_FLAG_DATA)) {
ret = add_data_references(rc, &key, path, &blocks);
} else {
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
ret = 0;
}
if (ret < 0) {
@@ -3741,6 +3989,12 @@ restart:
if (!RB_EMPTY_ROOT(&blocks)) {
ret = relocate_tree_blocks(trans, rc, &blocks);
if (ret < 0) {
+ /*
+ * if we fail to relocate tree blocks, force an update of
+ * the backref cache when committing the transaction.
+ */
+ rc->backref_cache.last_trans = trans->transid - 1;
+
if (ret != -EAGAIN) {
err = ret;
break;
@@ -3750,26 +4004,8 @@ restart:
}
}
- ret = btrfs_block_rsv_check(trans, rc->extent_root,
- rc->block_rsv, 0, 5);
- if (ret < 0) {
- if (ret != -EAGAIN) {
- err = ret;
- WARN_ON(1);
- break;
- }
- rc->commit_transaction = 1;
- }
-
- if (rc->commit_transaction) {
- rc->commit_transaction = 0;
- ret = btrfs_commit_transaction(trans, rc->extent_root);
- BUG_ON(ret);
- } else {
- nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, rc->extent_root);
- btrfs_btree_balance_dirty(rc->extent_root, nr);
- }
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_btree_balance_dirty(rc->extent_root);
trans = NULL;
if (rc->stage == MOVE_DATA_EXTENTS &&
@@ -3793,14 +4029,13 @@ restart:
}
}
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
GFP_NOFS);
if (trans) {
- nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, rc->extent_root);
- btrfs_btree_balance_dirty(rc->extent_root, nr);
+ btrfs_btree_balance_dirty(rc->extent_root);
}
if (!err) {
@@ -3825,7 +4060,7 @@ restart:
btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
/* get rid of pinned extents */
- trans = btrfs_join_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans))
err = PTR_ERR(trans);
else
@@ -3861,7 +4096,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
out:
btrfs_free_path(path);
return ret;
@@ -3879,7 +4114,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_key key;
- unsigned long nr;
u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
int err = 0;
@@ -3891,7 +4125,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
if (IS_ERR(trans))
return ERR_CAST(trans);
- err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out;
@@ -3907,9 +4141,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
err = btrfs_orphan_add(trans, inode);
out:
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty(root);
if (err) {
if (inode)
iput(inode);
@@ -3918,7 +4151,7 @@ out:
return inode;
}
-static struct reloc_control *alloc_reloc_control(void)
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
struct reloc_control *rc;
@@ -3929,7 +4162,8 @@ static struct reloc_control *alloc_reloc_control(void)
INIT_LIST_HEAD(&rc->reloc_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+ extent_io_tree_init(&rc->processed_blocks,
+ fs_info->btree_inode->i_mapping);
return rc;
}
@@ -3946,7 +4180,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
int rw = 0;
int err = 0;
- rc = alloc_reloc_control();
+ rc = alloc_reloc_control(fs_info);
if (!rc)
return -ENOMEM;
@@ -3991,19 +4225,19 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
goto out;
}
- printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
- (unsigned long long)rc->block_group->key.objectid,
- (unsigned long long)rc->block_group->flags);
+ btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
+ rc->block_group->key.objectid, rc->block_group->flags);
- btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
- btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
+ ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ btrfs_wait_ordered_roots(fs_info, -1);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
-
- btrfs_clean_old_snapshots(fs_info->tree_root);
ret = relocate_block_group(rc);
-
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
err = ret;
@@ -4013,22 +4247,22 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
if (rc->extents_found == 0)
break;
- printk(KERN_INFO "btrfs: found %llu extents\n",
- (unsigned long long)rc->extents_found);
+ btrfs_info(extent_root->fs_info, "found %llu extents",
+ rc->extents_found);
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
- btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+ (u64)-1);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
}
}
- filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
- rc->block_group->key.objectid,
- rc->block_group->key.objectid +
- rc->block_group->key.offset - 1);
-
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
@@ -4044,10 +4278,11 @@ out:
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
- int ret;
+ int ret, err;
trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
@@ -4055,11 +4290,11 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
btrfs_set_root_refs(&root->root_item, 0);
ret = btrfs_update_root(trans, root->fs_info->tree_root,
&root->root_key, &root->root_item);
- BUG_ON(ret);
- ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
- BUG_ON(ret);
- return 0;
+ err = btrfs_end_transaction(trans, root->fs_info->tree_root);
+ if (err)
+ return err;
+ return ret;
}
/*
@@ -4084,6 +4319,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = -1;
key.objectid = BTRFS_TREE_RELOC_OBJECTID;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -4103,13 +4339,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
}
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- btrfs_release_path(root->fs_info->tree_root, path);
+ btrfs_release_path(path);
if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+ reloc_root = btrfs_read_fs_root(root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
@@ -4126,7 +4362,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
err = ret;
goto out;
}
- mark_garbage_root(reloc_root);
+ ret = mark_garbage_root(reloc_root);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
}
}
@@ -4135,12 +4375,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.offset--;
}
- btrfs_release_path(root->fs_info->tree_root, path);
+ btrfs_release_path(path);
if (list_empty(&reloc_roots))
goto out;
- rc = alloc_reloc_control();
+ rc = alloc_reloc_control(root->fs_info);
if (!rc) {
err = -ENOMEM;
goto out;
@@ -4150,7 +4390,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
set_reloc_control(rc);
- trans = btrfs_join_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
unset_reloc_control(rc);
err = PTR_ERR(trans);
@@ -4172,34 +4412,35 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(root->fs_info,
reloc_root->root_key.offset);
- BUG_ON(IS_ERR(fs_root));
+ if (IS_ERR(fs_root)) {
+ err = PTR_ERR(fs_root);
+ goto out_free;
+ }
- __add_reloc_root(reloc_root);
+ err = __add_reloc_root(reloc_root);
+ BUG_ON(err < 0); /* -ENOMEM or logic error */
fs_root->reloc_root = reloc_root;
}
- btrfs_commit_transaction(trans, rc->extent_root);
+ err = btrfs_commit_transaction(trans, rc->extent_root);
+ if (err)
+ goto out_free;
merge_reloc_roots(rc);
unset_reloc_control(rc);
- trans = btrfs_join_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans))
err = PTR_ERR(trans);
else
- btrfs_commit_transaction(trans, rc->extent_root);
+ err = btrfs_commit_transaction(trans, rc->extent_root);
out_free:
kfree(rc);
out:
- while (!list_empty(&reloc_roots)) {
- reloc_root = list_entry(reloc_roots.next,
- struct btrfs_root, root_list);
- list_del(&reloc_root->root_list);
- free_extent_buffer(reloc_root->node);
- free_extent_buffer(reloc_root->commit_root);
- kfree(reloc_root);
- }
+ if (!list_empty(&reloc_roots))
+ free_reloc_roots(&reloc_roots);
+
btrfs_free_path(path);
if (err == 0) {
@@ -4209,7 +4450,7 @@ out:
if (IS_ERR(fs_root))
err = PTR_ERR(fs_root);
else
- btrfs_orphan_cleanup(fs_root);
+ err = btrfs_orphan_cleanup(fs_root);
}
return err;
}
@@ -4223,12 +4464,11 @@ out:
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
struct btrfs_root *root = BTRFS_I(inode)->root;
- size_t offset;
int ret;
u64 disk_bytenr;
+ u64 new_bytenr;
LIST_HEAD(list);
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
@@ -4236,45 +4476,58 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list);
+ disk_bytenr + len - 1, &list, 0);
+ if (ret)
+ goto out;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
- sector_sum = sums->sums;
- sums->bytenr = ordered->start;
-
- offset = 0;
- while (offset < sums->len) {
- sector_sum->bytenr += ordered->start - disk_bytenr;
- sector_sum++;
- offset += root->sectorsize;
- }
+ /*
+ * We need to offset the new_bytenr based on where the csum is.
+ * We need to do this because we will read in entire prealloc
+ * extents but we may have written to, say, the middle of the
+ * prealloc extent, so we need to make sure the csum goes with
+ * the right disk offset.
+ *
+ * We can do this because the data reloc inode refers strictly
+ * to the on disk bytes, so we don't have to worry about
+ * disk_len vs real len like with real inodes since it's all
+ * disk length.
+ */
+ new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
+ sums->bytenr = new_bytenr;
btrfs_add_ordered_sum(inode, ordered, sums);
}
+out:
btrfs_put_ordered_extent(ordered);
return ret;
}
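Rather than rewriting every per-sector sum, the loop above now rebases each csum entry's bytenr by the distance between the old disk extent and the new ordered extent. The arithmetic is just a translation, shown here as a standalone sketch with illustrative values:

#include <stdint.h>
#include <stdio.h>

/* Rebase a csum's logical byte number from the prealloc extent it was
 * read from (starting at disk_bytenr) onto the new ordered extent
 * (starting at ordered_start), preserving the offset inside the extent. */
static uint64_t rebase_csum_bytenr(uint64_t sum_bytenr, uint64_t disk_bytenr,
				   uint64_t ordered_start)
{
	return ordered_start + (sum_bytenr - disk_bytenr);
}

int main(void)
{
	/* a csum recorded 64 KiB into the old extent keeps that 64 KiB
	 * offset inside the relocated extent */
	uint64_t old_extent = 1ULL << 30, new_extent = 5ULL << 30;

	printf("%llu\n", (unsigned long long)
	       rebase_csum_bytenr(old_extent + 65536, old_extent, new_extent));
	return 0;
}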
-void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
- struct extent_buffer *cow)
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow)
{
struct reloc_control *rc;
struct backref_node *node;
int first_cow = 0;
int level;
- int ret;
+ int ret = 0;
rc = root->fs_info->reloc_ctl;
if (!rc)
- return;
+ return 0;
BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (buf == root->node)
+ __update_reloc_root(root, cow->start);
+ }
+
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item))
@@ -4306,10 +4559,9 @@ void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
rc->nodes_relocated += buf->len;
}
- if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
+ if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
ret = replace_file_extents(trans, rc, root, cow);
- BUG_ON(ret);
- }
+ return ret;
}
/*
@@ -4350,7 +4602,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
* called after snapshot is created. migrate block reservation
* and create reloc root for the newly created snapshot
*/
-void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending)
{
struct btrfs_root *root = pending->root;
@@ -4360,7 +4612,7 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
int ret;
if (!root->reloc_root)
- return;
+ return 0;
rc = root->fs_info->reloc_ctl;
rc->merging_rsv_size += rc->nodes_relocated;
@@ -4369,18 +4621,21 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
rc->nodes_relocated);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
new_root = pending->snap;
reloc_root = create_reloc_root(trans, root->reloc_root,
new_root->root_key.objectid);
+ if (IS_ERR(reloc_root))
+ return PTR_ERR(reloc_root);
- __add_reloc_root(reloc_root);
+ ret = __add_reloc_root(reloc_root);
+ BUG_ON(ret < 0);
new_root->reloc_root = reloc_root;
- if (rc->create_reloc_tree) {
+ if (rc->create_reloc_tree)
ret = clone_backref_node(trans, rc, root, reloc_root);
- BUG_ON(ret);
- }
+ return ret;
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6a1086e83ff..360a728a639 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,114 +16,117 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/err.h>
+#include <linux/uuid.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
/*
- * search forward for a root, starting with objectid 'search_start'
- * if a root key is found, the objectid we find is filled into 'found_objectid'
- * and 0 is returned. < 0 is returned on error, 1 if there is nothing
- * left in the tree.
+ * Read a root item from the tree. In case we detect a root item smaller than
+ * sizeof(root_item), we know it's an old version of the root structure and
+ * initialize all new fields to zero. The same happens if we detect mismatching
+ * generation numbers as then we know the root was once mounted with an older
+ * kernel that was not aware of the root item structure change.
*/
-int btrfs_search_root(struct btrfs_root *root, u64 search_start,
- u64 *found_objectid)
+static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+ struct btrfs_root_item *item)
{
- struct btrfs_path *path;
- struct btrfs_key search_key;
- int ret;
-
- root = root->fs_info->tree_root;
- search_key.objectid = search_start;
- search_key.type = (u8)-1;
- search_key.offset = (u64)-1;
-
- path = btrfs_alloc_path();
- BUG_ON(!path);
-again:
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
- goto out;
- if (ret == 0) {
- ret = 1;
- goto out;
- }
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- goto out;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
- if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
- search_key.offset++;
- btrfs_release_path(root, path);
- goto again;
+ uuid_le uuid;
+ int len;
+ int need_reset = 0;
+
+ len = btrfs_item_size_nr(eb, slot);
+ read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
+ min_t(int, len, (int)sizeof(*item)));
+ if (len < sizeof(*item))
+ need_reset = 1;
+ if (!need_reset && btrfs_root_generation(item)
+ != btrfs_root_generation_v2(item)) {
+ if (btrfs_root_generation_v2(item) != 0) {
+ printk(KERN_WARNING "BTRFS: mismatching "
+ "generation and generation_v2 "
+ "found in root item. This root "
+ "was probably mounted with an "
+ "older kernel. Resetting all "
+ "new fields.\n");
+ }
+ need_reset = 1;
}
- ret = 0;
- *found_objectid = search_key.objectid;
+ if (need_reset) {
+ memset(&item->generation_v2, 0,
+ sizeof(*item) - offsetof(struct btrfs_root_item,
+ generation_v2));
-out:
- btrfs_free_path(path);
- return ret;
+ uuid_le_gen(&uuid);
+ memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
+ }
}
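btrfs_read_root_item clears everything from generation_v2 to the end of the item in a single memset when it sees an old or mismatching root item. The offsetof trick is easy to check in isolation; the struct below is a mock with made-up fields, and only the reset technique mirrors the code above:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy stand-in for struct btrfs_root_item: older fields first, the fields
 * added by the v2 layout afterwards (names are illustrative). */
struct root_item {
	uint64_t generation;     /* old field */
	uint64_t refs;           /* old field */
	uint64_t generation_v2;  /* first of the newer fields */
	uint8_t  uuid[16];
	uint64_t ctransid;
};

/* Zero everything from generation_v2 to the end of the item, the same
 * trick used above when an old or mismatching root item is detected. */
static void reset_new_fields(struct root_item *item)
{
	memset(&item->generation_v2, 0,
	       sizeof(*item) - offsetof(struct root_item, generation_v2));
}

int main(void)
{
	struct root_item item = { .generation = 7, .generation_v2 = 99,
				  .ctransid = 123 };

	reset_new_fields(&item);
	printf("gen=%llu gen_v2=%llu ctransid=%llu\n",
	       (unsigned long long)item.generation,
	       (unsigned long long)item.generation_v2,
	       (unsigned long long)item.ctransid);
	return 0;
}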
/*
- * lookup the root with the highest offset for a given objectid. The key we do
- * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
- * on error.
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the real key of the tree we look for
+ *
+ * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
+ * of the search key, just lookup the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
*/
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
- struct btrfs_root_item *item, struct btrfs_key *key)
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key)
{
- struct btrfs_path *path;
- struct btrfs_key search_key;
struct btrfs_key found_key;
struct extent_buffer *l;
int ret;
int slot;
- search_key.objectid = objectid;
- search_key.type = BTRFS_ROOT_ITEM_KEY;
- search_key.offset = (u64)-1;
-
- path = btrfs_alloc_path();
- BUG_ON(!path);
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
- BUG_ON(ret == 0);
- if (path->slots[0] == 0) {
- ret = 1;
- goto out;
+ if (search_key->offset != -1ULL) { /* the search key is exact */
+ if (ret > 0)
+ goto out;
+ } else {
+ BUG_ON(ret == 0); /* Logical error */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ ret = 0;
}
+
l = path->nodes[0];
- slot = path->slots[0] - 1;
+ slot = path->slots[0];
+
btrfs_item_key_to_cpu(l, &found_key, slot);
- if (found_key.objectid != objectid ||
+ if (found_key.objectid != search_key->objectid ||
found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
- if (item)
- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
- sizeof(*item));
- if (key)
- memcpy(key, &found_key, sizeof(found_key));
- ret = 0;
+
+ if (root_item)
+ btrfs_read_root_item(l, slot, root_item);
+ if (root_key)
+ memcpy(root_key, &found_key, sizeof(found_key));
out:
- btrfs_free_path(path);
+ btrfs_release_path(path);
return ret;
}
-int btrfs_set_root_node(struct btrfs_root_item *item,
- struct extent_buffer *node)
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node)
{
btrfs_set_root_bytenr(item, node->start);
btrfs_set_root_level(item, btrfs_header_level(node));
btrfs_set_root_generation(item, btrfs_header_generation(node));
- return 0;
}
/*
@@ -138,24 +141,67 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
int ret;
int slot;
unsigned long ptr;
+ int old_len;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
goto out;
+ }
if (ret != 0) {
btrfs_print_leaf(root, path->nodes[0]);
- printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
- (unsigned long long)key->objectid, key->type,
- (unsigned long long)key->offset);
+ btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu",
+ key->objectid, key->type, key->offset);
BUG_ON(1);
}
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
+ old_len = btrfs_item_size_nr(l, slot);
+
+ /*
+ * If this is the first time we update the root item which originated
+ * from an older kernel, we need to enlarge the item size to make room
+ * for the added fields.
+ */
+ if (old_len < sizeof(*item)) {
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, key, path,
+ -1, 1);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path,
+ key, sizeof(*item));
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr_offset(l, slot);
+ }
+
+ /*
+ * Update generation_v2 so at the next mount we know the new root
+ * fields are valid.
+ */
+ btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+
write_extent_buffer(l, item, ptr, sizeof(*item));
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
@@ -163,93 +209,14 @@ out:
return ret;
}
-int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_key *key, struct btrfs_root_item
- *item)
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_root_item *item)
{
- int ret;
- ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
- return ret;
-}
-
-/*
- * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed. This is any root item with an
- * offset lower than the latest root. They need to be queued for deletion to
- * finish what was happening when we crashed.
- */
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
-{
- struct btrfs_root *dead_root;
- struct btrfs_root_item *ri;
- struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_path *path;
- int ret;
- u32 nritems;
- struct extent_buffer *leaf;
- int slot;
-
- key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
- key.offset = 0;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
-again:
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
- while (1) {
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- if (slot >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
- goto next;
-
- if (key.objectid < objectid)
- goto next;
-
- if (key.objectid > objectid)
- break;
-
- ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
- if (btrfs_disk_root_refs(leaf, ri) != 0)
- goto next;
-
- memcpy(&found_key, &key, sizeof(key));
- key.offset++;
- btrfs_release_path(root, path);
- dead_root =
- btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &found_key);
- if (IS_ERR(dead_root)) {
- ret = PTR_ERR(dead_root);
- goto err;
- }
-
- ret = btrfs_add_dead_root(dead_root);
- if (ret)
- goto err;
- goto again;
-next:
- slot++;
- path->slots[0]++;
- }
- ret = 0;
-err:
- btrfs_free_path(path);
- return ret;
+ /*
+ * Make sure generation v1 and v2 match. See update_root for details.
+ */
+ btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+ return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
@@ -261,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
struct btrfs_root *root;
int err = 0;
int ret;
+ bool can_recover = true;
+
+ if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+ can_recover = false;
path = btrfs_alloc_path();
if (!path)
@@ -291,7 +262,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
@@ -300,22 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid = key.offset;
key.offset++;
- root = btrfs_read_fs_root_no_name(tree_root->fs_info,
- &root_key);
- if (!IS_ERR(root))
+ root = btrfs_read_fs_root(tree_root, &root_key);
+ err = PTR_ERR_OR_ZERO(root);
+ if (err && err != -ENOENT) {
+ break;
+ } else if (err == -ENOENT) {
+ struct btrfs_trans_handle *trans;
+
+ btrfs_release_path(path);
+
+ trans = btrfs_join_transaction(tree_root);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to start trans to delete "
+ "orphan item");
+ break;
+ }
+ err = btrfs_del_orphan_item(trans, tree_root,
+ root_key.objectid);
+ btrfs_end_transaction(trans, tree_root);
+ if (err) {
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to delete root orphan "
+ "item");
+ break;
+ }
continue;
+ }
- ret = PTR_ERR(root);
- if (ret != -ENOENT) {
- err = ret;
+ err = btrfs_init_fs_root(root);
+ if (err) {
+ btrfs_free_fs_root(root);
break;
}
- ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
- if (ret) {
- err = ret;
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
+
+ err = btrfs_insert_fs_root(root->fs_info, root);
+ if (err) {
+ BUG_ON(err == -EEXIST);
+ btrfs_free_fs_root(root);
break;
}
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ btrfs_add_dead_root(root);
}
btrfs_free_path(path);
@@ -328,18 +329,15 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_path *path;
int ret;
- struct btrfs_root_item *ri;
- struct extent_buffer *leaf;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
goto out;
BUG_ON(ret != 0);
- leaf = path->nodes[0];
- ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
ret = btrfs_del_item(trans, root, path);
out:
@@ -383,37 +381,26 @@ again:
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
} else
err = -ENOENT;
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
goto again;
}
+out:
btrfs_free_path(path);
return err;
}
-int btrfs_find_root_ref(struct btrfs_root *tree_root,
- struct btrfs_path *path,
- u64 root_id, u64 ref_id)
-{
- struct btrfs_key key;
- int ret;
-
- key.objectid = root_id;
- key.type = BTRFS_ROOT_REF_KEY;
- key.offset = ref_id;
-
- ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
- return ret;
-}
-
/*
* add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
* or BTRFS_ROOT_BACKREF_KEY.
@@ -426,6 +413,8 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
*
* For a back ref the root_id is the id of the subvol or snapshot and
* ref_id is the id of the tree referencing it.
+ *
+ * Will return 0, -ENOMEM, or anything from the CoW path
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
@@ -449,7 +438,11 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_free_path(path);
+ return ret;
+ }
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
@@ -461,7 +454,7 @@ again:
btrfs_mark_buffer_dirty(leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
@@ -471,3 +464,34 @@ again:
btrfs_free_path(path);
return 0;
}
+
+/*
+ * Old btrfs forgets to init root_item->flags and root_item->byte_limit
+ * for subvolumes. To work around this problem, we steal a bit from
+ * root_item->inode_item->flags, and use it to indicate if those fields
+ * have been properly initialized.
+ */
+void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
+{
+ u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode);
+
+ if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
+ inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
+ btrfs_set_stack_inode_flags(&root_item->inode, inode_flags);
+ btrfs_set_root_flags(root_item, 0);
+ btrfs_set_root_limit(root_item, 0);
+ }
+}
+
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root_item *item = &root->root_item;
+ struct timespec ct = CURRENT_TIME;
+
+ spin_lock(&root->root_item_lock);
+ btrfs_set_root_ctransid(item, trans->transid);
+ btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
+ btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
+ spin_unlock(&root->root_item_lock);
+}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
new file mode 100644
index 00000000000..b6d198f5181
--- /dev/null
+++ b/fs/btrfs/scrub.c
@@ -0,0 +1,3489 @@
+/*
+ * Copyright (C) 2011, 2012 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/ratelimit.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "ordered-data.h"
+#include "transaction.h"
+#include "backref.h"
+#include "extent_io.h"
+#include "dev-replace.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "raid56.h"
+
+/*
+ * This is only the first step towards a full-featured scrub. It reads all
+ * extents and super blocks and verifies the checksums. In case a bad checksum
+ * is found or the extent cannot be read, good data will be written back if
+ * any can be found.
+ *
+ * Future enhancements:
+ * - In case an unrepairable extent is encountered, track which files are
+ * affected and report them
+ * - track and record media errors, throw out bad devices
+ * - add a mode to also read unallocated space
+ */
+
+struct scrub_block;
+struct scrub_ctx;
+
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
+
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
+#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
+
+struct scrub_page {
+ struct scrub_block *sblock;
+ struct page *page;
+ struct btrfs_device *dev;
+ u64 flags; /* extent flags */
+ u64 generation;
+ u64 logical;
+ u64 physical;
+ u64 physical_for_dev_replace;
+ atomic_t ref_count;
+ struct {
+ unsigned int mirror_num:8;
+ unsigned int have_csum:1;
+ unsigned int io_error:1;
+ };
+ u8 csum[BTRFS_CSUM_SIZE];
+};
+
+struct scrub_bio {
+ int index;
+ struct scrub_ctx *sctx;
+ struct btrfs_device *dev;
+ struct bio *bio;
+ int err;
+ u64 logical;
+ u64 physical;
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+ struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+ struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
+ int page_count;
+ int next_free;
+ struct btrfs_work work;
+};
+
+struct scrub_block {
+ struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+ int page_count;
+ atomic_t outstanding_pages;
+ atomic_t ref_count; /* free mem on transition to zero */
+ struct scrub_ctx *sctx;
+ struct {
+ unsigned int header_error:1;
+ unsigned int checksum_error:1;
+ unsigned int no_io_error_seen:1;
+ unsigned int generation_error:1; /* also sets header_error */
+ };
+};
+
+struct scrub_wr_ctx {
+ struct scrub_bio *wr_curr_bio;
+ struct btrfs_device *tgtdev;
+ int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+ atomic_t flush_all_writes;
+ struct mutex wr_lock;
+};
+
+struct scrub_ctx {
+ struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
+ struct btrfs_root *dev_root;
+ int first_free;
+ int curr;
+ atomic_t bios_in_flight;
+ atomic_t workers_pending;
+ spinlock_t list_lock;
+ wait_queue_head_t list_wait;
+ u16 csum_size;
+ struct list_head csum_list;
+ atomic_t cancel_req;
+ int readonly;
+ int pages_per_rd_bio;
+ u32 sectorsize;
+ u32 nodesize;
+ u32 leafsize;
+
+ int is_dev_replace;
+ struct scrub_wr_ctx wr_ctx;
+
+ /*
+ * statistics
+ */
+ struct btrfs_scrub_progress stat;
+ spinlock_t stat_lock;
+};
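+
+/*
+ * Note on the bios[] array above (a description of the mechanism as
+ * implemented further below, not an additional interface): scrub_setup_ctx()
+ * chains the entries through their next_free fields, scrub_add_page_to_rd_bio()
+ * pops one off first_free into curr when a new read bio is started, and
+ * scrub_bio_end_io_worker() pushes the finished entry back onto first_free
+ * and wakes list_wait.
+ */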
+
+struct scrub_fixup_nodatasum {
+ struct scrub_ctx *sctx;
+ struct btrfs_device *dev;
+ u64 logical;
+ struct btrfs_root *root;
+ struct btrfs_work work;
+ int mirror_num;
+};
+
+struct scrub_nocow_inode {
+ u64 inum;
+ u64 offset;
+ u64 root;
+ struct list_head list;
+};
+
+struct scrub_copy_nocow_ctx {
+ struct scrub_ctx *sctx;
+ u64 logical;
+ u64 len;
+ int mirror_num;
+ u64 physical_for_dev_replace;
+ struct list_head inodes;
+ struct btrfs_work work;
+};
+
+struct scrub_warning {
+ struct btrfs_path *path;
+ u64 extent_item_size;
+ char *scratch_buf;
+ char *msg_buf;
+ const char *errstr;
+ sector_t sector;
+ u64 logical;
+ struct btrfs_device *dev;
+ int msg_bufsize;
+ int scratch_bufsize;
+};
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+ struct btrfs_fs_info *fs_info,
+ struct scrub_block *original_sblock,
+ u64 length, u64 logical,
+ struct scrub_block *sblocks_for_recheck);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size);
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int is_metadata, int have_csum,
+ const u8 *csum, u64 generation,
+ u16 csum_size);
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int force_write);
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+ int page_num);
+static int scrub_checksum_data(struct scrub_block *sblock);
+static int scrub_checksum_tree_block(struct scrub_block *sblock);
+static int scrub_checksum_super(struct scrub_block *sblock);
+static void scrub_block_get(struct scrub_block *sblock);
+static void scrub_block_put(struct scrub_block *sblock);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage);
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum, int force,
+ u64 physical_for_dev_replace);
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u64 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+ struct scrub_wr_ctx *wr_ctx,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev,
+ int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+ u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+ struct scrub_copy_nocow_ctx *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+ atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+ atomic_dec(&sctx->bios_in_flight);
+ wake_up(&sctx->list_wait);
+}
+
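+/*
+ * Pause handshake, describing the two helpers below: __scrub_blocked_if_needed()
+ * is called with scrub_lock held and temporarily drops it while waiting for
+ * scrub_pause_req to return to zero; scrub_blocked_if_needed() additionally
+ * accounts the caller in scrubs_paused and wakes scrub_pause_wait for the
+ * waiter that requested the pause.
+ */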
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ while (atomic_read(&fs_info->scrub_pause_req)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrub_pause_req) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+}
+
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ mutex_lock(&fs_info->scrub_lock);
+ __scrub_blocked_if_needed(fs_info);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ wake_up(&fs_info->scrub_pause_wait);
+}
+
+/*
+ * used for workers that require transaction commits (i.e., for the
+ * NOCOW case)
+ */
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+ /*
+	 * increment scrubs_running to prevent cancel requests from
+	 * completing as long as a worker is running. we must also
+	 * increment scrubs_paused to prevent deadlocking on pause
+	 * requests used for transaction commits (as the worker uses a
+	 * transaction context). it is safe to regard the worker
+	 * as paused for all practical purposes; effectively, we only
+	 * prevent cancellation requests from completing.
+ */
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_inc(&fs_info->scrubs_running);
+ atomic_inc(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ /*
+	 * checking the @scrubs_running == @scrubs_paused condition
+	 * inside wait_event() is not an atomic operation,
+	 * which means we may inc/dec @scrubs_running/paused
+	 * at any time. Wake up @scrub_pause_wait as often as we
+	 * can so that a pending transaction commit is blocked for
+	 * as short a time as possible.
+ */
+ wake_up(&fs_info->scrub_pause_wait);
+
+ atomic_inc(&sctx->workers_pending);
+}
+
+/* used for workers that require transaction commits */
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+ /*
+ * see scrub_pending_trans_workers_inc() why we're pretending
+ * to be paused in the scrub counters
+ */
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_dec(&fs_info->scrubs_running);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ atomic_dec(&sctx->workers_pending);
+ wake_up(&fs_info->scrub_pause_wait);
+ wake_up(&sctx->list_wait);
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
+{
+ while (!list_empty(&sctx->csum_list)) {
+ struct btrfs_ordered_sum *sum;
+ sum = list_first_entry(&sctx->csum_list,
+ struct btrfs_ordered_sum, list);
+ list_del(&sum->list);
+ kfree(sum);
+ }
+}
+
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
+{
+ int i;
+
+ if (!sctx)
+ return;
+
+ scrub_free_wr_ctx(&sctx->wr_ctx);
+
+ /* this can happen when scrub is cancelled */
+ if (sctx->curr != -1) {
+ struct scrub_bio *sbio = sctx->bios[sctx->curr];
+
+ for (i = 0; i < sbio->page_count; i++) {
+ WARN_ON(!sbio->pagev[i]->page);
+ scrub_block_put(sbio->pagev[i]->sblock);
+ }
+ bio_put(sbio->bio);
+ }
+
+ for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+ struct scrub_bio *sbio = sctx->bios[i];
+
+ if (!sbio)
+ break;
+ kfree(sbio);
+ }
+
+ scrub_free_csums(sctx);
+ kfree(sctx);
+}
+
+static noinline_for_stack
+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
+{
+ struct scrub_ctx *sctx;
+ int i;
+ struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+ int pages_per_rd_bio;
+ int ret;
+
+ /*
+ * the setting of pages_per_rd_bio is correct for scrub but might
+ * be wrong for the dev_replace code where we might read from
+ * different devices in the initial huge bios. However, that
+ * code is able to correctly handle the case when adding a page
+ * to a bio fails.
+ */
+ if (dev->bdev)
+ pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+ bio_get_nr_vecs(dev->bdev));
+ else
+ pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+ sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+ if (!sctx)
+ goto nomem;
+ sctx->is_dev_replace = is_dev_replace;
+ sctx->pages_per_rd_bio = pages_per_rd_bio;
+ sctx->curr = -1;
+ sctx->dev_root = dev->dev_root;
+ for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+ struct scrub_bio *sbio;
+
+ sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
+ if (!sbio)
+ goto nomem;
+ sctx->bios[i] = sbio;
+
+ sbio->index = i;
+ sbio->sctx = sctx;
+ sbio->page_count = 0;
+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
+ NULL, NULL);
+
+ if (i != SCRUB_BIOS_PER_SCTX - 1)
+ sctx->bios[i]->next_free = i + 1;
+ else
+ sctx->bios[i]->next_free = -1;
+ }
+ sctx->first_free = 0;
+ sctx->nodesize = dev->dev_root->nodesize;
+ sctx->leafsize = dev->dev_root->leafsize;
+ sctx->sectorsize = dev->dev_root->sectorsize;
+ atomic_set(&sctx->bios_in_flight, 0);
+ atomic_set(&sctx->workers_pending, 0);
+ atomic_set(&sctx->cancel_req, 0);
+ sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ INIT_LIST_HEAD(&sctx->csum_list);
+
+ spin_lock_init(&sctx->list_lock);
+ spin_lock_init(&sctx->stat_lock);
+ init_waitqueue_head(&sctx->list_wait);
+
+ ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+ fs_info->dev_replace.tgtdev, is_dev_replace);
+ if (ret) {
+ scrub_free_ctx(sctx);
+ return ERR_PTR(ret);
+ }
+ return sctx;
+
+nomem:
+ scrub_free_ctx(sctx);
+ return ERR_PTR(-ENOMEM);
+}
+
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+ void *warn_ctx)
+{
+ u64 isize;
+ u32 nlink;
+ int ret;
+ int i;
+ struct extent_buffer *eb;
+ struct btrfs_inode_item *inode_item;
+ struct scrub_warning *swarn = warn_ctx;
+ struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
+ struct inode_fs_paths *ipath = NULL;
+ struct btrfs_root *local_root;
+ struct btrfs_key root_key;
+
+ root_key.objectid = root;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+ local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ if (IS_ERR(local_root)) {
+ ret = PTR_ERR(local_root);
+ goto err;
+ }
+
+ ret = inode_item_info(inum, 0, local_root, swarn->path);
+ if (ret) {
+ btrfs_release_path(swarn->path);
+ goto err;
+ }
+
+ eb = swarn->path->nodes[0];
+ inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
+ struct btrfs_inode_item);
+ isize = btrfs_inode_size(eb, inode_item);
+ nlink = btrfs_inode_nlink(eb, inode_item);
+ btrfs_release_path(swarn->path);
+
+ ipath = init_ipath(4096, local_root, swarn->path);
+ if (IS_ERR(ipath)) {
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ goto err;
+ }
+ ret = paths_from_inode(inum, ipath);
+
+ if (ret < 0)
+ goto err;
+
+ /*
+	 * we deliberately ignore the fact that ipath might have been too
+	 * small to hold all of the paths here
+ */
+ for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+ printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ "%s, sector %llu, root %llu, inode %llu, offset %llu, "
+ "length %llu, links %u (path: %s)\n", swarn->errstr,
+ swarn->logical, rcu_str_deref(swarn->dev->name),
+ (unsigned long long)swarn->sector, root, inum, offset,
+ min(isize - offset, (u64)PAGE_SIZE), nlink,
+ (char *)(unsigned long)ipath->fspath->val[i]);
+
+ free_ipath(ipath);
+ return 0;
+
+err:
+ printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
+ "resolving failed with ret=%d\n", swarn->errstr,
+ swarn->logical, rcu_str_deref(swarn->dev->name),
+ (unsigned long long)swarn->sector, root, inum, offset, ret);
+
+ free_ipath(ipath);
+ return 0;
+}
+
+static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+{
+ struct btrfs_device *dev;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct scrub_warning swarn;
+ unsigned long ptr = 0;
+ u64 extent_item_pos;
+ u64 flags = 0;
+ u64 ref_root;
+ u32 item_size;
+ u8 ref_level;
+ const int bufsize = 4096;
+ int ret;
+
+ WARN_ON(sblock->page_count < 1);
+ dev = sblock->pagev[0]->dev;
+ fs_info = sblock->sctx->dev_root->fs_info;
+
+ path = btrfs_alloc_path();
+
+ swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
+ swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
+ swarn.sector = (sblock->pagev[0]->physical) >> 9;
+ swarn.logical = sblock->pagev[0]->logical;
+ swarn.errstr = errstr;
+ swarn.dev = NULL;
+ swarn.msg_bufsize = bufsize;
+ swarn.scratch_bufsize = bufsize;
+
+ if (!path || !swarn.scratch_buf || !swarn.msg_buf)
+ goto out;
+
+ ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
+ &flags);
+ if (ret < 0)
+ goto out;
+
+ extent_item_pos = swarn.logical - found_key.objectid;
+ swarn.extent_item_size = found_key.offset;
+
+ eb = path->nodes[0];
+ ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+ item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ do {
+ ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
+ item_size, &ref_root,
+ &ref_level);
+ printk_in_rcu(KERN_WARNING
+ "BTRFS: %s at logical %llu on dev %s, "
+ "sector %llu: metadata %s (level %d) in tree "
+ "%llu\n", errstr, swarn.logical,
+ rcu_str_deref(dev->name),
+ (unsigned long long)swarn.sector,
+ ref_level ? "node" : "leaf",
+ ret < 0 ? -1 : ref_level,
+ ret < 0 ? -1 : ref_root);
+ } while (ret != 1);
+ btrfs_release_path(path);
+ } else {
+ btrfs_release_path(path);
+ swarn.path = path;
+ swarn.dev = dev;
+ iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, 1,
+ scrub_print_warning_inode, &swarn);
+ }
+
+out:
+ btrfs_free_path(path);
+ kfree(swarn.scratch_buf);
+ kfree(swarn.msg_buf);
+}
+
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
+{
+ struct page *page = NULL;
+ unsigned long index;
+ struct scrub_fixup_nodatasum *fixup = fixup_ctx;
+ int ret;
+ int corrected = 0;
+ struct btrfs_key key;
+ struct inode *inode = NULL;
+ struct btrfs_fs_info *fs_info;
+ u64 end = offset + PAGE_SIZE - 1;
+ struct btrfs_root *local_root;
+ int srcu_index;
+
+ key.objectid = root;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ fs_info = fixup->root->fs_info;
+ srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(local_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ return PTR_ERR(local_root);
+ }
+
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.objectid = inum;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ index = offset >> PAGE_CACHE_SHIFT;
+
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (PageUptodate(page)) {
+ if (PageDirty(page)) {
+ /*
+			 * we need to write the data to the defective sector.
+			 * the data that was in that sector is not in memory,
+ * because the page was modified. we must not write the
+ * modified page to that sector.
+ *
+ * TODO: what could be done here: wait for the delalloc
+ * runner to write out that page (might involve
+ * COW) and see whether the sector is still
+ * referenced afterwards.
+ *
+			 * For the time being, we'll treat this error as
+			 * uncorrectable, although there is a chance that a
+			 * later scrub will find the bad sector again and that
+			 * there will be no dirty page in memory then.
+ */
+ ret = -EIO;
+ goto out;
+ }
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
+ fixup->logical, page,
+ fixup->mirror_num);
+ unlock_page(page);
+ corrected = !ret;
+ } else {
+ /*
+ * we need to get good data first. the general readpage path
+ * will call repair_io_failure for us, we just have to make
+ * sure we read the bad mirror.
+ */
+ ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+ EXTENT_DAMAGED, GFP_NOFS);
+ if (ret) {
+ /* set_extent_bits should give proper error */
+ WARN_ON(ret > 0);
+ if (ret > 0)
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+ btrfs_get_extent,
+ fixup->mirror_num);
+ wait_on_page_locked(page);
+
+ corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
+ end, EXTENT_DAMAGED, 0, NULL);
+ if (!corrected)
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+ EXTENT_DAMAGED, GFP_NOFS);
+ }
+
+out:
+ if (page)
+ put_page(page);
+
+ iput(inode);
+
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0 && corrected) {
+ /*
+		 * we only need to call readpage for one of the inodes belonging
+		 * to this extent, so make iterate_extent_inodes stop
+ */
+ return 1;
+ }
+
+ return -EIO;
+}
+
+static void scrub_fixup_nodatasum(struct btrfs_work *work)
+{
+ int ret;
+ struct scrub_fixup_nodatasum *fixup;
+ struct scrub_ctx *sctx;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_path *path;
+ int uncorrectable = 0;
+
+ fixup = container_of(work, struct scrub_fixup_nodatasum, work);
+ sctx = fixup->sctx;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.malloc_errors;
+ spin_unlock(&sctx->stat_lock);
+ uncorrectable = 1;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(fixup->root);
+ if (IS_ERR(trans)) {
+ uncorrectable = 1;
+ goto out;
+ }
+
+ /*
+ * the idea is to trigger a regular read through the standard path. we
+ * read a page from the (failed) logical address by specifying the
+ * corresponding copynum of the failed sector. thus, that readpage is
+ * expected to fail.
+ * that is the point where on-the-fly error correction will kick in
+ * (once it's finished) and rewrite the failed sector if a good copy
+ * can be found.
+ */
+ ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
+ path, scrub_fixup_readpage,
+ fixup);
+ if (ret < 0) {
+ uncorrectable = 1;
+ goto out;
+ }
+ WARN_ON(ret != 1);
+
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.corrected_errors;
+ spin_unlock(&sctx->stat_lock);
+
+out:
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans, fixup->root);
+ if (uncorrectable) {
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.uncorrectable_errors;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->fs_info->dev_replace.
+ num_uncorrectable_read_errors);
+ printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
+ "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+ fixup->logical, rcu_str_deref(fixup->dev->name));
+ }
+
+ btrfs_free_path(path);
+ kfree(fixup);
+
+ scrub_pending_trans_workers_dec(sctx);
+}
+
+/*
+ * scrub_handle_errored_block gets called when either verification of the
+ * pages failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all pages in the bio, even though only one
+ * may be bad.
+ * The goal of this function is to repair the errored block by using the
+ * contents of one of the mirrors.
+ */
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
+{
+ struct scrub_ctx *sctx = sblock_to_check->sctx;
+ struct btrfs_device *dev;
+ struct btrfs_fs_info *fs_info;
+ u64 length;
+ u64 logical;
+ u64 generation;
+ unsigned int failed_mirror_index;
+ unsigned int is_metadata;
+ unsigned int have_csum;
+ u8 *csum;
+ struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+ struct scrub_block *sblock_bad;
+ int ret;
+ int mirror_index;
+ int page_num;
+ int success;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ BUG_ON(sblock_to_check->page_count < 1);
+ fs_info = sctx->dev_root->fs_info;
+ if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ /*
+		 * if we find an error in a super block, we just report it;
+		 * super blocks get rewritten with the next transaction
+		 * commit anyway
+ */
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+ return 0;
+ }
+ length = sblock_to_check->page_count * PAGE_SIZE;
+ logical = sblock_to_check->pagev[0]->logical;
+ generation = sblock_to_check->pagev[0]->generation;
+ BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+ failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+ is_metadata = !(sblock_to_check->pagev[0]->flags &
+ BTRFS_EXTENT_FLAG_DATA);
+ have_csum = sblock_to_check->pagev[0]->have_csum;
+ csum = sblock_to_check->pagev[0]->csum;
+ dev = sblock_to_check->pagev[0]->dev;
+
+ if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+ sblocks_for_recheck = NULL;
+ goto nodatasum_case;
+ }
+
+ /*
+	 * read all mirrors one after the other. This includes
+	 * re-reading the extent or metadata block that failed (the
+	 * reason this fixup code was called) another time,
+	 * page by page this time in order to know which pages
+	 * caused I/O errors and which ones are good (for all mirrors).
+	 * The goal is to handle the situation when more than one
+ * mirror contains I/O errors, but the errors do not
+ * overlap, i.e. the data can be repaired by selecting the
+ * pages from those mirrors without I/O error on the
+ * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
+ * would be that mirror #1 has an I/O error on the first page,
+ * the second page is good, and mirror #2 has an I/O error on
+ * the second page, but the first page is good.
+ * Then the first page of the first mirror can be repaired by
+ * taking the first page of the second mirror, and the
+ * second page of the second mirror can be repaired by
+ * copying the contents of the 2nd page of the 1st mirror.
+ * One more note: if the pages of one mirror contain I/O
+ * errors, the checksum cannot be verified. In order to get
+ * the best data for repairing, the first attempt is to find
+ * a mirror without I/O errors and with a validated checksum.
+ * Only if this is not possible, the pages are picked from
+ * mirrors with I/O errors without considering the checksum.
+ * If the latter is the case, at the end, the checksum of the
+ * repaired area is verified in order to correctly maintain
+ * the statistics.
+ */
+
+ sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
+ sizeof(*sblocks_for_recheck),
+ GFP_NOFS);
+ if (!sblocks_for_recheck) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
+
+ /* setup the context, map the logical blocks and alloc the pages */
+ ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
+ logical, sblocks_for_recheck);
+ if (ret) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
+ BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
+ sblock_bad = sblocks_for_recheck + failed_mirror_index;
+
+ /* build and submit the bios for the failed mirror, check checksums */
+ scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+ csum, generation, sctx->csum_size);
+
+ if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen) {
+ /*
+ * the error disappeared after reading page by page, or
+ * the area was part of a huge bio and other parts of the
+ * bio caused I/O errors, or the block layer merged several
+ * read requests into one and the error is caused by a
+ * different bio (usually one of the two latter cases is
+ * the cause)
+ */
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.unverified_errors++;
+ spin_unlock(&sctx->stat_lock);
+
+ if (sctx->is_dev_replace)
+ scrub_write_block_to_dev_replace(sblock_bad);
+ goto out;
+ }
+
+ if (!sblock_bad->no_io_error_seen) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("i/o error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ } else if (sblock_bad->checksum_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.csum_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("checksum error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ } else if (sblock_bad->header_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.verify_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("checksum/header error",
+ sblock_to_check);
+ if (sblock_bad->generation_error)
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS);
+ else
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ }
+
+ if (sctx->readonly) {
+ ASSERT(!sctx->is_dev_replace);
+ goto out;
+ }
+
+ if (!is_metadata && !have_csum) {
+ struct scrub_fixup_nodatasum *fixup_nodatasum;
+
+nodatasum_case:
+ WARN_ON(sctx->is_dev_replace);
+
+ /*
+		 * !is_metadata and !have_csum: this means that the data
+		 * might not be COW'ed, i.e. it might be modified
+		 * concurrently. The general strategy of working on the
+		 * commit root does not help in the case when COW is not
+		 * used.
+ */
+ fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
+ if (!fixup_nodatasum)
+ goto did_not_correct_error;
+ fixup_nodatasum->sctx = sctx;
+ fixup_nodatasum->dev = dev;
+ fixup_nodatasum->logical = logical;
+ fixup_nodatasum->root = fs_info->extent_root;
+ fixup_nodatasum->mirror_num = failed_mirror_index + 1;
+ scrub_pending_trans_workers_inc(sctx);
+ btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
+ NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_workers,
+ &fixup_nodatasum->work);
+ goto out;
+ }
+
+ /*
+ * now build and submit the bios for the other mirrors, check
+ * checksums.
+ * First try to pick the mirror which is completely without I/O
+ * errors and also does not have a checksum error.
+ * If one is found, and if a checksum is present, the full block
+ * that is known to contain an error is rewritten. Afterwards
+ * the block is known to be corrected.
+ * If a mirror is found which is completely correct, and no
+ * checksum is present, only those pages are rewritten that had
+ * an I/O error in the block to be repaired, since it cannot be
+ * determined, which copy of the other pages is better (and it
+ * could happen otherwise that a correct page would be
+ * overwritten by a bad one).
+ */
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ struct scrub_block *sblock_other;
+
+ if (mirror_index == failed_mirror_index)
+ continue;
+ sblock_other = sblocks_for_recheck + mirror_index;
+
+ /* build and submit the bios, check checksums */
+ scrub_recheck_block(fs_info, sblock_other, is_metadata,
+ have_csum, csum, generation,
+ sctx->csum_size);
+
+ if (!sblock_other->header_error &&
+ !sblock_other->checksum_error &&
+ sblock_other->no_io_error_seen) {
+ if (sctx->is_dev_replace) {
+ scrub_write_block_to_dev_replace(sblock_other);
+ } else {
+ int force_write = is_metadata || have_csum;
+
+ ret = scrub_repair_block_from_good_copy(
+ sblock_bad, sblock_other,
+ force_write);
+ }
+ if (0 == ret)
+ goto corrected_error;
+ }
+ }
+
+ /*
+ * for dev_replace, pick good pages and write to the target device.
+ */
+ if (sctx->is_dev_replace) {
+ success = 1;
+ for (page_num = 0; page_num < sblock_bad->page_count;
+ page_num++) {
+ int sub_success;
+
+ sub_success = 0;
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ struct scrub_block *sblock_other =
+ sblocks_for_recheck + mirror_index;
+ struct scrub_page *page_other =
+ sblock_other->pagev[page_num];
+
+ if (!page_other->io_error) {
+ ret = scrub_write_page_to_dev_replace(
+ sblock_other, page_num);
+ if (ret == 0) {
+ /* succeeded for this page */
+ sub_success = 1;
+ break;
+ } else {
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->
+ fs_info->dev_replace.
+ num_write_errors);
+ }
+ }
+ }
+
+ if (!sub_success) {
+ /*
+ * did not find a mirror to fetch the page
+ * from. scrub_write_page_to_dev_replace()
+ * handles this case (page->io_error), by
+ * filling the block with zeros before
+ * submitting the write request
+ */
+ success = 0;
+ ret = scrub_write_page_to_dev_replace(
+ sblock_bad, page_num);
+ if (ret)
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->fs_info->
+ dev_replace.num_write_errors);
+ }
+ }
+
+ goto out;
+ }
+
+ /*
+ * for regular scrub, repair those pages that are errored.
+ * In case of I/O errors in the area that is supposed to be
+ * repaired, continue by picking good copies of those pages.
+ * Select the good pages from mirrors to rewrite bad pages from
+ * the area to fix. Afterwards verify the checksum of the block
+ * that is supposed to be repaired. This verification step is
+ * only done for the purpose of statistic counting and for the
+ * final scrub report, whether errors remain.
+ * A perfect algorithm could make use of the checksum and try
+ * all possible combinations of pages from the different mirrors
+ * until the checksum verification succeeds. For example, when
+ * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+ * of mirror #2 is readable but the final checksum test fails,
+	 * then the 2nd page of mirror #3 could be tried to see whether
+	 * the final checksum now succeeds. But this would be a rare
+	 * exception and is therefore not implemented. At least it is
+	 * avoided that the good copy is overwritten.
+	 * A more useful improvement would be to pick the sectors
+	 * without I/O error based on sector sizes (512 bytes on legacy
+	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
+	 * mirror could be repaired by taking 512 bytes from a different
+	 * mirror, even if other 512-byte sectors in the same PAGE_SIZE
+	 * area are unreadable.
+ */
+
+ /* can only fix I/O errors from here on */
+ if (sblock_bad->no_io_error_seen)
+ goto did_not_correct_error;
+
+ success = 1;
+ for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+
+ if (!page_bad->io_error)
+ continue;
+
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ struct scrub_block *sblock_other = sblocks_for_recheck +
+ mirror_index;
+ struct scrub_page *page_other = sblock_other->pagev[
+ page_num];
+
+ if (!page_other->io_error) {
+ ret = scrub_repair_page_from_good_copy(
+ sblock_bad, sblock_other, page_num, 0);
+ if (0 == ret) {
+ page_bad->io_error = 0;
+ break; /* succeeded for this page */
+ }
+ }
+ }
+
+ if (page_bad->io_error) {
+ /* did not find a mirror to copy the page from */
+ success = 0;
+ }
+ }
+
+ if (success) {
+ if (is_metadata || have_csum) {
+ /*
+ * need to verify the checksum now that all
+ * sectors on disk are repaired (the write
+ * request for data to be repaired is on its way).
+ * Just be lazy and use scrub_recheck_block()
+ * which re-reads the data before the checksum
+ * is verified, but most likely the data comes out
+ * of the page cache.
+ */
+ scrub_recheck_block(fs_info, sblock_bad,
+ is_metadata, have_csum, csum,
+ generation, sctx->csum_size);
+ if (!sblock_bad->header_error &&
+ !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen)
+ goto corrected_error;
+ else
+ goto did_not_correct_error;
+ } else {
+corrected_error:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.corrected_errors++;
+ spin_unlock(&sctx->stat_lock);
+ printk_ratelimited_in_rcu(KERN_ERR
+ "BTRFS: fixed up error at logical %llu on dev %s\n",
+ logical, rcu_str_deref(dev->name));
+ }
+ } else {
+did_not_correct_error:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ printk_ratelimited_in_rcu(KERN_ERR
+ "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
+ logical, rcu_str_deref(dev->name));
+ }
+
+out:
+ if (sblocks_for_recheck) {
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
+ mirror_index++) {
+ struct scrub_block *sblock = sblocks_for_recheck +
+ mirror_index;
+ int page_index;
+
+ for (page_index = 0; page_index < sblock->page_count;
+ page_index++) {
+ sblock->pagev[page_index]->sblock = NULL;
+ scrub_page_put(sblock->pagev[page_index]);
+ }
+ }
+ kfree(sblocks_for_recheck);
+ }
+
+ return 0;
+}
+
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+ struct btrfs_fs_info *fs_info,
+ struct scrub_block *original_sblock,
+ u64 length, u64 logical,
+ struct scrub_block *sblocks_for_recheck)
+{
+ int page_index;
+ int mirror_index;
+ int ret;
+
+ /*
+ * note: the two members ref_count and outstanding_pages
+ * are not used (and not set) in the blocks that are used for
+ * the recheck procedure
+ */
+
+ page_index = 0;
+ while (length > 0) {
+ u64 sublen = min_t(u64, length, PAGE_SIZE);
+ u64 mapped_length = sublen;
+ struct btrfs_bio *bbio = NULL;
+
+ /*
+ * with a length of PAGE_SIZE, each returned stripe
+ * represents one mirror
+ */
+ ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
+ &mapped_length, &bbio, 0);
+ if (ret || !bbio || mapped_length < sublen) {
+ kfree(bbio);
+ return -EIO;
+ }
+
+ BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
+ for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+ mirror_index++) {
+ struct scrub_block *sblock;
+ struct scrub_page *page;
+
+ if (mirror_index >= BTRFS_MAX_MIRRORS)
+ continue;
+
+ sblock = sblocks_for_recheck + mirror_index;
+ sblock->sctx = sctx;
+ page = kzalloc(sizeof(*page), GFP_NOFS);
+ if (!page) {
+leave_nomem:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ kfree(bbio);
+ return -ENOMEM;
+ }
+ scrub_page_get(page);
+ sblock->pagev[page_index] = page;
+ page->logical = logical;
+ page->physical = bbio->stripes[mirror_index].physical;
+ BUG_ON(page_index >= original_sblock->page_count);
+ page->physical_for_dev_replace =
+ original_sblock->pagev[page_index]->
+ physical_for_dev_replace;
+ /* for missing devices, dev->bdev is NULL */
+ page->dev = bbio->stripes[mirror_index].dev;
+ page->mirror_num = mirror_index + 1;
+ sblock->page_count++;
+ page->page = alloc_page(GFP_NOFS);
+ if (!page->page)
+ goto leave_nomem;
+ }
+ kfree(bbio);
+ length -= sublen;
+ logical += sublen;
+ page_index++;
+ }
+
+ return 0;
+}
+
+/*
+ * this function will check the on disk data for checksum errors, header
+ * errors and read I/O errors. If any I/O errors happen, the exact pages
+ * which are errored are marked as being bad. The goal is to enable scrub
+ * to take those pages that are not errored from all the mirrors so that
+ * the pages that are errored in the just handled mirror can be repaired.
+ */
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size)
+{
+ int page_num;
+
+ sblock->no_io_error_seen = 1;
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
+
+ for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ struct bio *bio;
+ struct scrub_page *page = sblock->pagev[page_num];
+
+ if (page->dev->bdev == NULL) {
+ page->io_error = 1;
+ sblock->no_io_error_seen = 0;
+ continue;
+ }
+
+ WARN_ON(!page->page);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio) {
+ page->io_error = 1;
+ sblock->no_io_error_seen = 0;
+ continue;
+ }
+ bio->bi_bdev = page->dev->bdev;
+ bio->bi_iter.bi_sector = page->physical >> 9;
+
+ bio_add_page(bio, page->page, PAGE_SIZE, 0);
+ if (btrfsic_submit_bio_wait(READ, bio))
+ sblock->no_io_error_seen = 0;
+
+ bio_put(bio);
+ }
+
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+ have_csum, csum, generation,
+ csum_size);
+
+ return;
+}
+
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int is_metadata, int have_csum,
+ const u8 *csum, u64 generation,
+ u16 csum_size)
+{
+ int page_num;
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ void *mapped_buffer;
+
+ WARN_ON(!sblock->pagev[0]->page);
+ if (is_metadata) {
+ struct btrfs_header *h;
+
+ mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
+ h = (struct btrfs_header *)mapped_buffer;
+
+ if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
+ memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
+ memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+ BTRFS_UUID_SIZE)) {
+ sblock->header_error = 1;
+ } else if (generation != btrfs_stack_header_generation(h)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ }
+ csum = h->csum;
+ } else {
+ if (!have_csum)
+ return;
+
+ mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
+ }
+
+ for (page_num = 0;;) {
+ if (page_num == 0 && is_metadata)
+ crc = btrfs_csum_data(
+ ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
+ crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
+ else
+ crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
+
+ kunmap_atomic(mapped_buffer);
+ page_num++;
+ if (page_num >= sblock->page_count)
+ break;
+ WARN_ON(!sblock->pagev[page_num]->page);
+
+ mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, csum, csum_size))
+ sblock->checksum_error = 1;
+}
+
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int force_write)
+{
+ int page_num;
+ int ret = 0;
+
+ for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ int ret_sub;
+
+ ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
+ sblock_good,
+ page_num,
+ force_write);
+ if (ret_sub)
+ ret = ret_sub;
+ }
+
+ return ret;
+}
+
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int page_num, int force_write)
+{
+ struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_page *page_good = sblock_good->pagev[page_num];
+
+ BUG_ON(page_bad->page == NULL);
+ BUG_ON(page_good->page == NULL);
+ if (force_write || sblock_bad->header_error ||
+ sblock_bad->checksum_error || page_bad->io_error) {
+ struct bio *bio;
+ int ret;
+
+ if (!page_bad->dev->bdev) {
+ printk_ratelimited(KERN_WARNING "BTRFS: "
+ "scrub_repair_page_from_good_copy(bdev == NULL) "
+ "is unexpected!\n");
+ return -EIO;
+ }
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio)
+ return -EIO;
+ bio->bi_bdev = page_bad->dev->bdev;
+ bio->bi_iter.bi_sector = page_bad->physical >> 9;
+
+ ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+ if (PAGE_SIZE != ret) {
+ bio_put(bio);
+ return -EIO;
+ }
+
+ if (btrfsic_submit_bio_wait(WRITE, bio)) {
+ btrfs_dev_stat_inc_and_print(page_bad->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ btrfs_dev_replace_stats_inc(
+ &sblock_bad->sctx->dev_root->fs_info->
+ dev_replace.num_write_errors);
+ bio_put(bio);
+ return -EIO;
+ }
+ bio_put(bio);
+ }
+
+ return 0;
+}
+
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+ int page_num;
+
+ for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ int ret;
+
+ ret = scrub_write_page_to_dev_replace(sblock, page_num);
+ if (ret)
+ btrfs_dev_replace_stats_inc(
+ &sblock->sctx->dev_root->fs_info->dev_replace.
+ num_write_errors);
+ }
+}
+
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+ int page_num)
+{
+ struct scrub_page *spage = sblock->pagev[page_num];
+
+ BUG_ON(spage->page == NULL);
+ if (spage->io_error) {
+ void *mapped_buffer = kmap_atomic(spage->page);
+
+ memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+ flush_dcache_page(spage->page);
+ kunmap_atomic(mapped_buffer);
+ }
+ return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+}
+
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage)
+{
+ struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+ struct scrub_bio *sbio;
+ int ret;
+
+ mutex_lock(&wr_ctx->wr_lock);
+again:
+ if (!wr_ctx->wr_curr_bio) {
+ wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+ GFP_NOFS);
+ if (!wr_ctx->wr_curr_bio) {
+ mutex_unlock(&wr_ctx->wr_lock);
+ return -ENOMEM;
+ }
+ wr_ctx->wr_curr_bio->sctx = sctx;
+ wr_ctx->wr_curr_bio->page_count = 0;
+ }
+ sbio = wr_ctx->wr_curr_bio;
+ if (sbio->page_count == 0) {
+ struct bio *bio;
+
+ sbio->physical = spage->physical_for_dev_replace;
+ sbio->logical = spage->logical;
+ sbio->dev = wr_ctx->tgtdev;
+ bio = sbio->bio;
+ if (!bio) {
+ bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+ if (!bio) {
+ mutex_unlock(&wr_ctx->wr_lock);
+ return -ENOMEM;
+ }
+ sbio->bio = bio;
+ }
+
+ bio->bi_private = sbio;
+ bio->bi_end_io = scrub_wr_bio_end_io;
+ bio->bi_bdev = sbio->dev->bdev;
+ bio->bi_iter.bi_sector = sbio->physical >> 9;
+ sbio->err = 0;
+ } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ spage->physical_for_dev_replace ||
+ sbio->logical + sbio->page_count * PAGE_SIZE !=
+ spage->logical) {
+ scrub_wr_submit(sctx);
+ goto again;
+ }
+
+ ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
+ if (sbio->page_count < 1) {
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ mutex_unlock(&wr_ctx->wr_lock);
+ return -EIO;
+ }
+ scrub_wr_submit(sctx);
+ goto again;
+ }
+
+ sbio->pagev[sbio->page_count] = spage;
+ scrub_page_get(spage);
+ sbio->page_count++;
+ if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+ scrub_wr_submit(sctx);
+ mutex_unlock(&wr_ctx->wr_lock);
+
+ return 0;
+}
+
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+ struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+ struct scrub_bio *sbio;
+
+ if (!wr_ctx->wr_curr_bio)
+ return;
+
+ sbio = wr_ctx->wr_curr_bio;
+ wr_ctx->wr_curr_bio = NULL;
+ WARN_ON(!sbio->bio->bi_bdev);
+ scrub_pending_bio_inc(sctx);
+ /* process all writes in a single worker thread. Then the block layer
+	 * orders the requests before sending them to the driver, which
+ * doubled the write performance on spinning disks when measured
+ * with Linux 3.5 */
+ btrfsic_submit_bio(WRITE, sbio->bio);
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio, int err)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+ sbio->err = err;
+ sbio->bio = bio;
+
+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
+}
+
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_ctx *sctx = sbio->sctx;
+ int i;
+
+ WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+ if (sbio->err) {
+ struct btrfs_dev_replace *dev_replace =
+ &sbio->sctx->dev_root->fs_info->dev_replace;
+
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+
+ spage->io_error = 1;
+ btrfs_dev_replace_stats_inc(&dev_replace->
+ num_write_errors);
+ }
+ }
+
+ for (i = 0; i < sbio->page_count; i++)
+ scrub_page_put(sbio->pagev[i]);
+
+ bio_put(sbio->bio);
+ kfree(sbio);
+ scrub_pending_bio_dec(sctx);
+}
+
+static int scrub_checksum(struct scrub_block *sblock)
+{
+ u64 flags;
+ int ret;
+
+ WARN_ON(sblock->page_count < 1);
+ flags = sblock->pagev[0]->flags;
+ ret = 0;
+ if (flags & BTRFS_EXTENT_FLAG_DATA)
+ ret = scrub_checksum_data(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ ret = scrub_checksum_tree_block(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_SUPER)
+ (void)scrub_checksum_super(sblock);
+ else
+ WARN_ON(1);
+ if (ret)
+ scrub_handle_errored_block(sblock);
+
+ return ret;
+}
+
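+/*
+ * The three checksum helpers below share one pattern: kmap each page of the
+ * block in turn, feed it to btrfs_csum_data(), finalize the crc with
+ * btrfs_csum_final() and compare the result against the on-disk checksum.
+ */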
+static int scrub_checksum_data(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u8 *on_disk_csum;
+ struct page *page;
+ void *buffer;
+ u32 crc = ~(u32)0;
+ int fail = 0;
+ u64 len;
+ int index;
+
+ BUG_ON(sblock->page_count < 1);
+ if (!sblock->pagev[0]->have_csum)
+ return 0;
+
+ on_disk_csum = sblock->pagev[0]->csum;
+ page = sblock->pagev[0]->page;
+ buffer = kmap_atomic(page);
+
+ len = sctx->sectorsize;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ crc = btrfs_csum_data(buffer, crc, l);
+ kunmap_atomic(buffer);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
+ buffer = kmap_atomic(page);
+ }
+
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, on_disk_csum, sctx->csum_size))
+ fail = 1;
+
+ return fail;
+}
+
+static int scrub_checksum_tree_block(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_header *h;
+ struct btrfs_root *root = sctx->dev_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ struct page *page;
+ void *mapped_buffer;
+ u64 mapped_size;
+ void *p;
+ u32 crc = ~(u32)0;
+ int fail = 0;
+ int crc_fail = 0;
+ u64 len;
+ int index;
+
+ BUG_ON(sblock->page_count < 1);
+ page = sblock->pagev[0]->page;
+ mapped_buffer = kmap_atomic(page);
+ h = (struct btrfs_header *)mapped_buffer;
+ memcpy(on_disk_csum, h->csum, sctx->csum_size);
+
+ /*
+ * we don't use the getter functions here, as we
+ * a) don't have an extent buffer and
+ * b) the page is already kmapped
+ */
+
+ if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
+ ++fail;
+
+ if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
+ ++fail;
+
+ if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ ++fail;
+
+ if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+ BTRFS_UUID_SIZE))
+ ++fail;
+
+ WARN_ON(sctx->nodesize != sctx->leafsize);
+ len = sctx->nodesize - BTRFS_CSUM_SIZE;
+ mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+ p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, mapped_size);
+
+ crc = btrfs_csum_data(p, crc, l);
+ kunmap_atomic(mapped_buffer);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
+ mapped_buffer = kmap_atomic(page);
+ mapped_size = PAGE_SIZE;
+ p = mapped_buffer;
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ ++crc_fail;
+
+ return fail || crc_fail;
+}
+
+static int scrub_checksum_super(struct scrub_block *sblock)
+{
+ struct btrfs_super_block *s;
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_root *root = sctx->dev_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ struct page *page;
+ void *mapped_buffer;
+ u64 mapped_size;
+ void *p;
+ u32 crc = ~(u32)0;
+ int fail_gen = 0;
+ int fail_cor = 0;
+ u64 len;
+ int index;
+
+ BUG_ON(sblock->page_count < 1);
+ page = sblock->pagev[0]->page;
+ mapped_buffer = kmap_atomic(page);
+ s = (struct btrfs_super_block *)mapped_buffer;
+ memcpy(on_disk_csum, s->csum, sctx->csum_size);
+
+ if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
+ ++fail_cor;
+
+ if (sblock->pagev[0]->generation != btrfs_super_generation(s))
+ ++fail_gen;
+
+ if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ ++fail_cor;
+
+ len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
+ mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+ p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, mapped_size);
+
+ crc = btrfs_csum_data(p, crc, l);
+ kunmap_atomic(mapped_buffer);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
+ mapped_buffer = kmap_atomic(page);
+ mapped_size = PAGE_SIZE;
+ p = mapped_buffer;
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ ++fail_cor;
+
+ if (fail_cor + fail_gen) {
+ /*
+		 * if we find an error in a super block, we just report it;
+		 * super blocks get rewritten with the next transaction
+		 * commit anyway
+ */
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+ if (fail_cor)
+ btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ else
+ btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS);
+ }
+
+ return fail_cor + fail_gen;
+}
+
+static void scrub_block_get(struct scrub_block *sblock)
+{
+ atomic_inc(&sblock->ref_count);
+}
+
+static void scrub_block_put(struct scrub_block *sblock)
+{
+ if (atomic_dec_and_test(&sblock->ref_count)) {
+ int i;
+
+ for (i = 0; i < sblock->page_count; i++)
+ scrub_page_put(sblock->pagev[i]);
+ kfree(sblock);
+ }
+}
+
+static void scrub_page_get(struct scrub_page *spage)
+{
+ atomic_inc(&spage->ref_count);
+}
+
+static void scrub_page_put(struct scrub_page *spage)
+{
+ if (atomic_dec_and_test(&spage->ref_count)) {
+ if (spage->page)
+ __free_page(spage->page);
+ kfree(spage);
+ }
+}
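+
+/*
+ * Reference counting as implemented by the helpers above: a scrub_block owns
+ * one reference on each of its pages (taken in scrub_pages() and
+ * scrub_setup_recheck_block()); every page queued in a read bio additionally
+ * pins its block (scrub_add_page_to_rd_bio()), and every page queued in a
+ * write bio pins the page itself (scrub_add_page_to_wr_bio()).
+ */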
+
+static void scrub_submit(struct scrub_ctx *sctx)
+{
+ struct scrub_bio *sbio;
+
+ if (sctx->curr == -1)
+ return;
+
+ sbio = sctx->bios[sctx->curr];
+ sctx->curr = -1;
+ scrub_pending_bio_inc(sctx);
+
+ if (!sbio->bio->bi_bdev) {
+ /*
+ * this case should not happen. If btrfs_map_block() is
+ * wrong, it could happen for dev-replace operations on
+ * missing devices when no mirrors are available, but in
+ * this case it should already fail the mount.
+ * This case is handled correctly (but _very_ slowly).
+ */
+ printk_ratelimited(KERN_WARNING
+ "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
+ bio_endio(sbio->bio, -EIO);
+ } else {
+ btrfsic_submit_bio(READ, sbio->bio);
+ }
+}
+
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage)
+{
+ struct scrub_block *sblock = spage->sblock;
+ struct scrub_bio *sbio;
+ int ret;
+
+again:
+ /*
+ * grab a fresh bio or wait for one to become available
+ */
+ while (sctx->curr == -1) {
+ spin_lock(&sctx->list_lock);
+ sctx->curr = sctx->first_free;
+ if (sctx->curr != -1) {
+ sctx->first_free = sctx->bios[sctx->curr]->next_free;
+ sctx->bios[sctx->curr]->next_free = -1;
+ sctx->bios[sctx->curr]->page_count = 0;
+ spin_unlock(&sctx->list_lock);
+ } else {
+ spin_unlock(&sctx->list_lock);
+ wait_event(sctx->list_wait, sctx->first_free != -1);
+ }
+ }
+ sbio = sctx->bios[sctx->curr];
+ if (sbio->page_count == 0) {
+ struct bio *bio;
+
+ sbio->physical = spage->physical;
+ sbio->logical = spage->logical;
+ sbio->dev = spage->dev;
+ bio = sbio->bio;
+ if (!bio) {
+ bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
+ if (!bio)
+ return -ENOMEM;
+ sbio->bio = bio;
+ }
+
+ bio->bi_private = sbio;
+ bio->bi_end_io = scrub_bio_end_io;
+ bio->bi_bdev = sbio->dev->bdev;
+ bio->bi_iter.bi_sector = sbio->physical >> 9;
+ sbio->err = 0;
+ } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ spage->physical ||
+ sbio->logical + sbio->page_count * PAGE_SIZE !=
+ spage->logical ||
+ sbio->dev != spage->dev) {
+ scrub_submit(sctx);
+ goto again;
+ }
+
+ sbio->pagev[sbio->page_count] = spage;
+ ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
+ if (sbio->page_count < 1) {
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ return -EIO;
+ }
+ scrub_submit(sctx);
+ goto again;
+ }
+
+ scrub_block_get(sblock); /* one for the page added to the bio */
+ atomic_inc(&sblock->outstanding_pages);
+ sbio->page_count++;
+ if (sbio->page_count == sctx->pages_per_rd_bio)
+ scrub_submit(sctx);
+
+ return 0;
+}
+
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum, int force,
+ u64 physical_for_dev_replace)
+{
+ struct scrub_block *sblock;
+ int index;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+ if (!sblock) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ /* one ref inside this function, plus one for each page added to
+ * a bio later on */
+ atomic_set(&sblock->ref_count, 1);
+ sblock->sctx = sctx;
+ sblock->no_io_error_seen = 1;
+
+ for (index = 0; len > 0; index++) {
+ struct scrub_page *spage;
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ if (!spage) {
+leave_nomem:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_block_put(sblock);
+ return -ENOMEM;
+ }
+ BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ scrub_page_get(spage);
+ sblock->pagev[index] = spage;
+ spage->sblock = sblock;
+ spage->dev = dev;
+ spage->flags = flags;
+ spage->generation = gen;
+ spage->logical = logical;
+ spage->physical = physical;
+ spage->physical_for_dev_replace = physical_for_dev_replace;
+ spage->mirror_num = mirror_num;
+ if (csum) {
+ spage->have_csum = 1;
+ memcpy(spage->csum, csum, sctx->csum_size);
+ } else {
+ spage->have_csum = 0;
+ }
+ sblock->page_count++;
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page)
+ goto leave_nomem;
+ len -= l;
+ logical += l;
+ physical += l;
+ physical_for_dev_replace += l;
+ }
+
+ WARN_ON(sblock->page_count == 0);
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev[index];
+ int ret;
+
+ ret = scrub_add_page_to_rd_bio(sctx, spage);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
+ }
+
+ if (force)
+ scrub_submit(sctx);
+
+ /* last one frees, either here or in bio completion for last page */
+ scrub_block_put(sblock);
+ return 0;
+}
+
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+ sbio->err = err;
+ sbio->bio = bio;
+
+ btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
+}
+
+static void scrub_bio_end_io_worker(struct btrfs_work *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_ctx *sctx = sbio->sctx;
+ int i;
+
+ BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
+ if (sbio->err) {
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+
+ spage->io_error = 1;
+ spage->sblock->no_io_error_seen = 0;
+ }
+ }
+
+ /* now complete the scrub_block items that have all pages completed */
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+ struct scrub_block *sblock = spage->sblock;
+
+ if (atomic_dec_and_test(&sblock->outstanding_pages))
+ scrub_block_complete(sblock);
+ scrub_block_put(sblock);
+ }
+
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ spin_lock(&sctx->list_lock);
+ sbio->next_free = sctx->first_free;
+ sctx->first_free = sbio->index;
+ spin_unlock(&sctx->list_lock);
+
+ if (sctx->is_dev_replace &&
+ atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+ }
+
+ scrub_pending_bio_dec(sctx);
+}
+
+static void scrub_block_complete(struct scrub_block *sblock)
+{
+ if (!sblock->no_io_error_seen) {
+ scrub_handle_errored_block(sblock);
+ } else {
+ /*
+		 * if the block has a checksum error, the repair code writes
+		 * it out in the dev replace case; otherwise it is written
+		 * here, also only in the dev replace case.
+ */
+ if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+ scrub_write_block_to_dev_replace(sblock);
+ }
+}
+
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u8 *csum)
+{
+ struct btrfs_ordered_sum *sum = NULL;
+ unsigned long index;
+ unsigned long num_sectors;
+
+ while (!list_empty(&sctx->csum_list)) {
+ sum = list_first_entry(&sctx->csum_list,
+ struct btrfs_ordered_sum, list);
+ if (sum->bytenr > logical)
+ return 0;
+ if (sum->bytenr + sum->len > logical)
+ break;
+
+ ++sctx->stat.csum_discards;
+ list_del(&sum->list);
+ kfree(sum);
+ sum = NULL;
+ }
+ if (!sum)
+ return 0;
+
+ index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
+ num_sectors = sum->len / sctx->sectorsize;
+ memcpy(csum, sum->sums + index, sctx->csum_size);
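+	/* this was the last checksum of this ordered sum item, drop it */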
+ if (index == num_sectors - 1) {
+ list_del(&sum->list);
+ kfree(sum);
+ }
+ return 1;
+}
+
+/* scrub extent tries to collect up to 64 kB for each bio */
+static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u64 physical_for_dev_replace)
+{
+ int ret;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 blocksize;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ blocksize = sctx->sectorsize;
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.data_extents_scrubbed++;
+ sctx->stat.data_bytes_scrubbed += len;
+ spin_unlock(&sctx->stat_lock);
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ WARN_ON(sctx->nodesize != sctx->leafsize);
+ blocksize = sctx->nodesize;
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.tree_extents_scrubbed++;
+ sctx->stat.tree_bytes_scrubbed += len;
+ spin_unlock(&sctx->stat_lock);
+ } else {
+ blocksize = sctx->sectorsize;
+ WARN_ON(1);
+ }
+
+ while (len) {
+ u64 l = min_t(u64, len, blocksize);
+ int have_csum = 0;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ /* push csums to sbio */
+ have_csum = scrub_find_csum(sctx, logical, l, csum);
+ if (have_csum == 0)
+ ++sctx->stat.no_csum;
+ if (sctx->is_dev_replace && !have_csum) {
+ ret = copy_nocow_pages(sctx, logical, l,
+ mirror_num,
+ physical_for_dev_replace);
+ goto behind_scrub_pages;
+ }
+ }
+ ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
+ mirror_num, have_csum ? csum : NULL, 0,
+ physical_for_dev_replace);
+behind_scrub_pages:
+ if (ret)
+ return ret;
+ len -= l;
+ logical += l;
+ physical += l;
+ physical_for_dev_replace += l;
+ }
+ return 0;
+}
+
+/*
+ * Given a physical address, this will calculate its
+ * logical offset. If this is a parity stripe, it will return
+ * the leftmost data stripe's logical offset.
+ *
+ * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
+ */
+static int get_raid56_logic_offset(u64 physical, int num,
+ struct map_lookup *map, u64 *offset)
+{
+ int i;
+ int j = 0;
+ u64 stripe_nr;
+ u64 last_offset;
+ int stripe_index;
+ int rot;
+
+ last_offset = (physical - map->stripes[num].physical) *
+ nr_data_stripes(map);
+ *offset = last_offset;
+ for (i = 0; i < nr_data_stripes(map); i++) {
+ *offset = last_offset + i * map->stripe_len;
+
+ stripe_nr = *offset;
+ do_div(stripe_nr, map->stripe_len);
+ do_div(stripe_nr, nr_data_stripes(map));
+
+ /* Work out the disk rotation on this stripe-set */
+ rot = do_div(stripe_nr, map->num_stripes);
+		/* calculate which stripe this data is located on */
+ rot += i;
+ stripe_index = rot % map->num_stripes;
+ if (stripe_index == num)
+ return 0;
+ if (stripe_index < num)
+ j++;
+ }
+ *offset = last_offset + j * map->stripe_len;
+ return 1;
+}
+
+static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+ struct map_lookup *map,
+ struct btrfs_device *scrub_dev,
+ int num, u64 base, u64 length,
+ int is_dev_replace)
+{
+ struct btrfs_path *path;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_extent_item *extent;
+ struct blk_plug plug;
+ u64 flags;
+ int ret;
+ int slot;
+ u64 nstripes;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ u64 physical;
+ u64 logical;
+ u64 logic_end;
+ u64 physical_end;
+ u64 generation;
+ int mirror_num;
+ struct reada_control *reada1;
+ struct reada_control *reada2;
+ struct btrfs_key key_start;
+ struct btrfs_key key_end;
+ u64 increment = map->stripe_len;
+ u64 offset;
+ u64 extent_logical;
+ u64 extent_physical;
+ u64 extent_len;
+ struct btrfs_device *extent_dev;
+ int extent_mirror_num;
+ int stop_loop = 0;
+
+ nstripes = length;
+ physical = map->stripes[num].physical;
+ offset = 0;
+ do_div(nstripes, map->stripe_len);
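+	/*
+	 * 'offset' is the logical distance from the start of the chunk to the
+	 * first stripe that is stored on this device, 'increment' is the
+	 * logical distance between two consecutive stripes on this device.
+	 */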
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ offset = map->stripe_len * num;
+ increment = map->stripe_len * map->num_stripes;
+ mirror_num = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ int factor = map->num_stripes / map->sub_stripes;
+ offset = map->stripe_len * (num / map->sub_stripes);
+ increment = map->stripe_len * factor;
+ mirror_num = num % map->sub_stripes + 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ increment = map->stripe_len;
+ mirror_num = num % map->num_stripes + 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+ increment = map->stripe_len;
+ mirror_num = num % map->num_stripes + 1;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ get_raid56_logic_offset(physical, num, map, &offset);
+ increment = map->stripe_len * nr_data_stripes(map);
+ mirror_num = 1;
+ } else {
+ increment = map->stripe_len;
+ mirror_num = 1;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * work on commit root. The related disk blocks are static as
+	 * long as COW is applied. This means it is safe to rewrite
+ * them to repair disk errors without any race conditions
+ */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ /*
+	 * trigger the readahead for the extent tree and csum tree and wait
+	 * for completion. During readahead, the scrub is officially paused
+	 * so that it does not hold off transaction commits
+ */
+ logical = base + offset;
+ physical_end = physical + nstripes * map->stripe_len;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ get_raid56_logic_offset(physical_end, num,
+ map, &logic_end);
+ logic_end += base;
+ } else {
+ logic_end = logical + increment * nstripes;
+ }
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ scrub_blocked_if_needed(fs_info);
+
+ /* FIXME it might be better to start readahead at commit root */
+ key_start.objectid = logical;
+ key_start.type = BTRFS_EXTENT_ITEM_KEY;
+ key_start.offset = (u64)0;
+ key_end.objectid = logic_end;
+ key_end.type = BTRFS_METADATA_ITEM_KEY;
+ key_end.offset = (u64)-1;
+ reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+ key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_start.type = BTRFS_EXTENT_CSUM_KEY;
+ key_start.offset = logical;
+ key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_end.type = BTRFS_EXTENT_CSUM_KEY;
+ key_end.offset = logic_end;
+ reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+ if (!IS_ERR(reada1))
+ btrfs_reada_wait(reada1);
+ if (!IS_ERR(reada2))
+ btrfs_reada_wait(reada2);
+
+
+ /*
+ * collect all data csums for the stripe to avoid seeking during
+	 * the scrub. This might currently (crc32) end up being about 1MB
+ */
+ blk_start_plug(&plug);
+
+ /*
+ * now find all extents for each stripe and scrub them
+ */
+ ret = 0;
+ while (physical < physical_end) {
+ /* for raid56, we skip parity stripe */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ ret = get_raid56_logic_offset(physical, num,
+ map, &logical);
+ logical += base;
+ if (ret)
+ goto skip;
+ }
+ /*
+ * canceled?
+ */
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req)) {
+ ret = -ECANCELED;
+ goto out;
+ }
+ /*
+ * check to see if we have to pause
+ */
+ if (atomic_read(&fs_info->scrub_pause_req)) {
+ /* push queued extents */
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ scrub_blocked_if_needed(fs_info);
+ }
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = logical;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = btrfs_previous_extent_item(root, path, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ /* there's no smaller item, so stick with the
+ * larger one */
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &key,
+ path, 0, 0);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ stop_loop = 0;
+ while (1) {
+ u64 bytes;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ stop_loop = 1;
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ bytes = root->leafsize;
+ else
+ bytes = key.offset;
+
+ if (key.objectid + bytes <= logical)
+ goto next;
+
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
+ if (key.objectid >= logical + map->stripe_len) {
+ /* out of this device extent */
+ if (key.objectid >= logic_end)
+ stop_loop = 1;
+ break;
+ }
+
+ extent = btrfs_item_ptr(l, slot,
+ struct btrfs_extent_item);
+ flags = btrfs_extent_flags(l, extent);
+ generation = btrfs_extent_generation(l, extent);
+
+ if (key.objectid < logical &&
+ (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning "
+ "stripes, ignored. logical=%llu",
+ key.objectid, logical);
+ goto next;
+ }
+
+again:
+ extent_logical = key.objectid;
+ extent_len = bytes;
+
+ /*
+ * trim extent to this stripe
+ */
+ if (extent_logical < logical) {
+ extent_len -= logical - extent_logical;
+ extent_logical = logical;
+ }
+ if (extent_logical + extent_len >
+ logical + map->stripe_len) {
+ extent_len = logical + map->stripe_len -
+ extent_logical;
+ }
+
+ extent_physical = extent_logical - logical + physical;
+ extent_dev = scrub_dev;
+ extent_mirror_num = mirror_num;
+ if (is_dev_replace)
+ scrub_remap_extent(fs_info, extent_logical,
+ extent_len, &extent_physical,
+ &extent_dev,
+ &extent_mirror_num);
+
+ ret = btrfs_lookup_csums_range(csum_root, logical,
+ logical + map->stripe_len - 1,
+ &sctx->csum_list, 1);
+ if (ret)
+ goto out;
+
+ ret = scrub_extent(sctx, extent_logical, extent_len,
+ extent_physical, extent_dev, flags,
+ generation, extent_mirror_num,
+ extent_logical - logical + physical);
+ if (ret)
+ goto out;
+
+ scrub_free_csums(sctx);
+ if (extent_logical + extent_len <
+ key.objectid + bytes) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ /*
+					 * loop until we find the next data stripe
+ * or we have finished all stripes.
+ */
+ do {
+ physical += map->stripe_len;
+ ret = get_raid56_logic_offset(
+ physical, num,
+ map, &logical);
+ logical += base;
+ } while (physical < physical_end && ret);
+ } else {
+ physical += map->stripe_len;
+ logical += increment;
+ }
+ if (logical < key.objectid + bytes) {
+ cond_resched();
+ goto again;
+ }
+
+ if (physical >= physical_end) {
+ stop_loop = 1;
+ break;
+ }
+ }
+next:
+ path->slots[0]++;
+ }
+ btrfs_release_path(path);
+skip:
+ logical += increment;
+ physical += map->stripe_len;
+ spin_lock(&sctx->stat_lock);
+ if (stop_loop)
+ sctx->stat.last_physical = map->stripes[num].physical +
+ length;
+ else
+ sctx->stat.last_physical = physical;
+ spin_unlock(&sctx->stat_lock);
+ if (stop_loop)
+ break;
+ }
+out:
+ /* push queued extents */
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+ blk_finish_plug(&plug);
+ btrfs_free_path(path);
+ return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 length,
+ u64 dev_offset, int is_dev_replace)
+{
+ struct btrfs_mapping_tree *map_tree =
+ &sctx->dev_root->fs_info->mapping_tree;
+ struct map_lookup *map;
+ struct extent_map *em;
+ int i;
+ int ret = 0;
+
+ read_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+ read_unlock(&map_tree->map_tree.lock);
+
+ if (!em)
+ return -EINVAL;
+
+ map = (struct map_lookup *)em->bdev;
+ if (em->start != chunk_offset)
+ goto out;
+
+ if (em->len < length)
+ goto out;
+
+ for (i = 0; i < map->num_stripes; ++i) {
+ if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
+ map->stripes[i].physical == dev_offset) {
+ ret = scrub_stripe(sctx, map, scrub_dev, i,
+ chunk_offset, length,
+ is_dev_replace);
+ if (ret)
+ goto out;
+ }
+ }
+out:
+ free_extent_map(em);
+
+ return ret;
+}
+
+static noinline_for_stack
+int scrub_enumerate_chunks(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev, u64 start, u64 end,
+ int is_dev_replace)
+{
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_path *path;
+ struct btrfs_root *root = sctx->dev_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 length;
+ u64 chunk_tree;
+ u64 chunk_objectid;
+ u64 chunk_offset;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 2;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ key.objectid = scrub_dev->devid;
+ key.offset = 0ull;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
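+	/*
+	 * walk all DEV_EXTENT items of the device under scrub and scrub the
+	 * chunk that each of them belongs to
+	 */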
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ }
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.objectid != scrub_dev->devid)
+ break;
+
+ if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
+ break;
+
+ if (found_key.offset >= end)
+ break;
+
+ if (found_key.offset < key.offset)
+ break;
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ length = btrfs_dev_extent_length(l, dev_extent);
+
+ if (found_key.offset + length <= start)
+ goto skip;
+
+ chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+ chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+ chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+
+ /*
+ * get a reference on the corresponding block group to prevent
+ * the chunk from going away while we scrub it
+ */
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+
+ /* some chunks are removed but not committed to disk yet,
+ * continue scrubbing */
+ if (!cache)
+ goto skip;
+
+ dev_replace->cursor_right = found_key.offset + length;
+ dev_replace->cursor_left = found_key.offset;
+ dev_replace->item_needs_writeback = 1;
+ ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
+ chunk_offset, length, found_key.offset,
+ is_dev_replace);
+
+ /*
+ * flush, submit all pending read and write bios, afterwards
+ * wait for them.
+ * Note that in the dev replace case, a read request causes
+ * write requests that are submitted in the read completion
+ * worker. Therefore in the current situation, it is required
+ * that all write requests are flushed, so that all read and
+ * write requests are really completed when bios_in_flight
+ * changes to 0.
+ */
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ /*
+		 * this must be done before we decrease @scrubs_paused.
+		 * make sure we don't block transaction commit while
+		 * we are waiting for pending workers to finish.
+ */
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->workers_pending) == 0);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+
+ mutex_lock(&fs_info->scrub_lock);
+ __scrub_blocked_if_needed(fs_info);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ btrfs_put_block_group(cache);
+ if (ret)
+ break;
+ if (is_dev_replace &&
+ atomic64_read(&dev_replace->num_write_errors) > 0) {
+ ret = -EIO;
+ break;
+ }
+ if (sctx->stat.malloc_errors > 0) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ dev_replace->cursor_left = dev_replace->cursor_right;
+ dev_replace->item_needs_writeback = 1;
+skip:
+ key.offset = found_key.offset + length;
+ btrfs_release_path(path);
+ }
+
+ btrfs_free_path(path);
+
+ /*
+ * ret can still be 1 from search_slot or next_leaf,
+ * that's not an error
+ */
+ return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev)
+{
+ int i;
+ u64 bytenr;
+ u64 gen;
+ int ret;
+ struct btrfs_root *root = sctx->dev_root;
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ return -EIO;
+
+ gen = root->fs_info->last_trans_committed;
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
+ break;
+
+ ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+ NULL, 1, bytenr);
+ if (ret)
+ return ret;
+ }
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+
+ return 0;
+}
+
+/*
+ * get a reference on fs_info->scrub_workers. start the workers if necessary
+ */
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
+ int is_dev_replace)
+{
+ int ret = 0;
+ int flags = WQ_FREEZABLE | WQ_UNBOUND;
+ int max_active = fs_info->thread_pool_size;
+
+ if (fs_info->scrub_workers_refcnt == 0) {
+ if (is_dev_replace)
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ 1, 4);
+ else
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ max_active, 4);
+ if (!fs_info->scrub_workers) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ fs_info->scrub_wr_completion_workers =
+ btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ max_active, 2);
+ if (!fs_info->scrub_wr_completion_workers) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ fs_info->scrub_nocow_workers =
+ btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ if (!fs_info->scrub_nocow_workers) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ ++fs_info->scrub_workers_refcnt;
+out:
+ return ret;
+}
+
+static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+ if (--fs_info->scrub_workers_refcnt == 0) {
+ btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+ }
+ WARN_ON(fs_info->scrub_workers_refcnt < 0);
+}
+
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace)
+{
+ struct scrub_ctx *sctx;
+ int ret;
+ struct btrfs_device *dev;
+
+ if (btrfs_fs_closing(fs_info))
+ return -EINVAL;
+
+ /*
+ * check some assumptions
+ */
+ if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
+ btrfs_err(fs_info,
+ "scrub: size assumption nodesize == leafsize (%d == %d) fails",
+ fs_info->chunk_root->nodesize,
+ fs_info->chunk_root->leafsize);
+ return -EINVAL;
+ }
+
+ if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
+ /*
+		 * the way scrub is implemented, it is unable to calculate
+		 * the checksum in this case. Do not handle this situation
+		 * at all because it won't ever happen.
+ */
+ btrfs_err(fs_info,
+ "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
+ fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
+ return -EINVAL;
+ }
+
+ if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
+ /* not supported for data w/o checksums */
+ btrfs_err(fs_info,
+ "scrub: size assumption sectorsize != PAGE_SIZE "
+ "(%d != %lu) fails",
+ fs_info->chunk_root->sectorsize, PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ if (fs_info->chunk_root->nodesize >
+ PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
+ fs_info->chunk_root->sectorsize >
+ PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+ /*
+ * would exhaust the array bounds of pagev member in
+ * struct scrub_block
+ */
+ btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
+ "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
+ fs_info->chunk_root->nodesize,
+ SCRUB_MAX_PAGES_PER_BLOCK,
+ fs_info->chunk_root->sectorsize,
+ SCRUB_MAX_PAGES_PER_BLOCK);
+ return -EINVAL;
+ }
+
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ if (!dev || (dev->missing && !is_dev_replace)) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return -ENODEV;
+ }
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return -EIO;
+ }
+
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ if (dev->scrub_device ||
+ (!is_dev_replace &&
+ btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return -EINPROGRESS;
+ }
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return ret;
+ }
+
+ sctx = scrub_setup_ctx(dev, is_dev_replace);
+ if (IS_ERR(sctx)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ scrub_workers_put(fs_info);
+ return PTR_ERR(sctx);
+ }
+ sctx->readonly = readonly;
+ dev->scrub_device = sctx;
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ /*
+	 * by checking @scrub_pause_req here, we can avoid a
+	 * race between committing a transaction and scrubbing.
+ */
+ __scrub_blocked_if_needed(fs_info);
+ atomic_inc(&fs_info->scrubs_running);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ if (!is_dev_replace) {
+ /*
+ * by holding device list mutex, we can
+ * kick off writing super in log tree sync.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ ret = scrub_supers(sctx, dev);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ }
+
+ if (!ret)
+ ret = scrub_enumerate_chunks(sctx, dev, start, end,
+ is_dev_replace);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+ atomic_dec(&fs_info->scrubs_running);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
+
+ if (progress)
+ memcpy(progress, &sctx->stat, sizeof(*progress));
+
+ mutex_lock(&fs_info->scrub_lock);
+ dev->scrub_device = NULL;
+ scrub_workers_put(fs_info);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ scrub_free_ctx(sctx);
+
+ return ret;
+}
+
+void btrfs_scrub_pause(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_inc(&fs_info->scrub_pause_req);
+ while (atomic_read(&fs_info->scrubs_paused) !=
+ atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrubs_paused) ==
+ atomic_read(&fs_info->scrubs_running));
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ mutex_unlock(&fs_info->scrub_lock);
+}
+
+void btrfs_scrub_continue(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ atomic_dec(&fs_info->scrub_pause_req);
+ wake_up(&fs_info->scrub_pause_wait);
+}
+
+int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->scrub_lock);
+ if (!atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->scrub_cancel_req);
+ while (atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrubs_running) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ atomic_dec(&fs_info->scrub_cancel_req);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev)
+{
+ struct scrub_ctx *sctx;
+
+ mutex_lock(&fs_info->scrub_lock);
+ sctx = dev->scrub_device;
+ if (!sctx) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -ENOTCONN;
+ }
+ atomic_inc(&sctx->cancel_req);
+ while (dev->scrub_device) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ dev->scrub_device == NULL);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+ struct btrfs_scrub_progress *progress)
+{
+ struct btrfs_device *dev;
+ struct scrub_ctx *sctx = NULL;
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
+ if (dev)
+ sctx = dev->scrub_device;
+ if (sctx)
+ memcpy(progress, &sctx->stat, sizeof(*progress));
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
+}
+
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u64 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num)
+{
+ u64 mapped_length;
+ struct btrfs_bio *bbio = NULL;
+ int ret;
+
+ mapped_length = extent_len;
+ ret = btrfs_map_block(fs_info, READ, extent_logical,
+ &mapped_length, &bbio, 0);
+ if (ret || !bbio || mapped_length < extent_len ||
+ !bbio->stripes[0].dev->bdev) {
+ kfree(bbio);
+ return;
+ }
+
+ *extent_physical = bbio->stripes[0].physical;
+ *extent_mirror_num = bbio->mirror_num;
+ *extent_dev = bbio->stripes[0].dev;
+ kfree(bbio);
+}
+
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+ struct scrub_wr_ctx *wr_ctx,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev,
+ int is_dev_replace)
+{
+ WARN_ON(wr_ctx->wr_curr_bio != NULL);
+
+ mutex_init(&wr_ctx->wr_lock);
+ wr_ctx->wr_curr_bio = NULL;
+ if (!is_dev_replace)
+ return 0;
+
+ WARN_ON(!dev->bdev);
+ wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
+ bio_get_nr_vecs(dev->bdev));
+ wr_ctx->tgtdev = dev;
+ atomic_set(&wr_ctx->flush_all_writes, 0);
+ return 0;
+}
+
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
+{
+ mutex_lock(&wr_ctx->wr_lock);
+ kfree(wr_ctx->wr_curr_bio);
+ wr_ctx->wr_curr_bio = NULL;
+ mutex_unlock(&wr_ctx->wr_lock);
+}
+
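+/*
+ * Copy the given range to the dev replace target device by reading the pages
+ * through the page cache in a worker; used for data without checksum in dev
+ * replace mode.
+ */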
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ int mirror_num, u64 physical_for_dev_replace)
+{
+ struct scrub_copy_nocow_ctx *nocow_ctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+ nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
+ if (!nocow_ctx) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ scrub_pending_trans_workers_inc(sctx);
+
+ nocow_ctx->sctx = sctx;
+ nocow_ctx->logical = logical;
+ nocow_ctx->len = len;
+ nocow_ctx->mirror_num = mirror_num;
+ nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
+ btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
+ INIT_LIST_HEAD(&nocow_ctx->inodes);
+ btrfs_queue_work(fs_info->scrub_nocow_workers,
+ &nocow_ctx->work);
+
+ return 0;
+}
+
+static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+ struct scrub_nocow_inode *nocow_inode;
+
+ nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
+ if (!nocow_inode)
+ return -ENOMEM;
+ nocow_inode->inum = inum;
+ nocow_inode->offset = offset;
+ nocow_inode->root = root;
+ list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
+ return 0;
+}
+
+#define COPY_COMPLETE 1
+
+static void copy_nocow_pages_worker(struct btrfs_work *work)
+{
+ struct scrub_copy_nocow_ctx *nocow_ctx =
+ container_of(work, struct scrub_copy_nocow_ctx, work);
+ struct scrub_ctx *sctx = nocow_ctx->sctx;
+ u64 logical = nocow_ctx->logical;
+ u64 len = nocow_ctx->len;
+ int mirror_num = nocow_ctx->mirror_num;
+ u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+ int ret;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ int not_written = 0;
+
+ fs_info = sctx->dev_root->fs_info;
+ root = fs_info->extent_root;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ not_written = 1;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ not_written = 1;
+ goto out;
+ }
+
+ ret = iterate_inodes_from_logical(logical, fs_info, path,
+ record_inode_for_nocow, nocow_ctx);
+ if (ret != 0 && ret != -ENOENT) {
+ btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
+ "phys %llu, len %llu, mir %u, ret %d",
+ logical, physical_for_dev_replace, len, mirror_num,
+ ret);
+ not_written = 1;
+ goto out;
+ }
+
+ btrfs_end_transaction(trans, root);
+ trans = NULL;
+ while (!list_empty(&nocow_ctx->inodes)) {
+ struct scrub_nocow_inode *entry;
+ entry = list_first_entry(&nocow_ctx->inodes,
+ struct scrub_nocow_inode,
+ list);
+ list_del_init(&entry->list);
+ ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
+ entry->root, nocow_ctx);
+ kfree(entry);
+ if (ret == COPY_COMPLETE) {
+ ret = 0;
+ break;
+ } else if (ret) {
+ break;
+ }
+ }
+out:
+ while (!list_empty(&nocow_ctx->inodes)) {
+ struct scrub_nocow_inode *entry;
+ entry = list_first_entry(&nocow_ctx->inodes,
+ struct scrub_nocow_inode,
+ list);
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans, root);
+ if (not_written)
+ btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
+ num_uncorrectable_read_errors);
+
+ btrfs_free_path(path);
+ kfree(nocow_ctx);
+
+ scrub_pending_trans_workers_dec(sctx);
+}
+
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+ struct scrub_copy_nocow_ctx *nocow_ctx)
+{
+ struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+ struct btrfs_key key;
+ struct inode *inode;
+ struct page *page;
+ struct btrfs_root *local_root;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_map *em;
+ struct extent_state *cached_state = NULL;
+ struct extent_io_tree *io_tree;
+ u64 physical_for_dev_replace;
+ u64 len = nocow_ctx->len;
+ u64 lockstart = offset, lockend = offset + len - 1;
+ unsigned long index;
+ int srcu_index;
+ int ret = 0;
+ int err = 0;
+
+ key.objectid = root;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(local_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ return PTR_ERR(local_root);
+ }
+
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.objectid = inum;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ /* Avoid truncate/dio/punch hole.. */
+ mutex_lock(&inode->i_mutex);
+ inode_dio_wait(inode);
+
+ physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+ lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ goto out_unlock;
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock;
+ }
+
+ /*
+ * This extent does not actually cover the logical extent anymore,
+ * move on to the next inode.
+ */
+ if (em->block_start > nocow_ctx->logical ||
+ em->block_start + em->block_len < nocow_ctx->logical + len) {
+ free_extent_map(em);
+ goto out_unlock;
+ }
+ free_extent_map(em);
+
+ while (len >= PAGE_CACHE_SIZE) {
+ index = offset >> PAGE_CACHE_SHIFT;
+again:
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ btrfs_err(fs_info, "find_or_create_page() failed");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (PageUptodate(page)) {
+ if (PageDirty(page))
+ goto next_page;
+ } else {
+ ClearPageError(page);
+ err = extent_read_full_page_nolock(io_tree, page,
+ btrfs_get_extent,
+ nocow_ctx->mirror_num);
+ if (err) {
+ ret = err;
+ goto next_page;
+ }
+
+ lock_page(page);
+ /*
+			 * If the page has been removed from the page cache,
+			 * the data on it is meaningless, because it may be
+			 * stale; the new data may have been written into a
+			 * new page in the page cache.
+ */
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto next_page;
+ }
+ }
+ err = write_page_nocow(nocow_ctx->sctx,
+ physical_for_dev_replace, page);
+ if (err)
+ ret = err;
+next_page:
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (ret)
+ break;
+
+ offset += PAGE_CACHE_SIZE;
+ physical_for_dev_replace += PAGE_CACHE_SIZE;
+ len -= PAGE_CACHE_SIZE;
+ }
+ ret = COPY_COMPLETE;
+out_unlock:
+ unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+ GFP_NOFS);
+out:
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
+ return ret;
+}
+
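+/*
+ * Write a single page synchronously to the given physical offset on the
+ * dev replace target device.
+ */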
+static int write_page_nocow(struct scrub_ctx *sctx,
+ u64 physical_for_dev_replace, struct page *page)
+{
+ struct bio *bio;
+ struct btrfs_device *dev;
+ int ret;
+
+ dev = sctx->wr_ctx.tgtdev;
+ if (!dev)
+ return -EIO;
+ if (!dev->bdev) {
+ printk_ratelimited(KERN_WARNING
+ "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+ return -EIO;
+ }
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
+ bio->bi_bdev = dev->bdev;
+ ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ if (ret != PAGE_CACHE_SIZE) {
+leave_with_eio:
+ bio_put(bio);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ return -EIO;
+ }
+
+ if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
+ goto leave_with_eio;
+
+ bio_put(bio);
+ return 0;
+}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
new file mode 100644
index 00000000000..6528aa66218
--- /dev/null
+++ b/fs/btrfs/send.c
@@ -0,0 +1,5791 @@
+/*
+ * Copyright (C) 2012 Alexander Block. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/bsearch.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/sort.h>
+#include <linux/mount.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/radix-tree.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+
+#include "send.h"
+#include "backref.h"
+#include "hash.h"
+#include "locking.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+
+static int g_verbose = 0;
+
+#define verbose_printk(...) if (g_verbose) printk(__VA_ARGS__)
+
+/*
+ * A fs_path is a helper to dynamically build path names with unknown size.
+ * It reallocates the internal buffer on demand.
+ * It allows fast adding of path elements on the right side (normal path) and
+ * fast adding to the left side (reversed path). A reversed path can also be
+ * unreversed if needed.
+ */
+struct fs_path {
+ union {
+ struct {
+ char *start;
+ char *end;
+
+ char *buf;
+ unsigned short buf_len:15;
+ unsigned short reversed:1;
+ char inline_buf[];
+ };
+ /*
+		 * Average path length does not exceed 200 bytes, so we'll have
+		 * better packing in the slab and a higher chance of satisfying
+		 * an allocation later during send.
+ */
+ char pad[256];
+ };
+};
+#define FS_PATH_INLINE_SIZE \
+ (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
+
+
+/* reused for each extent */
+struct clone_root {
+ struct btrfs_root *root;
+ u64 ino;
+ u64 offset;
+
+ u64 found_refs;
+};
+
+#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
+#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+
+struct send_ctx {
+ struct file *send_filp;
+ loff_t send_off;
+ char *send_buf;
+ u32 send_size;
+ u32 send_max_size;
+ u64 total_send_size;
+ u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+ u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
+
+ struct btrfs_root *send_root;
+ struct btrfs_root *parent_root;
+ struct clone_root *clone_roots;
+ int clone_roots_cnt;
+
+ /* current state of the compare_tree call */
+ struct btrfs_path *left_path;
+ struct btrfs_path *right_path;
+ struct btrfs_key *cmp_key;
+
+ /*
+	 * info about the currently processed inode. In case of deleted inodes,
+ * these are the values from the deleted inode.
+ */
+ u64 cur_ino;
+ u64 cur_inode_gen;
+ int cur_inode_new;
+ int cur_inode_new_gen;
+ int cur_inode_deleted;
+ u64 cur_inode_size;
+ u64 cur_inode_mode;
+ u64 cur_inode_rdev;
+ u64 cur_inode_last_extent;
+
+ u64 send_progress;
+
+ struct list_head new_refs;
+ struct list_head deleted_refs;
+
+ struct radix_tree_root name_cache;
+ struct list_head name_cache_list;
+ int name_cache_size;
+
+ struct file_ra_state ra;
+
+ char *read_buf;
+
+ /*
+	 * We process inodes in order of increasing inode number, so if before an
+ * incremental send we reverse the parent/child relationship of
+ * directories such that a directory with a lower inode number was
+ * the parent of a directory with a higher inode number, and the one
+ * becoming the new parent got renamed too, we can't rename/move the
+ * directory with lower inode number when we finish processing it - we
+ * must process the directory with higher inode number first, then
+ * rename/move it and then rename/move the directory with lower inode
+ * number. Example follows.
+ *
+ * Tree state when the first send was performed:
+ *
+ * .
+ * |-- a (ino 257)
+ * |-- b (ino 258)
+ * |
+ * |
+ * |-- c (ino 259)
+ * | |-- d (ino 260)
+ * |
+ * |-- c2 (ino 261)
+ *
+ * Tree state when the second (incremental) send is performed:
+ *
+ * .
+ * |-- a (ino 257)
+ * |-- b (ino 258)
+ * |-- c2 (ino 261)
+ * |-- d2 (ino 260)
+ * |-- cc (ino 259)
+ *
+ * The sequence of steps that lead to the second state was:
+ *
+ * mv /a/b/c/d /a/b/c2/d2
+ * mv /a/b/c /a/b/c2/d2/cc
+ *
+ * "c" has lower inode number, but we can't move it (2nd mv operation)
+ * before we move "d", which has higher inode number.
+ *
+ * So we just memorize which move/rename operations must be performed
+ * later when their respective parent is processed and moved/renamed.
+ */
+
+ /* Indexed by parent directory inode number. */
+ struct rb_root pending_dir_moves;
+
+ /*
+ * Reverse index, indexed by the inode number of a directory that
+ * is waiting for the move/rename of its immediate parent before its
+ * own move/rename can be performed.
+ */
+ struct rb_root waiting_dir_moves;
+
+ /*
+ * A directory that is going to be rm'ed might have a child directory
+ * which is in the pending directory moves index above. In this case,
+ * the directory can only be removed after the move/rename of its child
+ * is performed. Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- c/ (ino 259)
+ * | |-- x/ (ino 260)
+ * |
+ * |-- y/ (ino 261)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- YY/ (ino 261)
+ * |-- x/ (ino 260)
+ *
+ * Sequence of steps that lead to the send snapshot:
+ * rm -f /a/b/c/foo.txt
+ * mv /a/b/y /a/b/YY
+ * mv /a/b/c/x /a/b/YY
+ * rmdir /a/b/c
+ *
+ * When the child is processed, its move/rename is delayed until its
+ * parent is processed (as explained above), but all other operations
+ * like update utimes, chown, chgrp, etc, are performed and the paths
+ * that it uses for those operations must use the orphanized name of
+ * its parent (the directory we're going to rm later), so we need to
+ * memorize that name.
+ *
+ * Indexed by the inode number of the directory to be deleted.
+ */
+ struct rb_root orphan_dirs;
+};
+
+struct pending_dir_move {
+ struct rb_node node;
+ struct list_head list;
+ u64 parent_ino;
+ u64 ino;
+ u64 gen;
+ struct list_head update_refs;
+};
+
+struct waiting_dir_move {
+ struct rb_node node;
+ u64 ino;
+ /*
+ * There might be some directory that could not be removed because it
+ * was waiting for this directory inode to be moved first. Therefore
+	 * after this directory is moved, we can try to rmdir the inode rmdir_ino.
+ */
+ u64 rmdir_ino;
+};
+
+struct orphan_dir_info {
+ struct rb_node node;
+ u64 ino;
+ u64 gen;
+};
+
+struct name_cache_entry {
+ struct list_head list;
+ /*
+ * radix_tree has only 32bit entries but we need to handle 64bit inums.
+ * We use the lower 32bit of the 64bit inum to store it in the tree. If
+	 * more than one inum would fall into the same entry, we use radix_list
+	 * to store the additional entries. radix_list is also used to store
+	 * entries that have the same inum but different
+	 * generations.
+ */
+ struct list_head radix_list;
+ u64 ino;
+ u64 gen;
+ u64 parent_ino;
+ u64 parent_gen;
+ int ret;
+ int need_later_update;
+ int name_len;
+ char name[];
+};
+
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
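+/*
+ * Holes only need to be sent for regular files during an incremental send,
+ * and only if the inode already existed in the parent snapshot and was
+ * neither recreated nor deleted.
+ */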
+static int need_send_hole(struct send_ctx *sctx)
+{
+ return (sctx->parent_root && !sctx->cur_inode_new &&
+ !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
+ S_ISREG(sctx->cur_inode_mode));
+}
+
+static void fs_path_reset(struct fs_path *p)
+{
+ if (p->reversed) {
+ p->start = p->buf + p->buf_len - 1;
+ p->end = p->start;
+ *p->start = 0;
+ } else {
+ p->start = p->buf;
+ p->end = p->start;
+ *p->start = 0;
+ }
+}
+
+static struct fs_path *fs_path_alloc(void)
+{
+ struct fs_path *p;
+
+ p = kmalloc(sizeof(*p), GFP_NOFS);
+ if (!p)
+ return NULL;
+ p->reversed = 0;
+ p->buf = p->inline_buf;
+ p->buf_len = FS_PATH_INLINE_SIZE;
+ fs_path_reset(p);
+ return p;
+}
+
+static struct fs_path *fs_path_alloc_reversed(void)
+{
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return NULL;
+ p->reversed = 1;
+ fs_path_reset(p);
+ return p;
+}
+
+static void fs_path_free(struct fs_path *p)
+{
+ if (!p)
+ return;
+ if (p->buf != p->inline_buf)
+ kfree(p->buf);
+ kfree(p);
+}
+
+static int fs_path_len(struct fs_path *p)
+{
+ return p->end - p->start;
+}
+
+static int fs_path_ensure_buf(struct fs_path *p, int len)
+{
+ char *tmp_buf;
+ int path_len;
+ int old_buf_len;
+
+ len++;
+
+ if (p->buf_len >= len)
+ return 0;
+
+ if (len > PATH_MAX) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
+ path_len = p->end - p->start;
+ old_buf_len = p->buf_len;
+
+ /*
+ * First time the inline_buf does not suffice
+ */
+ if (p->buf == p->inline_buf) {
+ tmp_buf = kmalloc(len, GFP_NOFS);
+ if (tmp_buf)
+ memcpy(tmp_buf, p->buf, old_buf_len);
+ } else {
+ tmp_buf = krealloc(p->buf, len, GFP_NOFS);
+ }
+ if (!tmp_buf)
+ return -ENOMEM;
+ p->buf = tmp_buf;
+ /*
+	 * The real size of the buffer is bigger; this will let the fast path
+ * happen most of the time
+ */
+ p->buf_len = ksize(p->buf);
+
+ if (p->reversed) {
+ tmp_buf = p->buf + old_buf_len - path_len - 1;
+ p->end = p->buf + p->buf_len - 1;
+ p->start = p->end - path_len;
+ memmove(p->start, tmp_buf, path_len + 1);
+ } else {
+ p->start = p->buf;
+ p->end = p->start + path_len;
+ }
+ return 0;
+}
+
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+ char **prepared)
+{
+ int ret;
+ int new_len;
+
+ new_len = p->end - p->start + name_len;
+ if (p->start != p->end)
+ new_len++;
+ ret = fs_path_ensure_buf(p, new_len);
+ if (ret < 0)
+ goto out;
+
+ if (p->reversed) {
+ if (p->start != p->end)
+ *--p->start = '/';
+ p->start -= name_len;
+ *prepared = p->start;
+ } else {
+ if (p->start != p->end)
+ *p->end++ = '/';
+ *prepared = p->end;
+ p->end += name_len;
+ *p->end = 0;
+ }
+
+out:
+ return ret;
+}
+
+static int fs_path_add(struct fs_path *p, const char *name, int name_len)
+{
+ int ret;
+ char *prepared;
+
+ ret = fs_path_prepare_for_add(p, name_len, &prepared);
+ if (ret < 0)
+ goto out;
+ memcpy(prepared, name, name_len);
+
+out:
+ return ret;
+}
+
+static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
+{
+ int ret;
+ char *prepared;
+
+ ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
+ if (ret < 0)
+ goto out;
+ memcpy(prepared, p2->start, p2->end - p2->start);
+
+out:
+ return ret;
+}
+
+static int fs_path_add_from_extent_buffer(struct fs_path *p,
+ struct extent_buffer *eb,
+ unsigned long off, int len)
+{
+ int ret;
+ char *prepared;
+
+ ret = fs_path_prepare_for_add(p, len, &prepared);
+ if (ret < 0)
+ goto out;
+
+ read_extent_buffer(eb, prepared, off, len);
+
+out:
+ return ret;
+}
+
+static int fs_path_copy(struct fs_path *p, struct fs_path *from)
+{
+ int ret;
+
+ p->reversed = from->reversed;
+ fs_path_reset(p);
+
+ ret = fs_path_add_path(p, from);
+
+ return ret;
+}
+
+
+static void fs_path_unreverse(struct fs_path *p)
+{
+ char *tmp;
+ int len;
+
+ if (!p->reversed)
+ return;
+
+ tmp = p->start;
+ len = p->end - p->start;
+ p->start = p->buf;
+ p->end = p->start + len;
+ memmove(p->start, tmp, len + 1);
+ p->reversed = 0;
+}
+
+static struct btrfs_path *alloc_path_for_send(void)
+{
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return NULL;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ path->need_commit_sem = 1;
+ return path;
+}
+
+static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
+{
+ int ret;
+ mm_segment_t old_fs;
+ u32 pos = 0;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+
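+	/* vfs_write() may write less than requested, loop until all is written */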
+ while (pos < len) {
+ ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
+ /* TODO handle that correctly */
+ /*if (ret == -ERESTARTSYS) {
+ continue;
+ }*/
+ if (ret < 0)
+ goto out;
+ if (ret == 0) {
+ ret = -EIO;
+ goto out;
+ }
+ pos += ret;
+ }
+
+ ret = 0;
+
+out:
+ set_fs(old_fs);
+ return ret;
+}
+
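+/*
+ * Append one TLV (type, length, value) encoded attribute to the command that
+ * is currently being built up in sctx->send_buf.
+ */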
+static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
+{
+ struct btrfs_tlv_header *hdr;
+ int total_len = sizeof(*hdr) + len;
+ int left = sctx->send_max_size - sctx->send_size;
+
+ if (unlikely(left < total_len))
+ return -EOVERFLOW;
+
+ hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
+ hdr->tlv_type = cpu_to_le16(attr);
+ hdr->tlv_len = cpu_to_le16(len);
+ memcpy(hdr + 1, data, len);
+ sctx->send_size += total_len;
+
+ return 0;
+}
+
+#define TLV_PUT_DEFINE_INT(bits) \
+ static int tlv_put_u##bits(struct send_ctx *sctx, \
+ u##bits attr, u##bits value) \
+ { \
+ __le##bits __tmp = cpu_to_le##bits(value); \
+ return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
+ }
+
+TLV_PUT_DEFINE_INT(64)
+
+static int tlv_put_string(struct send_ctx *sctx, u16 attr,
+ const char *str, int len)
+{
+ if (len == -1)
+ len = strlen(str);
+ return tlv_put(sctx, attr, str, len);
+}
+
+static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
+ const u8 *uuid)
+{
+ return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
+}
+
+static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
+ struct extent_buffer *eb,
+ struct btrfs_timespec *ts)
+{
+ struct btrfs_timespec bts;
+ read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
+ return tlv_put(sctx, attr, &bts, sizeof(bts));
+}
+
+
+#define TLV_PUT(sctx, attrtype, attrlen, data) \
+ do { \
+ ret = tlv_put(sctx, attrtype, attrlen, data); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+
+#define TLV_PUT_INT(sctx, attrtype, bits, value) \
+ do { \
+ ret = tlv_put_u##bits(sctx, attrtype, value); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+
+#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
+#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
+#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
+#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
+#define TLV_PUT_STRING(sctx, attrtype, str, len) \
+ do { \
+ ret = tlv_put_string(sctx, attrtype, str, len); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+#define TLV_PUT_PATH(sctx, attrtype, p) \
+ do { \
+ ret = tlv_put_string(sctx, attrtype, p->start, \
+ p->end - p->start); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while(0)
+#define TLV_PUT_UUID(sctx, attrtype, uuid) \
+ do { \
+ ret = tlv_put_uuid(sctx, attrtype, uuid); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
+ do { \
+ ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+
+static int send_header(struct send_ctx *sctx)
+{
+ struct btrfs_stream_header hdr;
+
+ strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
+ hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
+
+ return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
+ &sctx->send_off);
+}
+
+/*
+ * For each command/item we want to send to userspace, we call this function.
+ */
+static int begin_cmd(struct send_ctx *sctx, int cmd)
+{
+ struct btrfs_cmd_header *hdr;
+
+ if (WARN_ON(!sctx->send_buf))
+ return -EINVAL;
+
+ BUG_ON(sctx->send_size);
+
+ sctx->send_size += sizeof(*hdr);
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ hdr->cmd = cpu_to_le16(cmd);
+
+ return 0;
+}
+
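+/*
+ * Finish the command that is currently held in sctx->send_buf: fill in its
+ * length and crc32c checksum and write it out to the send file.
+ */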
+static int send_cmd(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_cmd_header *hdr;
+ u32 crc;
+
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
+ hdr->crc = 0;
+
+ crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ hdr->crc = cpu_to_le32(crc);
+
+ ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+ &sctx->send_off);
+
+ sctx->total_send_size += sctx->send_size;
+ sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+ sctx->send_size = 0;
+
+ return ret;
+}
+
+/*
+ * Sends a move instruction to user space
+ */
+static int send_rename(struct send_ctx *sctx,
+ struct fs_path *from, struct fs_path *to)
+{
+ int ret;
+
+verbose_printk("btrfs: send_rename %s -> %s\n", from->start, to->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Sends a link instruction to user space
+ */
+static int send_link(struct send_ctx *sctx,
+ struct fs_path *path, struct fs_path *lnk)
+{
+ int ret;
+
+verbose_printk("btrfs: send_link %s -> %s\n", path->start, lnk->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Sends an unlink instruction to user space
+ */
+static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
+{
+ int ret;
+
+verbose_printk("btrfs: send_unlink %s\n", path->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Sends a rmdir instruction to user space
+ */
+static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
+{
+ int ret;
+
+verbose_printk("btrfs: send_rmdir %s\n", path->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Helper function to retrieve some fields from an inode item.
+ */
+static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
+ u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
+ u64 *gid, u64 *rdev)
+{
+ int ret;
+ struct btrfs_inode_item *ii;
+ struct btrfs_key key;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
+ }
+
+ ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ if (size)
+ *size = btrfs_inode_size(path->nodes[0], ii);
+ if (gen)
+ *gen = btrfs_inode_generation(path->nodes[0], ii);
+ if (mode)
+ *mode = btrfs_inode_mode(path->nodes[0], ii);
+ if (uid)
+ *uid = btrfs_inode_uid(path->nodes[0], ii);
+ if (gid)
+ *gid = btrfs_inode_gid(path->nodes[0], ii);
+ if (rdev)
+ *rdev = btrfs_inode_rdev(path->nodes[0], ii);
+
+ return ret;
+}
+
+static int get_inode_info(struct btrfs_root *root,
+ u64 ino, u64 *size, u64 *gen,
+ u64 *mode, u64 *uid, u64 *gid,
+ u64 *rdev)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+ ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
+ rdev);
+ btrfs_free_path(path);
+ return ret;
+}
+
+typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
+ struct fs_path *p,
+ void *ctx);
+
+/*
+ * Helper function to iterate the entries in ONE btrfs_inode_ref or
+ * btrfs_inode_extref.
+ * The iterate callback may return a non-zero value to stop iteration. This can
+ * be a negative value for error codes or 1 to simply stop it.
+ *
+ * path must point to the INODE_REF or INODE_EXTREF when called.
+ */
+static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *found_key, int resolve,
+ iterate_inode_ref_t iterate, void *ctx)
+{
+ struct extent_buffer *eb = path->nodes[0];
+ struct btrfs_item *item;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_inode_extref *extref;
+ struct btrfs_path *tmp_path;
+ struct fs_path *p;
+ u32 cur = 0;
+ u32 total;
+ int slot = path->slots[0];
+ u32 name_len;
+ char *start;
+ int ret = 0;
+ int num = 0;
+ int index;
+ u64 dir;
+ unsigned long name_off;
+ unsigned long elem_size;
+ unsigned long ptr;
+
+ p = fs_path_alloc_reversed();
+ if (!p)
+ return -ENOMEM;
+
+ tmp_path = alloc_path_for_send();
+ if (!tmp_path) {
+ fs_path_free(p);
+ return -ENOMEM;
+ }
+
+
+ if (found_key->type == BTRFS_INODE_REF_KEY) {
+ ptr = (unsigned long)btrfs_item_ptr(eb, slot,
+ struct btrfs_inode_ref);
+ item = btrfs_item_nr(slot);
+ total = btrfs_item_size(eb, item);
+ elem_size = sizeof(*iref);
+ } else {
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ total = btrfs_item_size_nr(eb, slot);
+ elem_size = sizeof(*extref);
+ }
+
+ while (cur < total) {
+ fs_path_reset(p);
+
+ if (found_key->type == BTRFS_INODE_REF_KEY) {
+ iref = (struct btrfs_inode_ref *)(ptr + cur);
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_off = (unsigned long)(iref + 1);
+ index = btrfs_inode_ref_index(eb, iref);
+ dir = found_key->offset;
+ } else {
+ extref = (struct btrfs_inode_extref *)(ptr + cur);
+ name_len = btrfs_inode_extref_name_len(eb, extref);
+ name_off = (unsigned long)&extref->name;
+ index = btrfs_inode_extref_index(eb, extref);
+ dir = btrfs_inode_extref_parent(eb, extref);
+ }
+
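+ /*
+ * Either resolve the ref to a full path or just copy the raw
+ * name from the extent buffer.
+ */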
+ if (resolve) {
+ start = btrfs_ref_to_path(root, tmp_path, name_len,
+ name_off, eb, dir,
+ p->buf, p->buf_len);
+ if (IS_ERR(start)) {
+ ret = PTR_ERR(start);
+ goto out;
+ }
+ if (start < p->buf) {
+ /* overflow, try again with a larger buffer */
+ ret = fs_path_ensure_buf(p,
+ p->buf_len + p->buf - start);
+ if (ret < 0)
+ goto out;
+ start = btrfs_ref_to_path(root, tmp_path,
+ name_len, name_off,
+ eb, dir,
+ p->buf, p->buf_len);
+ if (IS_ERR(start)) {
+ ret = PTR_ERR(start);
+ goto out;
+ }
+ BUG_ON(start < p->buf);
+ }
+ p->start = start;
+ } else {
+ ret = fs_path_add_from_extent_buffer(p, eb, name_off,
+ name_len);
+ if (ret < 0)
+ goto out;
+ }
+
+ cur += elem_size + name_len;
+ ret = iterate(num, dir, index, p, ctx);
+ if (ret)
+ goto out;
+ num++;
+ }
+
+out:
+ btrfs_free_path(tmp_path);
+ fs_path_free(p);
+ return ret;
+}
+
+typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx);
+
+/*
+ * Helper function to iterate the entries in ONE btrfs_dir_item.
+ * The iterate callback may return a non-zero value to stop iteration. This can
+ * be a negative value for error codes or 1 to simply stop it.
+ *
+ * path must point to the dir item when called.
+ */
+static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *found_key,
+ iterate_dir_item_t iterate, void *ctx)
+{
+ int ret = 0;
+ struct extent_buffer *eb;
+ struct btrfs_item *item;
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ char *buf = NULL;
+ int buf_len;
+ u32 name_len;
+ u32 data_len;
+ u32 cur;
+ u32 len;
+ u32 total;
+ int slot;
+ int num;
+ u8 type;
+
+ if (found_key->type == BTRFS_XATTR_ITEM_KEY)
+ buf_len = BTRFS_MAX_XATTR_SIZE(root);
+ else
+ buf_len = PATH_MAX;
+
+ buf = kmalloc(buf_len, GFP_NOFS);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item = btrfs_item_nr(slot);
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ cur = 0;
+ len = 0;
+ total = btrfs_item_size(eb, item);
+
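+ /*
+ * A single dir item or xattr item can contain more than one entry;
+ * walk them all.
+ */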
+ num = 0;
+ while (cur < total) {
+ name_len = btrfs_dir_name_len(eb, di);
+ data_len = btrfs_dir_data_len(eb, di);
+ type = btrfs_dir_type(eb, di);
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+ if (type == BTRFS_FT_XATTR) {
+ if (name_len > XATTR_NAME_MAX) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+ if (name_len + data_len > buf_len) {
+ ret = -E2BIG;
+ goto out;
+ }
+ } else {
+ /*
+ * Path too long
+ */
+ if (name_len + data_len > buf_len) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+ }
+
+ read_extent_buffer(eb, buf, (unsigned long)(di + 1),
+ name_len + data_len);
+
+ len = sizeof(*di) + name_len + data_len;
+ di = (struct btrfs_dir_item *)((char *)di + len);
+ cur += len;
+
+ ret = iterate(num, &di_key, buf, name_len, buf + name_len,
+ data_len, type, ctx);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ num++;
+ }
+
+out:
+ kfree(buf);
+ return ret;
+}
+
+static int __copy_first_ref(int num, u64 dir, int index,
+ struct fs_path *p, void *ctx)
+{
+ int ret;
+ struct fs_path *pt = ctx;
+
+ ret = fs_path_copy(pt, p);
+ if (ret < 0)
+ return ret;
+
+ /* we want the first only */
+ return 1;
+}
+
+/*
+ * Retrieve the first path of an inode. If an inode has more than one
+ * ref/hardlink, the extra refs are ignored.
+ */
+static int get_inode_path(struct btrfs_root *root,
+ u64 ino, struct fs_path *path)
+{
+ int ret;
+ struct btrfs_key key, found_key;
+ struct btrfs_path *p;
+
+ p = alloc_path_for_send();
+ if (!p)
+ return -ENOMEM;
+
+ fs_path_reset(path);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 1;
+ goto out;
+ }
+ btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
+ if (found_key.objectid != ino ||
+ (found_key.type != BTRFS_INODE_REF_KEY &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = iterate_inode_ref(root, p, &found_key, 1,
+ __copy_first_ref, path);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ btrfs_free_path(p);
+ return ret;
+}
+
+struct backref_ctx {
+ struct send_ctx *sctx;
+
+ struct btrfs_path *path;
+ /* number of total found references */
+ u64 found;
+
+ /*
+ * Used for clones found in send_root. Clones found behind cur_objectid
+ * and cur_offset are not considered allowed clones.
+ */
+ u64 cur_objectid;
+ u64 cur_offset;
+
+ /* may be truncated in case it's the last extent in a file */
+ u64 extent_len;
+
+ /* Just to check for bugs in backref resolving */
+ int found_itself;
+};
+
+static int __clone_root_cmp_bsearch(const void *key, const void *elt)
+{
+ u64 root = (u64)(uintptr_t)key;
+ struct clone_root *cr = (struct clone_root *)elt;
+
+ if (root < cr->root->objectid)
+ return -1;
+ if (root > cr->root->objectid)
+ return 1;
+ return 0;
+}
+
+static int __clone_root_cmp_sort(const void *e1, const void *e2)
+{
+ struct clone_root *cr1 = (struct clone_root *)e1;
+ struct clone_root *cr2 = (struct clone_root *)e2;
+
+ if (cr1->root->objectid < cr2->root->objectid)
+ return -1;
+ if (cr1->root->objectid > cr2->root->objectid)
+ return 1;
+ return 0;
+}
+
+/*
+ * Called for every backref that is found for the current extent.
+ * Results are collected in sctx->clone_roots->ino/offset/found_refs
+ */
+static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
+{
+ struct backref_ctx *bctx = ctx_;
+ struct clone_root *found;
+ int ret;
+ u64 i_size;
+
+ /* First check if the root is in the list of accepted clone sources */
+ found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
+ bctx->sctx->clone_roots_cnt,
+ sizeof(struct clone_root),
+ __clone_root_cmp_bsearch);
+ if (!found)
+ return 0;
+
+ if (found->root == bctx->sctx->send_root &&
+ ino == bctx->cur_objectid &&
+ offset == bctx->cur_offset) {
+ bctx->found_itself = 1;
+ }
+
+ /*
+ * There are inodes that have extents that lie behind its i_size. Don't
+ * accept clones from these extents.
+ */
+ ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
+ NULL, NULL, NULL);
+ btrfs_release_path(bctx->path);
+ if (ret < 0)
+ return ret;
+
+ if (offset + bctx->extent_len > i_size)
+ return 0;
+
+ /*
+ * Make sure we don't consider clones from send_root that are
+ * behind the current inode/offset.
+ */
+ if (found->root == bctx->sctx->send_root) {
+ /*
+ * TODO for the moment we don't accept clones from the inode
+ * that is currently being sent. We may change this when
+ * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
+ * file.
+ */
+ if (ino >= bctx->cur_objectid)
+ return 0;
+#if 0
+ if (ino > bctx->cur_objectid)
+ return 0;
+ if (offset + bctx->extent_len > bctx->cur_offset)
+ return 0;
+#endif
+ }
+
+ bctx->found++;
+ found->found_refs++;
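+ /*
+ * Track the lowest inode number seen in this clone root (and the
+ * offset where the extent was found) as the clone source candidate.
+ */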
+ if (ino < found->ino) {
+ found->ino = ino;
+ found->offset = offset;
+ } else if (found->ino == ino) {
+ /*
+ * The same extent was found more than once in the same file.
+ */
+ if (found->offset > offset + bctx->extent_len)
+ found->offset = offset;
+ }
+
+ return 0;
+}
+
+/*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point the send stream is
+ * currently at. This means that no clones are accepted which lie behind the
+ * current inode+offset.
+ *
+ * path must point to the extent item when called.
+ */
+static int find_extent_clone(struct send_ctx *sctx,
+ struct btrfs_path *path,
+ u64 ino, u64 data_offset,
+ u64 ino_size,
+ struct clone_root **found)
+{
+ int ret;
+ int extent_type;
+ u64 logical;
+ u64 disk_byte;
+ u64 num_bytes;
+ u64 extent_item_pos;
+ u64 flags = 0;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *eb = path->nodes[0];
+ struct backref_ctx *backref_ctx = NULL;
+ struct clone_root *cur_clone_root;
+ struct btrfs_key found_key;
+ struct btrfs_path *tmp_path;
+ int compressed;
+ u32 i;
+
+ tmp_path = alloc_path_for_send();
+ if (!tmp_path)
+ return -ENOMEM;
+
+ /* We only use this path under the commit sem */
+ tmp_path->need_commit_sem = 0;
+
+ backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+ if (!backref_ctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ backref_ctx->path = tmp_path;
+
+ if (data_offset >= ino_size) {
+ /*
+ * There may be extents that lie behind the file's size.
+ * This was observed in combination with snapshotting while
+ * writing large files.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ fi = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -ENOENT;
+ goto out;
+ }
+ compressed = btrfs_file_extent_compression(eb, fi);
+
+ num_bytes = btrfs_file_extent_num_bytes(eb, fi);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte == 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+ logical = disk_byte + btrfs_file_extent_offset(eb, fi);
+
+ down_read(&sctx->send_root->fs_info->commit_root_sem);
+ ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
+ &found_key, &flags);
+ up_read(&sctx->send_root->fs_info->commit_root_sem);
+ btrfs_release_path(tmp_path);
+
+ if (ret < 0)
+ goto out;
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * Setup the clone roots.
+ */
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ cur_clone_root = sctx->clone_roots + i;
+ cur_clone_root->ino = (u64)-1;
+ cur_clone_root->offset = 0;
+ cur_clone_root->found_refs = 0;
+ }
+
+ backref_ctx->sctx = sctx;
+ backref_ctx->found = 0;
+ backref_ctx->cur_objectid = ino;
+ backref_ctx->cur_offset = data_offset;
+ backref_ctx->found_itself = 0;
+ backref_ctx->extent_len = num_bytes;
+
+ /*
+ * The last extent of a file may be too large due to page alignment.
+ * We need to adjust extent_len in this case so that the checks in
+ * __iterate_backrefs work.
+ */
+ if (data_offset + num_bytes >= ino_size)
+ backref_ctx->extent_len = ino_size - data_offset;
+
+ /*
+ * Now collect all backrefs.
+ */
+ if (compressed == BTRFS_COMPRESS_NONE)
+ extent_item_pos = logical - found_key.objectid;
+ else
+ extent_item_pos = 0;
+ ret = iterate_extent_inodes(sctx->send_root->fs_info,
+ found_key.objectid, extent_item_pos, 1,
+ __iterate_backrefs, backref_ctx);
+
+ if (ret < 0)
+ goto out;
+
+ if (!backref_ctx->found_itself) {
+ /* found a bug in backref code? */
+ ret = -EIO;
+ btrfs_err(sctx->send_root->fs_info, "did not find backref in "
+ "send_root. inode=%llu, offset=%llu, "
+ "disk_byte=%llu found extent=%llu",
+ ino, data_offset, disk_byte, found_key.objectid);
+ goto out;
+ }
+
+verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
+ "ino=%llu, "
+ "num_bytes=%llu, logical=%llu\n",
+ data_offset, ino, num_bytes, logical);
+
+ if (!backref_ctx->found)
+ verbose_printk("btrfs: no clones found\n");
+
+ cur_clone_root = NULL;
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ if (sctx->clone_roots[i].found_refs) {
+ if (!cur_clone_root)
+ cur_clone_root = sctx->clone_roots + i;
+ else if (sctx->clone_roots[i].root == sctx->send_root)
+ /* prefer clones from send_root over others */
+ cur_clone_root = sctx->clone_roots + i;
+ }
+
+ }
+
+ if (cur_clone_root) {
+ if (compressed != BTRFS_COMPRESS_NONE) {
+ /*
+ * Offsets given by iterate_extent_inodes() are relative
+ * to the start of the extent, we need to add logical
+ * offset from the file extent item.
+ * (See why at backref.c:check_extent_in_eb())
+ */
+ cur_clone_root->offset += btrfs_file_extent_offset(eb,
+ fi);
+ }
+ *found = cur_clone_root;
+ ret = 0;
+ } else {
+ ret = -ENOENT;
+ }
+
+out:
+ btrfs_free_path(tmp_path);
+ kfree(backref_ctx);
+ return ret;
+}
+
+static int read_symlink(struct btrfs_root *root,
+ u64 ino,
+ struct fs_path *dest)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+ u8 compression;
+ unsigned long off;
+ int len;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret);
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], ei);
+ compression = btrfs_file_extent_compression(path->nodes[0], ei);
+ BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
+ BUG_ON(compression);
+
+ off = btrfs_file_extent_inline_start(ei);
+ len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);
+
+ ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Helper function to generate a file name that is unique in the root of
+ * send_root and parent_root. This is used to generate names for orphan inodes.
+ */
+static int gen_unique_name(struct send_ctx *sctx,
+ u64 ino, u64 gen,
+ struct fs_path *dest)
+{
+ int ret = 0;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ char tmp[64];
+ int len;
+ u64 idx = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
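+ /*
+ * Try names of the form o<ino>-<gen>-<idx>, bumping idx until the
+ * name is free in both the send root and (if set) the parent root.
+ */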
+ while (1) {
+ len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
+ ino, gen, idx);
+ ASSERT(len < sizeof(tmp));
+
+ di = btrfs_lookup_dir_item(NULL, sctx->send_root,
+ path, BTRFS_FIRST_FREE_OBJECTID,
+ tmp, strlen(tmp), 0);
+ btrfs_release_path(path);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ if (di) {
+ /* not unique, try again */
+ idx++;
+ continue;
+ }
+
+ if (!sctx->parent_root) {
+ /* unique */
+ ret = 0;
+ break;
+ }
+
+ di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
+ path, BTRFS_FIRST_FREE_OBJECTID,
+ tmp, strlen(tmp), 0);
+ btrfs_release_path(path);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ if (di) {
+ /* not unique, try again */
+ idx++;
+ continue;
+ }
+ /* unique */
+ break;
+ }
+
+ ret = fs_path_add(dest, tmp, strlen(tmp));
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+enum inode_state {
+ inode_state_no_change,
+ inode_state_will_create,
+ inode_state_did_create,
+ inode_state_will_delete,
+ inode_state_did_delete,
+};
+
+static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret;
+ int left_ret;
+ int right_ret;
+ u64 left_gen;
+ u64 right_gen;
+
+ ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
+ NULL, NULL);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ left_ret = ret;
+
+ if (!sctx->parent_root) {
+ right_ret = -ENOENT;
+ } else {
+ ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
+ NULL, NULL, NULL, NULL);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ right_ret = ret;
+ }
+
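+ /*
+ * left_ret/right_ret tell whether the inode exists in the send root
+ * and/or the parent root; together with the generations and
+ * send_progress this determines the inode's state.
+ */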
+ if (!left_ret && !right_ret) {
+ if (left_gen == gen && right_gen == gen) {
+ ret = inode_state_no_change;
+ } else if (left_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_create;
+ else
+ ret = inode_state_will_create;
+ } else if (right_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_delete;
+ else
+ ret = inode_state_will_delete;
+ } else {
+ ret = -ENOENT;
+ }
+ } else if (!left_ret) {
+ if (left_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_create;
+ else
+ ret = inode_state_will_create;
+ } else {
+ ret = -ENOENT;
+ }
+ } else if (!right_ret) {
+ if (right_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_delete;
+ else
+ ret = inode_state_will_delete;
+ } else {
+ ret = -ENOENT;
+ }
+ } else {
+ ret = -ENOENT;
+ }
+
+out:
+ return ret;
+}
+
+static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret;
+
+ ret = get_cur_inode_state(sctx, ino, gen);
+ if (ret < 0)
+ goto out;
+
+ if (ret == inode_state_no_change ||
+ ret == inode_state_did_create ||
+ ret == inode_state_will_delete)
+ ret = 1;
+ else
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * Helper function to lookup a dir item in a dir.
+ */
+static int lookup_dir_item_inode(struct btrfs_root *root,
+ u64 dir, const char *name, int name_len,
+ u64 *found_inode,
+ u8 *found_type)
+{
+ int ret = 0;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_dir_item(NULL, root, path,
+ dir, name, name_len, 0);
+ if (!di) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
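+ /*
+ * Dir items that point to a subvolume root are not regular inodes;
+ * treat them as non-existent here.
+ */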
+ if (key.type == BTRFS_ROOT_ITEM_KEY) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *found_inode = key.objectid;
+ *found_type = btrfs_dir_type(path->nodes[0], di);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
+ * generation of the parent dir and the name of the dir entry.
+ */
+static int get_first_ref(struct btrfs_root *root, u64 ino,
+ u64 *dir, u64 *dir_gen, struct fs_path *name)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ int len;
+ u64 parent_dir;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (ret || found_key.objectid != ino ||
+ (found_key.type != BTRFS_INODE_REF_KEY &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (found_key.type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *iref;
+ iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(path->nodes[0], iref);
+ ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+ (unsigned long)(iref + 1),
+ len);
+ parent_dir = found_key.offset;
+ } else {
+ struct btrfs_inode_extref *extref;
+ extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_extref);
+ len = btrfs_inode_extref_name_len(path->nodes[0], extref);
+ ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+ (unsigned long)&extref->name, len);
+ parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
+ }
+ if (ret < 0)
+ goto out;
+ btrfs_release_path(path);
+
+ if (dir_gen) {
+ ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ }
+
+ *dir = parent_dir;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int is_first_ref(struct btrfs_root *root,
+ u64 ino, u64 dir,
+ const char *name, int name_len)
+{
+ int ret;
+ struct fs_path *tmp_name;
+ u64 tmp_dir;
+
+ tmp_name = fs_path_alloc();
+ if (!tmp_name)
+ return -ENOMEM;
+
+ ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
+ if (ret < 0)
+ goto out;
+
+ if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = !memcmp(tmp_name->start, name, name_len);
+
+out:
+ fs_path_free(tmp_name);
+ return ret;
+}
+
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is, however, only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
+static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ const char *name, int name_len,
+ u64 *who_ino, u64 *who_gen)
+{
+ int ret = 0;
+ u64 gen;
+ u64 other_inode = 0;
+ u8 other_type = 0;
+
+ if (!sctx->parent_root)
+ goto out;
+
+ ret = is_inode_existent(sctx, dir, dir_gen);
+ if (ret <= 0)
+ goto out;
+
+ /*
+ * If we have a parent root, we need to verify that the parent dir was
+ * not deleted and then re-created; if it was, there is no overwrite
+ * and we can just unlink this entry.
+ */
+ if (sctx->parent_root) {
+ ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
+ NULL, NULL, NULL);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ if (gen != dir_gen)
+ goto out;
+ }
+
+ ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
+ &other_inode, &other_type);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Check if the overwritten ref was already processed. If yes, the ref
+ * was already unlinked/moved, so we can safely assume that we will not
+ * overwrite anything at this point in time.
+ */
+ if (other_inode > sctx->send_progress) {
+ ret = get_inode_info(sctx->parent_root, other_inode, NULL,
+ who_gen, NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ ret = 1;
+ *who_ino = other_inode;
+ } else {
+ ret = 0;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs to be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
+static int did_overwrite_ref(struct send_ctx *sctx,
+ u64 dir, u64 dir_gen,
+ u64 ino, u64 ino_gen,
+ const char *name, int name_len)
+{
+ int ret = 0;
+ u64 gen;
+ u64 ow_inode;
+ u8 other_type;
+
+ if (!sctx->parent_root)
+ goto out;
+
+ ret = is_inode_existent(sctx, dir, dir_gen);
+ if (ret <= 0)
+ goto out;
+
+ /* check if the ref was overwritten by another ref */
+ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
+ &ow_inode, &other_type);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ /* was never and will never be overwritten */
+ ret = 0;
+ goto out;
+ }
+
+ ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
+ NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ if (ow_inode == ino && gen == ino_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ /* we know that it is or will be overwritten. check this now */
+ if (ow_inode < sctx->send_progress)
+ ret = 1;
+ else
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
+static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret = 0;
+ struct fs_path *name = NULL;
+ u64 dir;
+ u64 dir_gen;
+
+ if (!sctx->parent_root)
+ goto out;
+
+ name = fs_path_alloc();
+ if (!name)
+ return -ENOMEM;
+
+ ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
+ if (ret < 0)
+ goto out;
+
+ ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
+ name->start, fs_path_len(name));
+
+out:
+ fs_path_free(name);
+ return ret;
+}
+
+/*
+ * Insert a name cache entry. On 32-bit kernels the radix tree index is 32 bits,
+ * so we need to do some special handling in case we have clashes. This function
+ * takes care of this with the help of name_cache_entry::radix_list.
+ * In case of error, nce is kfreed.
+ */
+static int name_cache_insert(struct send_ctx *sctx,
+ struct name_cache_entry *nce)
+{
+ int ret = 0;
+ struct list_head *nce_head;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
+ if (!nce_head) {
+ nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+ if (!nce_head) {
+ kfree(nce);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(nce_head);
+
+ ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+ if (ret < 0) {
+ kfree(nce_head);
+ kfree(nce);
+ return ret;
+ }
+ }
+ list_add_tail(&nce->radix_list, nce_head);
+ list_add_tail(&nce->list, &sctx->name_cache_list);
+ sctx->name_cache_size++;
+
+ return ret;
+}
+
+static void name_cache_delete(struct send_ctx *sctx,
+ struct name_cache_entry *nce)
+{
+ struct list_head *nce_head;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
+ if (!nce_head) {
+ btrfs_err(sctx->send_root->fs_info,
+ "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+ nce->ino, sctx->name_cache_size);
+ }
+
+ list_del(&nce->radix_list);
+ list_del(&nce->list);
+ sctx->name_cache_size--;
+
+ /*
+ * We may not get to the final release of nce_head if the lookup fails
+ */
+ if (nce_head && list_empty(nce_head)) {
+ radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+ kfree(nce_head);
+ }
+}
+
+static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
+ u64 ino, u64 gen)
+{
+ struct list_head *nce_head;
+ struct name_cache_entry *cur;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+ if (!nce_head)
+ return NULL;
+
+ list_for_each_entry(cur, nce_head, radix_list) {
+ if (cur->ino == ino && cur->gen == gen)
+ return cur;
+ }
+ return NULL;
+}
+
+/*
+ * Removes the entry from the list and adds it back to the end. This marks the
+ * entry as recently used so that name_cache_clean_unused does not remove it.
+ */
+static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
+{
+ list_del(&nce->list);
+ list_add_tail(&nce->list, &sctx->name_cache_list);
+}
+
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
+static void name_cache_clean_unused(struct send_ctx *sctx)
+{
+ struct name_cache_entry *nce;
+
+ if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
+ return;
+
+ while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
+ nce = list_entry(sctx->name_cache_list.next,
+ struct name_cache_entry, list);
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
+}
+
+static void name_cache_free(struct send_ctx *sctx)
+{
+ struct name_cache_entry *nce;
+
+ while (!list_empty(&sctx->name_cache_list)) {
+ nce = list_entry(sctx->name_cache_list.next,
+ struct name_cache_entry, list);
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
+}
+
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode does not exist or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
+static int __get_cur_name_and_parent(struct send_ctx *sctx,
+ u64 ino, u64 gen,
+ u64 *parent_ino,
+ u64 *parent_gen,
+ struct fs_path *dest)
+{
+ int ret;
+ int nce_ret;
+ struct name_cache_entry *nce = NULL;
+
+ /*
+ * First check if we already did a call to this function with the same
+ * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
+ * return the cached result.
+ */
+ nce = name_cache_search(sctx, ino, gen);
+ if (nce) {
+ if (ino < sctx->send_progress && nce->need_later_update) {
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ nce = NULL;
+ } else {
+ name_cache_used(sctx, nce);
+ *parent_ino = nce->parent_ino;
+ *parent_gen = nce->parent_gen;
+ ret = fs_path_add(dest, nce->name, nce->name_len);
+ if (ret < 0)
+ goto out;
+ ret = nce->ret;
+ goto out;
+ }
+ }
+
+ /*
+ * If the inode does not exist yet, add the orphan name and return 1.
+ * This should only happen for the parent dir that we determine in
+ * __record_new_ref.
+ */
+ ret = is_inode_existent(sctx, ino, gen);
+ if (ret < 0)
+ goto out;
+
+ if (!ret) {
+ ret = gen_unique_name(sctx, ino, gen, dest);
+ if (ret < 0)
+ goto out;
+ ret = 1;
+ goto out_cache;
+ }
+
+ /*
+ * Depending on whether the inode was already processed or not, use
+ * send_root or parent_root for ref lookup.
+ */
+ if (ino < sctx->send_progress)
+ ret = get_first_ref(sctx->send_root, ino,
+ parent_ino, parent_gen, dest);
+ else
+ ret = get_first_ref(sctx->parent_root, ino,
+ parent_ino, parent_gen, dest);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Check if the ref was overwritten by an inode's ref that was processed
+ * earlier. If yes, treat as orphan and return 1.
+ */
+ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
+ dest->start, dest->end - dest->start);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ fs_path_reset(dest);
+ ret = gen_unique_name(sctx, ino, gen, dest);
+ if (ret < 0)
+ goto out;
+ ret = 1;
+ }
+
+out_cache:
+ /*
+ * Store the result of the lookup in the name cache.
+ */
+ nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
+ if (!nce) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ nce->ino = ino;
+ nce->gen = gen;
+ nce->parent_ino = *parent_ino;
+ nce->parent_gen = *parent_gen;
+ nce->name_len = fs_path_len(dest);
+ nce->ret = ret;
+ strcpy(nce->name, dest->start);
+
+ if (ino < sctx->send_progress)
+ nce->need_later_update = 0;
+ else
+ nce->need_later_update = 1;
+
+ nce_ret = name_cache_insert(sctx, nce);
+ if (nce_ret < 0)
+ ret = nce_ret;
+ name_cache_clean_unused(sctx);
+
+out:
+ return ret;
+}
+
+/*
+ * Magic happens here. This function returns the first ref to an inode as it
+ * would look like while receiving the stream at this point in time.
+ * We walk the path up to the root. For every inode in between, we check if it
+ * was already processed/sent. If yes, we continue with the parent as found
+ * in send_root. If not, we continue with the parent as found in parent_root.
+ * If we encounter an inode that was deleted at this point in time, we use the
+ * inode's "orphan" name instead of the real name and stop. Same with new inodes
+ * that were not created yet and overwritten inodes/refs.
+ *
+ * When do we have orphan inodes:
+ * 1. When an inode is freshly created and thus no valid refs are available yet
+ * 2. When a directory lost all its refs (deleted) but still has dir items
+ * inside which were not processed yet (pending for move/delete). If anyone
+ * tried to get the path to the dir items, it would get a path inside that
+ * orphan directory.
+ * 3. When an inode is moved around or gets new links, it may overwrite the ref
+ * of an unprocessed inode. If in that case the first ref would be
+ * overwritten, the overwritten inode gets "orphanized". Later when we
+ * process this overwritten inode, it is restored at a new place by moving
+ * the orphan inode.
+ *
+ * sctx->send_progress tells this function at which point in time receiving
+ * would be.
+ */
+static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
+ struct fs_path *dest)
+{
+ int ret = 0;
+ struct fs_path *name = NULL;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ int stop = 0;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dest->reversed = 1;
+ fs_path_reset(dest);
+
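+ /*
+ * Walk up from ino to the subvolume root, adding one path component
+ * per iteration (dest is built in reverse and unreversed at the end).
+ */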
+ while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino)) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(dest, name);
+ break;
+ }
+
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret)
+ stop = 1;
+ }
+
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add_path(dest, name);
+ if (ret < 0)
+ goto out;
+
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+
+out:
+ fs_path_free(name);
+ if (!ret)
+ fs_path_unreverse(dest);
+ return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
+ */
+static int send_subvol_begin(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *send_root = sctx->send_root;
+ struct btrfs_root *parent_root = sctx->parent_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ char *name = NULL;
+ int namelen;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
+ if (!name) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
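+ /* The ROOT_BACKREF item in the tree root stores the subvolume's name. */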
+ key.objectid = send_root->objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
+ &key, path, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.type != BTRFS_ROOT_BACKREF_KEY ||
+ key.objectid != send_root->objectid) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ namelen = btrfs_root_ref_name_len(leaf, ref);
+ read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
+ btrfs_release_path(path);
+
+ if (parent_root) {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
+ if (ret < 0)
+ goto out;
+ }
+
+ TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.uuid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
+ le64_to_cpu(sctx->send_root->root_item.ctransid));
+ if (parent_root) {
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ sctx->parent_root->root_item.uuid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+ le64_to_cpu(sctx->parent_root->root_item.ctransid));
+ }
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ btrfs_free_path(path);
+ kfree(name);
+ return ret;
+}
+
+static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret = 0;
+ struct fs_path *p = NULL;
+ struct btrfs_inode_item *ii;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ int slot;
+
+verbose_printk("btrfs: send_utimes %llu\n", ino);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ path = alloc_path_for_send();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
+ btrfs_inode_atime(ii));
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
+ btrfs_inode_mtime(ii));
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
+ btrfs_inode_ctime(ii));
+ /* TODO Add otime support when the otime patches get into upstream */
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
+ * a valid path yet because we have not processed the refs yet. So the inode
+ * is created as an orphan.
+ */
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
+{
+ int ret = 0;
+ struct fs_path *p;
+ int cmd;
+ u64 gen;
+ u64 mode;
+ u64 rdev;
+
+verbose_printk("btrfs: send_create_inode %llu\n", ino);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ if (ino != sctx->cur_ino) {
+ ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+ NULL, NULL, &rdev);
+ if (ret < 0)
+ goto out;
+ } else {
+ gen = sctx->cur_inode_gen;
+ mode = sctx->cur_inode_mode;
+ rdev = sctx->cur_inode_rdev;
+ }
+
+ if (S_ISREG(mode)) {
+ cmd = BTRFS_SEND_C_MKFILE;
+ } else if (S_ISDIR(mode)) {
+ cmd = BTRFS_SEND_C_MKDIR;
+ } else if (S_ISLNK(mode)) {
+ cmd = BTRFS_SEND_C_SYMLINK;
+ } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
+ cmd = BTRFS_SEND_C_MKNOD;
+ } else if (S_ISFIFO(mode)) {
+ cmd = BTRFS_SEND_C_MKFIFO;
+ } else if (S_ISSOCK(mode)) {
+ cmd = BTRFS_SEND_C_MKSOCK;
+ } else {
+ printk(KERN_WARNING "btrfs: unexpected inode type %o",
+ (int)(mode & S_IFMT));
+ ret = -ENOTSUPP;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, cmd);
+ if (ret < 0)
+ goto out;
+
+ ret = gen_unique_name(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
+
+ if (S_ISLNK(mode)) {
+ fs_path_reset(p);
+ ret = read_symlink(sctx->send_root, ino, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
+ } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
+ S_ISFIFO(mode) || S_ISSOCK(mode)) {
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
+ }
+
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ goto out;
+
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+/*
+ * We need some special handling for inodes that get processed before their
+ * parent directory is created. See process_recorded_refs for details.
+ * This function checks if we already created the dir out of order.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+ int ret = 0;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_key di_key;
+ struct extent_buffer *eb;
+ struct btrfs_dir_item *di;
+ int slot;
+
+ path = alloc_path_for_send();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
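+ /*
+ * Walk the DIR_INDEX items of this dir; an entry that points to an
+ * already processed inode means the dir was already created out of
+ * order.
+ */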
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(sctx->send_root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
+ di_key.objectid < sctx->send_progress) {
+ ret = 1;
+ goto out;
+ }
+
+ path->slots[0]++;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Only creates the inode if it is:
+ * 1. Not a directory
+ * 2. Or a directory that was not already created due to out-of-order
+ * directories. See did_create_dir and process_recorded_refs for details.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+ int ret;
+
+ if (S_ISDIR(sctx->cur_inode_mode)) {
+ ret = did_create_dir(sctx, sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ ret = send_create_inode(sctx, sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+
+out:
+ return ret;
+}
+
+struct recorded_ref {
+ struct list_head list;
+ char *dir_path;
+ char *name;
+ struct fs_path *full_path;
+ u64 dir;
+ u64 dir_gen;
+ int dir_path_len;
+ int name_len;
+};
+
+/*
+ * We need to process new refs before deleted refs, but compare_tree gives us
+ * everything mixed. So we first record all refs and later process them.
+ * This function is a helper to record one ref.
+ */
+static int __record_ref(struct list_head *head, u64 dir,
+ u64 dir_gen, struct fs_path *path)
+{
+ struct recorded_ref *ref;
+
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ ref->dir = dir;
+ ref->dir_gen = dir_gen;
+ ref->full_path = path;
+
+ ref->name = (char *)kbasename(ref->full_path->start);
+ ref->name_len = ref->full_path->end - ref->name;
+ ref->dir_path = ref->full_path->start;
+ if (ref->name == ref->full_path->start)
+ ref->dir_path_len = 0;
+ else
+ ref->dir_path_len = ref->full_path->end -
+ ref->full_path->start - 1 - ref->name_len;
+
+ list_add_tail(&ref->list, head);
+ return 0;
+}
+
+static int dup_ref(struct recorded_ref *ref, struct list_head *list)
+{
+ struct recorded_ref *new;
+
+ new = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!new)
+ return -ENOMEM;
+
+ new->dir = ref->dir;
+ new->dir_gen = ref->dir_gen;
+ new->full_path = NULL;
+ INIT_LIST_HEAD(&new->list);
+ list_add_tail(&new->list, list);
+ return 0;
+}
+
+static void __free_recorded_refs(struct list_head *head)
+{
+ struct recorded_ref *cur;
+
+ while (!list_empty(head)) {
+ cur = list_entry(head->next, struct recorded_ref, list);
+ fs_path_free(cur->full_path);
+ list_del(&cur->list);
+ kfree(cur);
+ }
+}
+
+static void free_recorded_refs(struct send_ctx *sctx)
+{
+ __free_recorded_refs(&sctx->new_refs);
+ __free_recorded_refs(&sctx->deleted_refs);
+}
+
+/*
+ * Renames/moves a file/dir to its orphan name. Used when the first
+ * ref of an unprocessed inode gets overwritten and for all non-empty
+ * directories.
+ */
+static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
+ struct fs_path *path)
+{
+ int ret;
+ struct fs_path *orphan;
+
+ orphan = fs_path_alloc();
+ if (!orphan)
+ return -ENOMEM;
+
+ ret = gen_unique_name(sctx, ino, gen, orphan);
+ if (ret < 0)
+ goto out;
+
+ ret = send_rename(sctx, path, orphan);
+
+out:
+ fs_path_free(orphan);
+ return ret;
+}
+
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node **p = &sctx->orphan_dirs.rb_node;
+ struct rb_node *parent = NULL;
+ struct orphan_dir_info *entry, *odi;
+
+ odi = kmalloc(sizeof(*odi), GFP_NOFS);
+ if (!odi)
+ return ERR_PTR(-ENOMEM);
+ odi->ino = dir_ino;
+ odi->gen = 0;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (dir_ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(odi);
+ return entry;
+ }
+ }
+
+ rb_link_node(&odi->node, parent, p);
+ rb_insert_color(&odi->node, &sctx->orphan_dirs);
+ return odi;
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node *n = sctx->orphan_dirs.rb_node;
+ struct orphan_dir_info *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino)
+ n = n->rb_left;
+ else if (dir_ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+ return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+ struct orphan_dir_info *odi)
+{
+ if (!odi)
+ return;
+ rb_erase(&odi->node, &sctx->orphan_dirs);
+ kfree(odi);
+}
+
+/*
+ * Returns 1 if a directory can be removed at this point in time.
+ * We check this by iterating all dir items and checking if the inode behind
+ * the dir item was already processed.
+ */
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ u64 send_progress)
+{
+ int ret = 0;
+ struct btrfs_root *root = sctx->parent_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_key loc;
+ struct btrfs_dir_item *di;
+
+ /*
+ * Don't try to rmdir the top/root subvolume dir.
+ */
+ if (dir == BTRFS_FIRST_FREE_OBJECTID)
+ return 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ struct waiting_dir_move *dm;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type)
+ break;
+
+ di = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+
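+ /*
+ * If the child inode is still waiting to be moved, defer the rmdir:
+ * record it in the waiting_dir_move entry so apply_dir_move() can
+ * remove the dir once the move is done.
+ */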
+ dm = get_waiting_dir_move(sctx, loc.objectid);
+ if (dm) {
+ struct orphan_dir_info *odi;
+
+ odi = add_orphan_dir_info(sctx, dir);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ dm->rmdir_ino = dir;
+ ret = 0;
+ goto out;
+ }
+
+ if (loc.objectid > send_progress) {
+ ret = 0;
+ goto out;
+ }
+
+ path->slots[0]++;
+ }
+
+ ret = 1;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
+{
+ struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
+
+ return entry != NULL;
+}
+
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+ struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
+ struct rb_node *parent = NULL;
+ struct waiting_dir_move *entry, *dm;
+
+ dm = kmalloc(sizeof(*dm), GFP_NOFS);
+ if (!dm)
+ return -ENOMEM;
+ dm->ino = ino;
+ dm->rmdir_ino = 0;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct waiting_dir_move, node);
+ if (ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(dm);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&dm->node, parent, p);
+ rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
+ return 0;
+}
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+ struct rb_node *n = sctx->waiting_dir_moves.rb_node;
+ struct waiting_dir_move *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct waiting_dir_move, node);
+ if (ino < entry->ino)
+ n = n->rb_left;
+ else if (ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static void free_waiting_dir_move(struct send_ctx *sctx,
+ struct waiting_dir_move *dm)
+{
+ if (!dm)
+ return;
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+}
+
+static int add_pending_dir_move(struct send_ctx *sctx,
+ u64 ino,
+ u64 ino_gen,
+ u64 parent_ino,
+ struct list_head *new_refs,
+ struct list_head *deleted_refs)
+{
+ struct rb_node **p = &sctx->pending_dir_moves.rb_node;
+ struct rb_node *parent = NULL;
+ struct pending_dir_move *entry = NULL, *pm;
+ struct recorded_ref *cur;
+ int exists = 0;
+ int ret;
+
+ pm = kmalloc(sizeof(*pm), GFP_NOFS);
+ if (!pm)
+ return -ENOMEM;
+ pm->parent_ino = parent_ino;
+ pm->ino = ino;
+ pm->gen = ino_gen;
+ INIT_LIST_HEAD(&pm->list);
+ INIT_LIST_HEAD(&pm->update_refs);
+ RB_CLEAR_NODE(&pm->node);
+
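+ /*
+ * Pending moves are keyed by the parent inode; moves waiting on the
+ * same parent share one rbtree node and are chained on entry->list.
+ */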
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct pending_dir_move, node);
+ if (parent_ino < entry->parent_ino) {
+ p = &(*p)->rb_left;
+ } else if (parent_ino > entry->parent_ino) {
+ p = &(*p)->rb_right;
+ } else {
+ exists = 1;
+ break;
+ }
+ }
+
+ list_for_each_entry(cur, deleted_refs, list) {
+ ret = dup_ref(cur, &pm->update_refs);
+ if (ret < 0)
+ goto out;
+ }
+ list_for_each_entry(cur, new_refs, list) {
+ ret = dup_ref(cur, &pm->update_refs);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = add_waiting_dir_move(sctx, pm->ino);
+ if (ret)
+ goto out;
+
+ if (exists) {
+ list_add_tail(&pm->list, &entry->list);
+ } else {
+ rb_link_node(&pm->node, parent, p);
+ rb_insert_color(&pm->node, &sctx->pending_dir_moves);
+ }
+ ret = 0;
+out:
+ if (ret) {
+ __free_recorded_refs(&pm->update_refs);
+ kfree(pm);
+ }
+ return ret;
+}
+
+static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
+ u64 parent_ino)
+{
+ struct rb_node *n = sctx->pending_dir_moves.rb_node;
+ struct pending_dir_move *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct pending_dir_move, node);
+ if (parent_ino < entry->parent_ino)
+ n = n->rb_left;
+ else if (parent_ino > entry->parent_ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+ u64 ino, u64 gen, u64 *ancestor_ino)
+{
+ int ret = 0;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ u64 start_ino = ino;
+
+ *ancestor_ino = 0;
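+ /*
+ * Walk up the ancestors; if we get back to the inode we started from,
+ * there is a rename/move loop and the caller has to defer this move.
+ */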
+ while (ino != BTRFS_FIRST_FREE_OBJECTID) {
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino))
+ break;
+ if (is_waiting_for_move(sctx, ino)) {
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret < 0)
+ break;
+ if (parent_inode == start_ino) {
+ ret = 1;
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ break;
+ }
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+ return ret;
+}
+
+static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
+{
+ struct fs_path *from_path = NULL;
+ struct fs_path *to_path = NULL;
+ struct fs_path *name = NULL;
+ u64 orig_progress = sctx->send_progress;
+ struct recorded_ref *cur;
+ u64 parent_ino, parent_gen;
+ struct waiting_dir_move *dm = NULL;
+ u64 rmdir_ino = 0;
+ int ret;
+ u64 ancestor = 0;
+
+ name = fs_path_alloc();
+ from_path = fs_path_alloc();
+ if (!name || !from_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ rmdir_ino = dm->rmdir_ino;
+ free_waiting_dir_move(sctx, dm);
+
+ ret = get_first_ref(sctx->parent_root, pm->ino,
+ &parent_ino, &parent_gen, name);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, parent_ino, parent_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ if (ret < 0)
+ goto out;
+
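+ /*
+ * Temporarily advance send_progress so that path lookups reflect the
+ * state after the current inode has been processed; it is restored at
+ * the end of this function.
+ */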
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+ if (ret) {
+ LIST_HEAD(deleted_refs);
+ ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+ ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+ &pm->update_refs, &deleted_refs);
+ if (ret < 0)
+ goto out;
+ if (rmdir_ino) {
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ dm->rmdir_ino = rmdir_ino;
+ }
+ goto out;
+ }
+ fs_path_reset(name);
+ to_path = name;
+ name = NULL;
+ ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
+ if (ret < 0)
+ goto out;
+
+ ret = send_rename(sctx, from_path, to_path);
+ if (ret < 0)
+ goto out;
+
+ if (rmdir_ino) {
+ struct orphan_dir_info *odi;
+
+ odi = get_orphan_dir_info(sctx, rmdir_ino);
+ if (!odi) {
+ /* already deleted */
+ goto finish;
+ }
+ ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ goto finish;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, name);
+ if (ret < 0)
+ goto out;
+ free_orphan_dir_info(sctx, odi);
+ }
+
+finish:
+ ret = send_utimes(sctx, pm->ino, pm->gen);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * After rename/move, need to update the utimes of both new parent(s)
+ * and old parent(s).
+ */
+ list_for_each_entry(cur, &pm->update_refs, list) {
+ if (cur->dir == rmdir_ino)
+ continue;
+ ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ fs_path_free(name);
+ fs_path_free(from_path);
+ fs_path_free(to_path);
+ sctx->send_progress = orig_progress;
+
+ return ret;
+}
+
+static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
+{
+ if (!list_empty(&m->list))
+ list_del(&m->list);
+ if (!RB_EMPTY_NODE(&m->node))
+ rb_erase(&m->node, &sctx->pending_dir_moves);
+ __free_recorded_refs(&m->update_refs);
+ kfree(m);
+}
+
+static void tail_append_pending_moves(struct pending_dir_move *moves,
+ struct list_head *stack)
+{
+ if (list_empty(&moves->list)) {
+ list_add_tail(&moves->list, stack);
+ } else {
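+ /*
+ * moves already has other pending moves chained on its list;
+ * splice them onto the stack too, keeping moves itself first.
+ */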
+ LIST_HEAD(list);
+ list_splice_init(&moves->list, &list);
+ list_add_tail(&moves->list, stack);
+ list_splice_tail(&list, stack);
+ }
+}
+
+static int apply_children_dir_moves(struct send_ctx *sctx)
+{
+ struct pending_dir_move *pm;
+ struct list_head stack;
+ u64 parent_ino = sctx->cur_ino;
+ int ret = 0;
+
+ pm = get_pending_dir_moves(sctx, parent_ino);
+ if (!pm)
+ return 0;
+
+ INIT_LIST_HEAD(&stack);
+ tail_append_pending_moves(pm, &stack);
+
+ while (!list_empty(&stack)) {
+ pm = list_first_entry(&stack, struct pending_dir_move, list);
+ parent_ino = pm->ino;
+ ret = apply_dir_move(sctx, pm);
+ free_pending_move(sctx, pm);
+ if (ret)
+ goto out;
+ pm = get_pending_dir_moves(sctx, parent_ino);
+ if (pm)
+ tail_append_pending_moves(pm, &stack);
+ }
+ return 0;
+
+out:
+ while (!list_empty(&stack)) {
+ pm = list_first_entry(&stack, struct pending_dir_move, list);
+ free_pending_move(sctx, pm);
+ }
+ return ret;
+}
+
+static int wait_for_parent_move(struct send_ctx *sctx,
+ struct recorded_ref *parent_ref)
+{
+ int ret = 0;
+ u64 ino = parent_ref->dir;
+ u64 parent_ino_before, parent_ino_after;
+ struct fs_path *path_before = NULL;
+ struct fs_path *path_after = NULL;
+ int len1, len2;
+
+ path_after = fs_path_alloc();
+ path_before = fs_path_alloc();
+ if (!path_after || !path_before) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Our current directory inode may not yet be renamed/moved because some
+ * ancestor (immediate or not) has to be renamed/moved first. So find out if
+ * such an ancestor exists and make sure our own rename/move happens after
+ * that ancestor is processed.
+ */
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = 1;
+ break;
+ }
+
+ fs_path_reset(path_before);
+ fs_path_reset(path_after);
+
+ ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+ NULL, path_after);
+ if (ret < 0)
+ goto out;
+ ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+ NULL, path_before);
+ if (ret < 0 && ret != -ENOENT) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ ret = 1;
+ break;
+ }
+
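+ /*
+ * If this ancestor was not processed yet and its name or location
+ * differs between the parent and send snapshots, it will be moved
+ * later, so our own rename has to wait for it.
+ */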
+ len1 = fs_path_len(path_before);
+ len2 = fs_path_len(path_after);
+ if (ino > sctx->cur_ino &&
+ (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1))) {
+ ret = 1;
+ break;
+ }
+ ino = parent_ino_after;
+ }
+
+out:
+ fs_path_free(path_before);
+ fs_path_free(path_after);
+
+ if (ret == 1) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ ino,
+ &sctx->new_refs,
+ &sctx->deleted_refs);
+ if (!ret)
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/*
+ * This does all the move/link/unlink/rmdir magic.
+ */
+static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
+{
+ int ret = 0;
+ struct recorded_ref *cur;
+ struct recorded_ref *cur2;
+ struct list_head check_dirs;
+ struct fs_path *valid_path = NULL;
+ u64 ow_inode = 0;
+ u64 ow_gen;
+ int did_overwrite = 0;
+ int is_orphan = 0;
+ u64 last_dir_ino_rm = 0;
+
+verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
+
+ /*
+ * This should never happen as the root dir always has the same ref
+ * which is always '..'
+ */
+ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+ INIT_LIST_HEAD(&check_dirs);
+
+ valid_path = fs_path_alloc();
+ if (!valid_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * First, check if the first ref of the current inode was overwritten
+ * before. If yes, we know that the current inode was already orphanized
+ * and thus use the orphan name. If not, we can use get_cur_path to
+ * get the path of the first ref as it would look like while receiving at
+ * this point in time.
+ * New inodes are always orphan at the beginning, so force to use the
+ * orphan name in this case.
+ * The first ref is stored in valid_path and will be updated if it
+ * gets moved around.
+ */
+ if (!sctx->cur_inode_new) {
+ ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ did_overwrite = 1;
+ }
+ if (sctx->cur_inode_new || did_overwrite) {
+ ret = gen_unique_name(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen, valid_path);
+ if (ret < 0)
+ goto out;
+ is_orphan = 1;
+ } else {
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ valid_path);
+ if (ret < 0)
+ goto out;
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directory's inum is higher than
+ * the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inodes refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ /*
+ * Check if this new ref would overwrite the first ref of
+ * another unprocessed inode. If yes, orphanize the
+ * overwritten inode. If we find an overwritten ref that is
+ * not the first ref, simply unlink it.
+ */
+ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+ cur->name, cur->name_len,
+ &ow_inode, &ow_gen);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = is_first_ref(sctx->parent_root,
+ ow_inode, cur->dir, cur->name,
+ cur->name_len);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = orphanize_inode(sctx, ow_inode, ow_gen,
+ cur->full_path);
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = send_unlink(sctx, cur->full_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ /*
+ * link/move the ref to the new place. If we have an orphan
+ * inode, move it and update valid_path. If not, link or move
+ * it depending on the inode mode.
+ */
+ if (is_orphan) {
+ ret = send_rename(sctx, valid_path, cur->full_path);
+ if (ret < 0)
+ goto out;
+ is_orphan = 0;
+ ret = fs_path_copy(valid_path, cur->full_path);
+ if (ret < 0)
+ goto out;
+ } else {
+ if (S_ISDIR(sctx->cur_inode_mode)) {
+ /*
+ * Dirs can't be linked, so move it. For moved
+ * dirs, we always have one new and one deleted
+ * ref. The deleted ref is ignored later.
+ */
+ ret = wait_for_parent_move(sctx, cur);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ *pending_move = 1;
+ } else {
+ ret = send_rename(sctx, valid_path,
+ cur->full_path);
+ if (!ret)
+ ret = fs_path_copy(valid_path,
+ cur->full_path);
+ }
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = send_link(sctx, cur->full_path,
+ valid_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
+ /*
+ * Check if we can already rmdir the directory. If not,
+ * orphanize it. For every dir item inside that gets deleted
+ * later, we do this check again and rmdir it then if possible.
+ * See the use of check_dirs for more details.
+ */
+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = send_rmdir(sctx, valid_path);
+ if (ret < 0)
+ goto out;
+ } else if (!is_orphan) {
+ ret = orphanize_inode(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen, valid_path);
+ if (ret < 0)
+ goto out;
+ is_orphan = 1;
+ }
+
+ list_for_each_entry(cur, &sctx->deleted_refs, list) {
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ }
+ } else if (S_ISDIR(sctx->cur_inode_mode) &&
+ !list_empty(&sctx->deleted_refs)) {
+ /*
+ * We have a moved dir. Add the old parent to check_dirs
+ */
+ cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+ list);
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ } else if (!S_ISDIR(sctx->cur_inode_mode)) {
+ /*
+ * We have a non dir inode. Go through all deleted refs and
+ * unlink them if they were not already overwritten by other
+ * inodes.
+ */
+ list_for_each_entry(cur, &sctx->deleted_refs, list) {
+ ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino, sctx->cur_inode_gen,
+ cur->name, cur->name_len);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_unlink(sctx, cur->full_path);
+ if (ret < 0)
+ goto out;
+ }
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ }
+ /*
+ * If the inode is still an orphan, unlink it. This may
+ * happen when a previous inode overwrote the first ref
+ * of this inode and no new refs were added for the current
+ * inode. Unlinking does not mean that the inode is deleted in
+ * all cases. There may still be links to this inode in other
+ * places.
+ */
+ if (is_orphan) {
+ ret = send_unlink(sctx, valid_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ /*
+ * We did collect all parent dirs where cur_inode was once located. We
+ * now go through all these dirs and check if they are pending for
+ * deletion and if it's finally possible to perform the rmdir now.
+ * We also update the inode stats of the parent dirs here.
+ */
+ list_for_each_entry(cur, &check_dirs, list) {
+ /*
+ * In case we had refs into dirs that were not processed yet,
+ * we don't need to do the utime and rmdir logic for these dirs.
+ * The dir will be processed later.
+ */
+ if (cur->dir > sctx->cur_ino)
+ continue;
+
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+
+ if (ret == inode_state_did_create ||
+ ret == inode_state_no_change) {
+ /* TODO delayed utimes */
+ ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ } else if (ret == inode_state_did_delete &&
+ cur->dir != last_dir_ino_rm) {
+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = get_cur_path(sctx, cur->dir,
+ cur->dir_gen, valid_path);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, valid_path);
+ if (ret < 0)
+ goto out;
+ last_dir_ino_rm = cur->dir;
+ }
+ }
+ }
+
+ ret = 0;
+
+out:
+ __free_recorded_refs(&check_dirs);
+ free_recorded_refs(sctx);
+ fs_path_free(valid_path);
+ return ret;
+}
+
+static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
+ struct fs_path *name, void *ctx, struct list_head *refs)
+{
+ int ret = 0;
+ struct send_ctx *sctx = ctx;
+ struct fs_path *p;
+ u64 gen;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
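+ /* Build the full path of the ref: the parent dir's current path plus the ref name. */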
+ ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
+ NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, dir, gen, p);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(p, name);
+ if (ret < 0)
+ goto out;
+
+ ret = __record_ref(refs, dir, gen, p);
+
+out:
+ if (ret)
+ fs_path_free(p);
+ return ret;
+}
+
+static int __record_new_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ struct send_ctx *sctx = ctx;
+ return record_ref(sctx->send_root, num, dir, index, name,
+ ctx, &sctx->new_refs);
+}
+
+
+static int __record_deleted_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ struct send_ctx *sctx = ctx;
+ return record_ref(sctx->parent_root, num, dir, index, name,
+ ctx, &sctx->deleted_refs);
+}
+
+static int record_new_ref(struct send_ctx *sctx)
+{
+ int ret;
+
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, __record_new_ref, sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int record_deleted_ref(struct send_ctx *sctx)
+{
+ int ret;
+
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, __record_deleted_ref, sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+struct find_ref_ctx {
+ u64 dir;
+ u64 dir_gen;
+ struct btrfs_root *root;
+ struct fs_path *name;
+ int found_idx;
+};
+
+static int __find_iref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx_)
+{
+ struct find_ref_ctx *ctx = ctx_;
+ u64 dir_gen;
+ int ret;
+
+ if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
+ strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
+ /*
+ * To avoid doing extra lookups we'll only do this if everything
+ * else matches.
+ */
+ ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret)
+ return ret;
+ if (dir_gen != ctx->dir_gen)
+ return 0;
+ ctx->found_idx = num;
+ return 1;
+ }
+ return 0;
+}
+
+static int find_iref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 dir, u64 dir_gen, struct fs_path *name)
+{
+ int ret;
+ struct find_ref_ctx ctx;
+
+ ctx.dir = dir;
+ ctx.name = name;
+ ctx.dir_gen = dir_gen;
+ ctx.found_idx = -1;
+ ctx.root = root;
+
+ ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
+ if (ret < 0)
+ return ret;
+
+ if (ctx.found_idx == -1)
+ return -ENOENT;
+
+ return ctx.found_idx;
+}
+
+static int __record_changed_new_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ u64 dir_gen;
+ int ret;
+ struct send_ctx *sctx = ctx;
+
+ ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret)
+ return ret;
+
+ ret = find_iref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, dir, dir_gen, name);
+ if (ret == -ENOENT)
+ ret = __record_new_ref(num, dir, index, name, sctx);
+ else if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
+static int __record_changed_deleted_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ u64 dir_gen;
+ int ret;
+ struct send_ctx *sctx = ctx;
+
+ ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret)
+ return ret;
+
+ ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ dir, dir_gen, name);
+ if (ret == -ENOENT)
+ ret = __record_deleted_ref(num, dir, index, name, sctx);
+ else if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
+static int record_changed_ref(struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, __record_changed_new_ref, sctx);
+ if (ret < 0)
+ goto out;
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * Record and process all refs at once. Needed when an inode changes the
+ * generation number, which means that it was deleted and recreated.
+ */
+static int process_all_refs(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result cmd)
+{
+ int ret;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ int slot;
+ iterate_inode_ref_t cb;
+ int pending_move = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ if (cmd == BTRFS_COMPARE_TREE_NEW) {
+ root = sctx->send_root;
+ cb = __record_new_ref;
+ } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
+ root = sctx->parent_root;
+ cb = __record_deleted_ref;
+ } else {
+ btrfs_err(sctx->send_root->fs_info,
+ "Wrong command %d in process_all_refs", cmd);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ key.objectid = sctx->cmp_key->objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
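+ /* Walk all INODE_REF and INODE_EXTREF items of this inode and record every ref. */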
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+ if (found_key.objectid != key.objectid ||
+ (found_key.type != BTRFS_INODE_REF_KEY &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ break;
+
+ ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
+ if (ret < 0)
+ goto out;
+
+ path->slots[0]++;
+ }
+ btrfs_release_path(path);
+
+ ret = process_recorded_refs(sctx, &pending_move);
+ /* Only applicable to an incremental send. */
+ ASSERT(pending_move == 0);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int send_set_xattr(struct send_ctx *sctx,
+ struct fs_path *path,
+ const char *name, int name_len,
+ const char *data, int data_len)
+{
+ int ret = 0;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+ TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int send_remove_xattr(struct send_ctx *sctx,
+ struct fs_path *path,
+ const char *name, int name_len)
+{
+ int ret = 0;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int __process_new_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+ struct fs_path *p;
+ posix_acl_xattr_header dummy_acl;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ /*
+ * This hack is needed because empty acls are stored as zero byte
+ * data in xattrs. The problem is that receiving these zero byte
+ * acls will fail later. To fix this, we send a dummy acl list that
+ * only contains the version number and no entries.
+ */
+ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
+ !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
+ if (data_len == 0) {
+ dummy_acl.a_version =
+ cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+ data = (char *)&dummy_acl;
+ data_len = sizeof(dummy_acl);
+ }
+ }
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
+
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ ret = send_remove_xattr(sctx, p, name, name_len);
+
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int process_new_xattr(struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, __process_new_xattr, sctx);
+
+ return ret;
+}
+
+static int process_deleted_xattr(struct send_ctx *sctx)
+{
+ int ret;
+
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, __process_deleted_xattr, sctx);
+
+ return ret;
+}
+
+struct find_xattr_ctx {
+ const char *name;
+ int name_len;
+ int found_idx;
+ char *found_data;
+ int found_data_len;
+};
+
+static int __find_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *vctx)
+{
+ struct find_xattr_ctx *ctx = vctx;
+
+ if (name_len == ctx->name_len &&
+ strncmp(name, ctx->name, name_len) == 0) {
+ ctx->found_idx = num;
+ ctx->found_data_len = data_len;
+ ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
+ if (!ctx->found_data)
+ return -ENOMEM;
+ return 1;
+ }
+ return 0;
+}
+
+static int find_xattr(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ const char *name, int name_len,
+ char **data, int *data_len)
+{
+ int ret;
+ struct find_xattr_ctx ctx;
+
+ ctx.name = name;
+ ctx.name_len = name_len;
+ ctx.found_idx = -1;
+ ctx.found_data = NULL;
+ ctx.found_data_len = 0;
+
+ ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
+ if (ret < 0)
+ return ret;
+
+ if (ctx.found_idx == -1)
+ return -ENOENT;
+ if (data) {
+ *data = ctx.found_data;
+ *data_len = ctx.found_data_len;
+ } else {
+ kfree(ctx.found_data);
+ }
+ return ctx.found_idx;
+}
+
+
+static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+ char *found_data = NULL;
+ int found_data_len = 0;
+
+ ret = find_xattr(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, name, name_len, &found_data,
+ &found_data_len);
+ if (ret == -ENOENT) {
+ ret = __process_new_xattr(num, di_key, name, name_len, data,
+ data_len, type, ctx);
+ } else if (ret >= 0) {
+ if (data_len != found_data_len ||
+ memcmp(data, found_data, data_len)) {
+ ret = __process_new_xattr(num, di_key, name, name_len,
+ data, data_len, type, ctx);
+ } else {
+ ret = 0;
+ }
+ }
+
+ kfree(found_data);
+ return ret;
+}
+
+static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+
+ ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ name, name_len, NULL, NULL);
+ if (ret == -ENOENT)
+ ret = __process_deleted_xattr(num, di_key, name, name_len, data,
+ data_len, type, ctx);
+ else if (ret >= 0)
+ ret = 0;
+
+ return ret;
+}
+
+static int process_changed_xattr(struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, __process_changed_new_xattr, sctx);
+ if (ret < 0)
+ goto out;
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, __process_changed_deleted_xattr, sctx);
+
+out:
+ return ret;
+}
+
+static int process_all_new_xattrs(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ int slot;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ root = sctx->send_root;
+
+ key.objectid = sctx->cmp_key->objectid;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = iterate_dir_item(root, path, &found_key,
+ __process_new_xattr, sctx);
+ if (ret < 0)
+ goto out;
+
+ path->slots[0]++;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct page *page;
+ char *addr;
+ struct btrfs_key key;
+ pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t last_index;
+ unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+ ssize_t ret = 0;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (offset + len > i_size_read(inode)) {
+ if (offset > i_size_read(inode))
+ len = 0;
+ else
+ len = i_size_read(inode) - offset;
+ }
+ if (len == 0)
+ goto out;
+
+ last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+
+ /* initial readahead */
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, inode->i_mapping);
+ btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
+ last_index - index + 1);
+
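+ /*
+ * Copy the requested range page by page into sctx->read_buf, reading
+ * in any page that is not yet up to date.
+ */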
+ while (index <= last_index) {
+ unsigned cur_len = min_t(unsigned, len,
+ PAGE_CACHE_SIZE - pg_offset);
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ break;
+ }
+ }
+
+ addr = kmap(page);
+ memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ kunmap(page);
+ unlock_page(page);
+ page_cache_release(page);
+ index++;
+ pg_offset = 0;
+ len -= cur_len;
+ ret += cur_len;
+ }
+out:
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Read some bytes from the current inode/file and send a write command to
+ * user space.
+ */
+static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
+{
+ int ret = 0;
+ struct fs_path *p;
+ ssize_t num_read = 0;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
+
+ num_read = fill_read_buf(sctx, offset, len);
+ if (num_read <= 0) {
+ if (num_read < 0)
+ ret = num_read;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ if (ret < 0)
+ return ret;
+ return num_read;
+}
+
+/*
+ * Send a clone command to user space.
+ */
+static int send_clone(struct send_ctx *sctx,
+ u64 offset, u32 len,
+ struct clone_root *clone_root)
+{
+ int ret = 0;
+ struct fs_path *p;
+ u64 gen;
+
+verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
+ "clone_inode=%llu, clone_offset=%llu\n", offset, len,
+ clone_root->root->objectid, clone_root->ino,
+ clone_root->offset);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+
+ if (clone_root->root == sctx->send_root) {
+ ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
+ &gen, NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ ret = get_cur_path(sctx, clone_root->ino, gen, p);
+ } else {
+ ret = get_inode_path(clone_root->root, clone_root->ino, p);
+ }
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.uuid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+ le64_to_cpu(clone_root->root->root_item.ctransid));
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
+ clone_root->offset);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+/*
+ * Send an update extent command to user space.
+ */
+static int send_update_extent(struct send_ctx *sctx,
+ u64 offset, u32 len)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_hole(struct send_ctx *sctx, u64 end)
+{
+ struct fs_path *p = NULL;
+ u64 offset = sctx->cur_inode_last_extent;
+ u64 len;
+ int ret = 0;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto tlv_put_failure;
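+ /*
+ * Represent the hole as plain write commands of zeroed data, in chunks
+ * of at most BTRFS_SEND_READ_SIZE bytes.
+ */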
+ memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
+ while (offset < end) {
+ len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+ if (ret < 0)
+ break;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ break;
+ offset += len;
+ }
+tlv_put_failure:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_write_or_clone(struct send_ctx *sctx,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct clone_root *clone_root)
+{
+ int ret = 0;
+ struct btrfs_file_extent_item *ei;
+ u64 offset = key->offset;
+ u64 pos = 0;
+ u64 len;
+ u32 l;
+ u8 type;
+ u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], ei);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(path->nodes[0],
+ path->slots[0], ei);
+ /*
+ * It is possible the inline item won't cover the whole page,
+ * but there may be items after this page. Make
+ * sure to send the whole thing.
+ */
+ len = PAGE_CACHE_ALIGN(len);
+ } else {
+ len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+ }
+
+ if (offset + len > sctx->cur_inode_size)
+ len = sctx->cur_inode_size - offset;
+ if (len == 0) {
+ ret = 0;
+ goto out;
+ }
+
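+ /*
+ * Prefer a clone when the end of the range is block aligned; with
+ * BTRFS_SEND_FLAG_NO_FILE_DATA only send extent metadata; otherwise
+ * fall back to chunked write commands.
+ */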
+ if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ ret = send_clone(sctx, offset, len, clone_root);
+ } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
+ ret = send_update_extent(sctx, offset, len);
+ } else {
+ while (pos < len) {
+ l = len - pos;
+ if (l > BTRFS_SEND_READ_SIZE)
+ l = BTRFS_SEND_READ_SIZE;
+ ret = send_write(sctx, pos + offset, l);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ break;
+ pos += ret;
+ }
+ ret = 0;
+ }
+out:
+ return ret;
+}
+
+static int is_extent_unchanged(struct send_ctx *sctx,
+ struct btrfs_path *left_path,
+ struct btrfs_key *ekey)
+{
+ int ret = 0;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *eb;
+ int slot;
+ struct btrfs_key found_key;
+ struct btrfs_file_extent_item *ei;
+ u64 left_disknr;
+ u64 right_disknr;
+ u64 left_offset;
+ u64 right_offset;
+ u64 left_offset_fixed;
+ u64 left_len;
+ u64 right_len;
+ u64 left_gen;
+ u64 right_gen;
+ u8 left_type;
+ u8 right_type;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ eb = left_path->nodes[0];
+ slot = left_path->slots[0];
+ ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ left_type = btrfs_file_extent_type(eb, ei);
+
+ if (left_type != BTRFS_FILE_EXTENT_REG) {
+ ret = 0;
+ goto out;
+ }
+ left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+ left_len = btrfs_file_extent_num_bytes(eb, ei);
+ left_offset = btrfs_file_extent_offset(eb, ei);
+ left_gen = btrfs_file_extent_generation(eb, ei);
+
+ /*
+ * Following comments will refer to these graphics. L is the left
+ * extents which we are checking at the moment. 1-8 are the right
+ * extents that we iterate.
+ *
+ * |-----L-----|
+ * |-1-|-2a-|-3-|-4-|-5-|-6-|
+ *
+ * |-----L-----|
+ * |--1--|-2b-|...(same as above)
+ *
+ * Alternative situation. Happens on files where extents got split.
+ * |-----L-----|
+ * |-----------7-----------|-6-|
+ *
+ * Alternative situation. Happens on files which got larger.
+ * |-----L-----|
+ * |-8-|
+ * Nothing follows after 8.
+ */
+
+ key.objectid = ekey->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ekey->offset;
+ ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Handle special case where the right side has no extents at all.
+ */
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ /* If we're a hole then just pretend nothing changed */
+ ret = (left_disknr) ? 0 : 1;
+ goto out;
+ }
+
+ /*
+ * We're now on 2a, 2b or 7.
+ */
+ key = found_key;
+ while (key.offset < ekey->offset + left_len) {
+ ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ right_type = btrfs_file_extent_type(eb, ei);
+ if (right_type != BTRFS_FILE_EXTENT_REG) {
+ ret = 0;
+ goto out;
+ }
+
+ right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+ right_len = btrfs_file_extent_num_bytes(eb, ei);
+ right_offset = btrfs_file_extent_offset(eb, ei);
+ right_gen = btrfs_file_extent_generation(eb, ei);
+
+ /*
+ * Are we at extent 8? If yes, we know the extent is changed.
+ * This may only happen on the first iteration.
+ */
+ if (found_key.offset + right_len <= ekey->offset) {
+ /* If we're a hole just pretend nothing changed */
+ ret = (left_disknr) ? 0 : 1;
+ goto out;
+ }
+
+ left_offset_fixed = left_offset;
+ if (key.offset < ekey->offset) {
+ /* Fix the right offset for 2a and 7. */
+ right_offset += ekey->offset - key.offset;
+ } else {
+ /* Fix the left offset for all behind 2a and 2b */
+ left_offset_fixed += key.offset - ekey->offset;
+ }
+
+ /*
+ * Check if we have the same extent.
+ */
+ if (left_disknr != right_disknr ||
+ left_offset_fixed != right_offset ||
+ left_gen != right_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Go to the next extent.
+ */
+ ret = btrfs_next_item(sctx->parent_root, path);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ }
+ if (ret || found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ key.offset += right_len;
+ break;
+ }
+ if (found_key.offset != key.offset + right_len) {
+ ret = 0;
+ goto out;
+ }
+ key = found_key;
+ }
+
+ /*
+ * We're now behind the left extent (treat as unchanged) or at the end
+ * of the right side (treat as changed).
+ */
+ if (key.offset >= ekey->offset + left_len)
+ ret = 1;
+ else
+ ret = 0;
+
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int get_last_extent(struct send_ctx *sctx, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 extent_end;
+ u8 type;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
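+ /* Find the file extent item at or before @offset and remember where it ends. */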
+ sctx->cur_inode_last_extent = 0;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+ ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ goto out;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+ path->slots[0], fi);
+ extent_end = ALIGN(key.offset + size,
+ sctx->send_root->sectorsize);
+ } else {
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(path->nodes[0], fi);
+ }
+ sctx->cur_inode_last_extent = extent_end;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_file_extent_item *fi;
+ u64 extent_end;
+ u8 type;
+ int ret = 0;
+
+ if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
+ return 0;
+
+ if (sctx->cur_inode_last_extent == (u64)-1) {
+ ret = get_last_extent(sctx, key->offset - 1);
+ if (ret)
+ return ret;
+ }
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+ path->slots[0], fi);
+ extent_end = ALIGN(key->offset + size,
+ sctx->send_root->sectorsize);
+ } else {
+ extent_end = key->offset +
+ btrfs_file_extent_num_bytes(path->nodes[0], fi);
+ }
+
+ if (path->slots[0] == 0 &&
+ sctx->cur_inode_last_extent < key->offset) {
+ /*
+ * We might have skipped entire leafs that contained only
+ * file extent items for our current inode. These leafs have
+ * a generation number smaller (older) than the one in the
+ * current leaf and the leaf our last extent came from, and
+ * are located between these 2 leafs.
+ */
+ ret = get_last_extent(sctx, key->offset - 1);
+ if (ret)
+ return ret;
+ }
+
+ if (sctx->cur_inode_last_extent < key->offset)
+ ret = send_hole(sctx, key->offset);
+ sctx->cur_inode_last_extent = extent_end;
+ return ret;
+}
+
+static int process_extent(struct send_ctx *sctx,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct clone_root *found_clone = NULL;
+ int ret = 0;
+
+ if (S_ISLNK(sctx->cur_inode_mode))
+ return 0;
+
+ if (sctx->parent_root && !sctx->cur_inode_new) {
+ ret = is_extent_unchanged(sctx, path, key);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out_hole;
+ }
+ } else {
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], ei);
+ if (type == BTRFS_FILE_EXTENT_PREALLOC ||
+ type == BTRFS_FILE_EXTENT_REG) {
+ /*
+ * The send spec does not have a prealloc command yet,
+ * so just leave a hole for prealloc'ed extents until
+ * we have enough commands queued up to justify rev'ing
+ * the send spec.
+ */
+ if (type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Have a hole, just skip it. */
+ if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
+ ret = 0;
+ goto out;
+ }
+ }
+ }
+
+ ret = find_extent_clone(sctx, path, key->objectid, key->offset,
+ sctx->cur_inode_size, &found_clone);
+ if (ret != -ENOENT && ret < 0)
+ goto out;
+
+ ret = send_write_or_clone(sctx, path, key, found_clone);
+ if (ret)
+ goto out;
+out_hole:
+ ret = maybe_send_hole(sctx, path, key);
+out:
+ return ret;
+}
+
+static int process_all_extents(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ int slot;
+
+ root = sctx->send_root;
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = sctx->cmp_key->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = process_extent(sctx, path, &found_key);
+ if (ret < 0)
+ goto out;
+
+ path->slots[0]++;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+ int *pending_move,
+ int *refs_processed)
+{
+ int ret = 0;
+
+ if (sctx->cur_ino == 0)
+ goto out;
+ if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
+ sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
+ goto out;
+ if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
+ goto out;
+
+ ret = process_recorded_refs(sctx, pending_move);
+ if (ret < 0)
+ goto out;
+
+ *refs_processed = 1;
+out:
+ return ret;
+}
+
+static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
+{
+ int ret = 0;
+ u64 left_mode;
+ u64 left_uid;
+ u64 left_gid;
+ u64 right_mode;
+ u64 right_uid;
+ u64 right_gid;
+ int need_chmod = 0;
+ int need_chown = 0;
+ int pending_move = 0;
+ int refs_processed = 0;
+
+ ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
+ &refs_processed);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We have processed the refs and thus need to advance send_progress.
+ * Now, calls to get_cur_xxx will take the updated refs of the current
+ * inode into account.
+ *
+ * On the other hand, if our current inode is a directory and couldn't
+ * be moved/renamed because its parent was renamed/moved too and it has
+ * a higher inode number, we can only move/rename our current inode
+ * after we moved/renamed its parent. Therefore in this case operate on
+ * the old path (pre move/rename) of our current inode, and the
+ * move/rename will be performed later.
+ */
+ if (refs_processed && !pending_move)
+ sctx->send_progress = sctx->cur_ino + 1;
+
+ if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
+ goto out;
+ if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
+ goto out;
+
+ ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
+ &left_mode, &left_uid, &left_gid, NULL);
+ if (ret < 0)
+ goto out;
+
+ if (!sctx->parent_root || sctx->cur_inode_new) {
+ need_chown = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode))
+ need_chmod = 1;
+ } else {
+ ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
+ NULL, NULL, &right_mode, &right_uid,
+ &right_gid, NULL);
+ if (ret < 0)
+ goto out;
+
+ if (left_uid != right_uid || left_gid != right_gid)
+ need_chown = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
+ need_chmod = 1;
+ }
+
+ if (S_ISREG(sctx->cur_inode_mode)) {
+ if (need_send_hole(sctx)) {
+ if (sctx->cur_inode_last_extent == (u64)-1 ||
+ sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
+ ret = get_last_extent(sctx, (u64)-1);
+ if (ret)
+ goto out;
+ }
+ if (sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
+ ret = send_hole(sctx, sctx->cur_inode_size);
+ if (ret)
+ goto out;
+ }
+ }
+ ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (need_chown) {
+ ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_uid, left_gid);
+ if (ret < 0)
+ goto out;
+ }
+ if (need_chmod) {
+ ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_mode);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * If other directory inodes depended on our current directory
+ * inode's move/rename, now do their move/rename operations.
+ */
+ if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
+ ret = apply_children_dir_moves(sctx);
+ if (ret)
+ goto out;
+ /*
+ * Need to send the utimes every time, no matter whether anything
+ * actually changed between the two trees, as we have made changes
+ * to the inode before. If our inode is a directory and it's
+ * waiting to be moved/renamed, we will send its utimes when
+ * it's moved/renamed, therefore we don't need to do it here.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+static int changed_inode(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+ struct btrfs_key *key = sctx->cmp_key;
+ struct btrfs_inode_item *left_ii = NULL;
+ struct btrfs_inode_item *right_ii = NULL;
+ u64 left_gen = 0;
+ u64 right_gen = 0;
+
+ sctx->cur_ino = key->objectid;
+ sctx->cur_inode_new_gen = 0;
+ sctx->cur_inode_last_extent = (u64)-1;
+
+ /*
+ * Set send_progress to current inode. This will tell all get_cur_xxx
+ * functions that the current inode's refs are not updated yet. Later,
+ * when process_recorded_refs is finished, it is set to cur_ino + 1.
+ */
+ sctx->send_progress = sctx->cur_ino;
+
+ if (result == BTRFS_COMPARE_TREE_NEW ||
+ result == BTRFS_COMPARE_TREE_CHANGED) {
+ left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
+ sctx->left_path->slots[0],
+ struct btrfs_inode_item);
+ left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
+ left_ii);
+ } else {
+ right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+ sctx->right_path->slots[0],
+ struct btrfs_inode_item);
+ right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+ right_ii);
+ }
+ if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+ sctx->right_path->slots[0],
+ struct btrfs_inode_item);
+
+ right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+ right_ii);
+
+ /*
+ * The cur_ino = root dir case is special here. We can't treat
+ * the inode as deleted+reused because it would generate a
+ * stream that tries to delete/mkdir the root dir.
+ */
+ if (left_gen != right_gen &&
+ sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
+ sctx->cur_inode_new_gen = 1;
+ }
+
+ if (result == BTRFS_COMPARE_TREE_NEW) {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = 1;
+ sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
+ if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
+ ret = send_create_inode_if_needed(sctx);
+ } else if (result == BTRFS_COMPARE_TREE_DELETED) {
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = 0;
+ sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ /*
+ * We need to do some special handling in case the inode was
+ * reported as changed with a changed generation number. This
+ * means that the original inode was deleted and new inode
+ * reused the same inum. So we have to treat the old inode as
+ * deleted and the new one as new.
+ */
+ if (sctx->cur_inode_new_gen) {
+ /*
+ * First, process the inode as if it was deleted.
+ */
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = 0;
+ sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ ret = process_all_refs(sctx,
+ BTRFS_COMPARE_TREE_DELETED);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Now process the inode as if it was new.
+ */
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = 1;
+ sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
+ ret = send_create_inode_if_needed(sctx);
+ if (ret < 0)
+ goto out;
+
+ ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+ if (ret < 0)
+ goto out;
+ /*
+ * Advance send_progress now as we did not get into
+ * process_recorded_refs_if_needed in the new_gen case.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+
+ /*
+ * Now process all extents and xattrs of the inode as if
+ * they were all new.
+ */
+ ret = process_all_extents(sctx);
+ if (ret < 0)
+ goto out;
+ ret = process_all_new_xattrs(sctx);
+ if (ret < 0)
+ goto out;
+ } else {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = 0;
+ sctx->cur_inode_new_gen = 0;
+ sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0], left_ii);
+ }
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * We have to process new refs before deleted refs, but compare_trees gives us
+ * the new and deleted refs mixed. To fix this, we record the new/deleted refs
+ * first and later process them in process_recorded_refs.
+ * For the cur_inode_new_gen case, we skip recording completely because
+ * changed_inode already initiated processing of refs. The reason for this is
+ * that in this case, compare_tree actually compares the refs of 2 different
+ * inodes. To fix this, process_all_refs is used in changed_inode to handle all
+ * refs of the right tree as deleted and all refs of the left tree as new.
+ */
+static int changed_ref(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+ if (!sctx->cur_inode_new_gen &&
+ sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ ret = record_new_ref(sctx);
+ else if (result == BTRFS_COMPARE_TREE_DELETED)
+ ret = record_deleted_ref(sctx);
+ else if (result == BTRFS_COMPARE_TREE_CHANGED)
+ ret = record_changed_ref(sctx);
+ }
+
+ return ret;
+}
+
+/*
+ * Process new/deleted/changed xattrs. We skip processing in the
+ * cur_inode_new_gen case because changed_inode already initiated processing
+ * of xattrs. The reason is the same as in changed_ref.
+ */
+static int changed_xattr(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ ret = process_new_xattr(sctx);
+ else if (result == BTRFS_COMPARE_TREE_DELETED)
+ ret = process_deleted_xattr(sctx);
+ else if (result == BTRFS_COMPARE_TREE_CHANGED)
+ ret = process_changed_xattr(sctx);
+ }
+
+ return ret;
+}
+
+/*
+ * Process new/deleted/changed extents. We skip processing in the
+ * cur_inode_new_gen case because changed_inode already initiated processing
+ * of extents. The reason is the same as in changed_ref.
+ */
+static int changed_extent(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result != BTRFS_COMPARE_TREE_DELETED)
+ ret = process_extent(sctx, sctx->left_path,
+ sctx->cmp_key);
+ }
+
+ return ret;
+}
+
+static int dir_changed(struct send_ctx *sctx, u64 dir)
+{
+ u64 orig_gen, new_gen;
+ int ret;
+
+ ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
+ NULL, NULL);
+ if (ret)
+ return ret;
+
+ ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret)
+ return ret;
+
+ return (orig_gen != new_gen) ? 1 : 0;
+}
+
+static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ u64 dirid = 0, last_dirid = 0;
+ unsigned long ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int ref_name_len;
+ int ret = 0;
+
+ /* Easy case, just check this one dirid */
+ if (key->type == BTRFS_INODE_REF_KEY) {
+ dirid = key->offset;
+
+ ret = dir_changed(sctx, dirid);
+ goto out;
+ }
+
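+ /*
+ * Otherwise walk every extref packed into this item and check whether
+ * any parent dir changed.
+ */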
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *)(ptr +
+ cur_offset);
+ dirid = btrfs_inode_extref_parent(leaf, extref);
+ ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+ cur_offset += ref_name_len + sizeof(*extref);
+ if (dirid == last_dirid)
+ continue;
+ ret = dir_changed(sctx, dirid);
+ if (ret)
+ break;
+ last_dirid = dirid;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Updates compare related fields in sctx and simply forwards to the actual
+ * changed_xxx functions.
+ */
+static int changed_cb(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ struct btrfs_key *key,
+ enum btrfs_compare_tree_result result,
+ void *ctx)
+{
+ int ret = 0;
+ struct send_ctx *sctx = ctx;
+
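+ /*
+ * Items reported as unchanged may still need work: refs are re-checked
+ * in case the parent directory's generation changed, and extent items
+ * are checked for holes that may have to be sent.
+ */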
+ if (result == BTRFS_COMPARE_TREE_SAME) {
+ if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY) {
+ ret = compare_refs(sctx, left_path, key);
+ if (!ret)
+ return 0;
+ if (ret < 0)
+ return ret;
+ } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
+ return maybe_send_hole(sctx, left_path, key);
+ } else {
+ return 0;
+ }
+ result = BTRFS_COMPARE_TREE_CHANGED;
+ ret = 0;
+ }
+
+ sctx->left_path = left_path;
+ sctx->right_path = right_path;
+ sctx->cmp_key = key;
+
+ ret = finish_inode_if_needed(sctx, 0);
+ if (ret < 0)
+ goto out;
+
+ /* Ignore non-FS objects */
+ if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
+ key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+ goto out;
+
+ if (key->type == BTRFS_INODE_ITEM_KEY)
+ ret = changed_inode(sctx, result);
+ else if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = changed_ref(sctx, result);
+ else if (key->type == BTRFS_XATTR_ITEM_KEY)
+ ret = changed_xattr(sctx, result);
+ else if (key->type == BTRFS_EXTENT_DATA_KEY)
+ ret = changed_extent(sctx, result);
+
+out:
+ return ret;
+}
+
+static int full_send_tree(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *send_root = sctx->send_root;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *eb;
+ int slot;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ goto out_finish;
+
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+ ret = changed_cb(send_root, NULL, path, NULL,
+ &found_key, BTRFS_COMPARE_TREE_NEW, sctx);
+ if (ret < 0)
+ goto out;
+
+ key.objectid = found_key.objectid;
+ key.type = found_key.type;
+ key.offset = found_key.offset + 1;
+
+ ret = btrfs_next_item(send_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ break;
+ }
+ }
+
+out_finish:
+ ret = finish_inode_if_needed(sctx, 1);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int send_subvol(struct send_ctx *sctx)
+{
+ int ret;
+
+ if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
+ ret = send_header(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = send_subvol_begin(sctx);
+ if (ret < 0)
+ goto out;
+
+ if (sctx->parent_root) {
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
+ changed_cb, sctx);
+ if (ret < 0)
+ goto out;
+ ret = finish_inode_if_needed(sctx, 1);
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = full_send_tree(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ free_recorded_refs(sctx);
+ return ret;
+}
+
+static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
+{
+ spin_lock(&root->root_item_lock);
+ root->send_in_progress--;
+ /*
+ * Not much left to do, we don't know why it's unbalanced and
+ * can't blindly reset it to 0.
+ */
+ if (root->send_in_progress < 0)
+ btrfs_err(root->fs_info,
+ "send_in_progress unbalanced %d root %llu",
+ root->send_in_progress, root->root_key.objectid);
+ spin_unlock(&root->root_item_lock);
+}
+
+long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
+{
+ int ret = 0;
+ struct btrfs_root *send_root;
+ struct btrfs_root *clone_root;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_ioctl_send_args *arg = NULL;
+ struct btrfs_key key;
+ struct send_ctx *sctx = NULL;
+ u32 i;
+ u64 *clone_sources_tmp = NULL;
+ int clone_sources_to_rollback = 0;
+ int sort_clone_roots = 0;
+ int index;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ send_root = BTRFS_I(file_inode(mnt_file))->root;
+ fs_info = send_root->fs_info;
+
+ /*
+ * The subvolume must remain read-only during send, protect against
+ * making it RW. This also protects against deletion.
+ */
+ spin_lock(&send_root->root_item_lock);
+ send_root->send_in_progress++;
+ spin_unlock(&send_root->root_item_lock);
+
+ /*
+ * This is done when we lookup the root, it should already be complete
+ * by the time we get here.
+ */
+ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
+
+ /*
+ * Userspace tools do the checks and warn the user if it's
+ * not RO.
+ */
+ if (!btrfs_root_readonly(send_root)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ arg = memdup_user(arg_, sizeof(*arg));
+ if (IS_ERR(arg)) {
+ ret = PTR_ERR(arg);
+ arg = NULL;
+ goto out;
+ }
+
+ if (!access_ok(VERIFY_READ, arg->clone_sources,
+ sizeof(*arg->clone_sources) *
+ arg->clone_sources_count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
+ if (!sctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&sctx->new_refs);
+ INIT_LIST_HEAD(&sctx->deleted_refs);
+ INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
+ INIT_LIST_HEAD(&sctx->name_cache_list);
+
+ sctx->flags = arg->flags;
+
+ sctx->send_filp = fget(arg->send_fd);
+ if (!sctx->send_filp) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ sctx->send_root = send_root;
+ /*
+ * Unlikely but possible: if the subvolume is marked for deletion but
+ * is slow to remove the directory entry, send can still be started.
+ */
+ if (btrfs_root_dead(sctx->send_root)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ sctx->clone_roots_cnt = arg->clone_sources_count;
+
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
+ sctx->send_buf = vmalloc(sctx->send_max_size);
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+ if (!sctx->read_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ sctx->pending_dir_moves = RB_ROOT;
+ sctx->waiting_dir_moves = RB_ROOT;
+ sctx->orphan_dirs = RB_ROOT;
+
+ sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
+ (arg->clone_sources_count + 1));
+ if (!sctx->clone_roots) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (arg->clone_sources_count) {
+ clone_sources_tmp = vmalloc(arg->clone_sources_count *
+ sizeof(*arg->clone_sources));
+ if (!clone_sources_tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
+ arg->clone_sources_count *
+ sizeof(*arg->clone_sources));
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
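+ /*
+ * Look up each clone source root, mark it as having a send in
+ * progress and require it to be read-only.
+ */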
+ for (i = 0; i < arg->clone_sources_count; i++) {
+ key.objectid = clone_sources_tmp[i];
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(clone_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = PTR_ERR(clone_root);
+ goto out;
+ }
+ clone_sources_to_rollback = i + 1;
+ spin_lock(&clone_root->root_item_lock);
+ clone_root->send_in_progress++;
+ if (!btrfs_root_readonly(clone_root)) {
+ spin_unlock(&clone_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -EPERM;
+ goto out;
+ }
+ spin_unlock(&clone_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ sctx->clone_roots[i].root = clone_root;
+ }
+ vfree(clone_sources_tmp);
+ clone_sources_tmp = NULL;
+ }
+
+ if (arg->parent_root) {
+ key.objectid = arg->parent_root;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(sctx->parent_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = PTR_ERR(sctx->parent_root);
+ goto out;
+ }
+
+ spin_lock(&sctx->parent_root->root_item_lock);
+ sctx->parent_root->send_in_progress++;
+ if (!btrfs_root_readonly(sctx->parent_root) ||
+ btrfs_root_dead(sctx->parent_root)) {
+ spin_unlock(&sctx->parent_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -EPERM;
+ goto out;
+ }
+ spin_unlock(&sctx->parent_root->root_item_lock);
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ }
+
+ /*
+ * Clones from send_root are allowed, but only if the clone source
+ * is behind the current send position. This is checked while searching
+ * for possible clone sources.
+ */
+ sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
+
+ /* We do a bsearch later */
+ sort(sctx->clone_roots, sctx->clone_roots_cnt,
+ sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
+ NULL);
+ sort_clone_roots = 1;
+
+ current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
+ ret = send_subvol(sctx);
+ current->journal_info = NULL;
+ if (ret < 0)
+ goto out;
+
+ if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_END);
+ if (ret < 0)
+ goto out;
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
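+ /*
+ * On a clean run all pending/waiting dir moves and orphan dirs must be
+ * gone already; drain and free whatever is left.
+ */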
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
+ struct rb_node *n;
+ struct pending_dir_move *pm;
+
+ n = rb_first(&sctx->pending_dir_moves);
+ pm = rb_entry(n, struct pending_dir_move, node);
+ while (!list_empty(&pm->list)) {
+ struct pending_dir_move *pm2;
+
+ pm2 = list_first_entry(&pm->list,
+ struct pending_dir_move, list);
+ free_pending_move(sctx, pm2);
+ }
+ free_pending_move(sctx, pm);
+ }
+
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
+ struct rb_node *n;
+ struct waiting_dir_move *dm;
+
+ n = rb_first(&sctx->waiting_dir_moves);
+ dm = rb_entry(n, struct waiting_dir_move, node);
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+ }
+
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+ struct rb_node *n;
+ struct orphan_dir_info *odi;
+
+ n = rb_first(&sctx->orphan_dirs);
+ odi = rb_entry(n, struct orphan_dir_info, node);
+ free_orphan_dir_info(sctx, odi);
+ }
+
+ if (sort_clone_roots) {
+ for (i = 0; i < sctx->clone_roots_cnt; i++)
+ btrfs_root_dec_send_in_progress(
+ sctx->clone_roots[i].root);
+ } else {
+ for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+ btrfs_root_dec_send_in_progress(
+ sctx->clone_roots[i].root);
+
+ btrfs_root_dec_send_in_progress(send_root);
+ }
+ if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+ btrfs_root_dec_send_in_progress(sctx->parent_root);
+
+ kfree(arg);
+ vfree(clone_sources_tmp);
+
+ if (sctx) {
+ if (sctx->send_filp)
+ fput(sctx->send_filp);
+
+ vfree(sctx->clone_roots);
+ vfree(sctx->send_buf);
+ vfree(sctx->read_buf);
+
+ name_cache_free(sctx);
+
+ kfree(sctx);
+ }
+
+ return ret;
+}
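
For context, btrfs_ioctl_send() above is the kernel half of the send ioctl. As orientation only, a minimal userspace sketch of a caller follows; it assumes that <linux/btrfs.h> exports BTRFS_IOC_SEND and a struct btrfs_ioctl_send_args with the fields used here (send_fd, clone_sources_count, parent_root, flags), which are outside this hunk.

/* Hypothetical caller: stream a read-only subvolume to an already-open fd. */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int send_subvolume(int subvol_fd, int out_fd, __u64 parent_root_id)
{
	struct btrfs_ioctl_send_args args;

	memset(&args, 0, sizeof(args));
	args.send_fd = out_fd;			/* becomes sctx->send_filp above */
	args.parent_root = parent_root_id;	/* 0 requests a full send */
	args.clone_sources_count = 0;		/* no extra clone sources */

	return ioctl(subvol_fd, BTRFS_IOC_SEND, &args);
}
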
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
new file mode 100644
index 00000000000..48d425aef05
--- /dev/null
+++ b/fs/btrfs/send.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2012 Alexander Block. All rights reserved.
+ * Copyright (C) 2012 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+
+#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
+#define BTRFS_SEND_STREAM_VERSION 1
+
+#define BTRFS_SEND_BUF_SIZE (1024 * 64)
+#define BTRFS_SEND_READ_SIZE (1024 * 48)
+
+enum btrfs_tlv_type {
+ BTRFS_TLV_U8,
+ BTRFS_TLV_U16,
+ BTRFS_TLV_U32,
+ BTRFS_TLV_U64,
+ BTRFS_TLV_BINARY,
+ BTRFS_TLV_STRING,
+ BTRFS_TLV_UUID,
+ BTRFS_TLV_TIMESPEC,
+};
+
+struct btrfs_stream_header {
+ char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)];
+ __le32 version;
+} __attribute__ ((__packed__));
+
+struct btrfs_cmd_header {
+ /* len excluding the header */
+ __le32 len;
+ __le16 cmd;
+ /* crc including the header with zero crc field */
+ __le32 crc;
+} __attribute__ ((__packed__));
+
+struct btrfs_tlv_header {
+ __le16 tlv_type;
+ /* len excluding the header */
+ __le16 tlv_len;
+} __attribute__ ((__packed__));
+
+/* commands */
+enum btrfs_send_cmd {
+ BTRFS_SEND_C_UNSPEC,
+
+ BTRFS_SEND_C_SUBVOL,
+ BTRFS_SEND_C_SNAPSHOT,
+
+ BTRFS_SEND_C_MKFILE,
+ BTRFS_SEND_C_MKDIR,
+ BTRFS_SEND_C_MKNOD,
+ BTRFS_SEND_C_MKFIFO,
+ BTRFS_SEND_C_MKSOCK,
+ BTRFS_SEND_C_SYMLINK,
+
+ BTRFS_SEND_C_RENAME,
+ BTRFS_SEND_C_LINK,
+ BTRFS_SEND_C_UNLINK,
+ BTRFS_SEND_C_RMDIR,
+
+ BTRFS_SEND_C_SET_XATTR,
+ BTRFS_SEND_C_REMOVE_XATTR,
+
+ BTRFS_SEND_C_WRITE,
+ BTRFS_SEND_C_CLONE,
+
+ BTRFS_SEND_C_TRUNCATE,
+ BTRFS_SEND_C_CHMOD,
+ BTRFS_SEND_C_CHOWN,
+ BTRFS_SEND_C_UTIMES,
+
+ BTRFS_SEND_C_END,
+ BTRFS_SEND_C_UPDATE_EXTENT,
+ __BTRFS_SEND_C_MAX,
+};
+#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
+
+/* attributes in send stream */
+enum {
+ BTRFS_SEND_A_UNSPEC,
+
+ BTRFS_SEND_A_UUID,
+ BTRFS_SEND_A_CTRANSID,
+
+ BTRFS_SEND_A_INO,
+ BTRFS_SEND_A_SIZE,
+ BTRFS_SEND_A_MODE,
+ BTRFS_SEND_A_UID,
+ BTRFS_SEND_A_GID,
+ BTRFS_SEND_A_RDEV,
+ BTRFS_SEND_A_CTIME,
+ BTRFS_SEND_A_MTIME,
+ BTRFS_SEND_A_ATIME,
+ BTRFS_SEND_A_OTIME,
+
+ BTRFS_SEND_A_XATTR_NAME,
+ BTRFS_SEND_A_XATTR_DATA,
+
+ BTRFS_SEND_A_PATH,
+ BTRFS_SEND_A_PATH_TO,
+ BTRFS_SEND_A_PATH_LINK,
+
+ BTRFS_SEND_A_FILE_OFFSET,
+ BTRFS_SEND_A_DATA,
+
+ BTRFS_SEND_A_CLONE_UUID,
+ BTRFS_SEND_A_CLONE_CTRANSID,
+ BTRFS_SEND_A_CLONE_PATH,
+ BTRFS_SEND_A_CLONE_OFFSET,
+ BTRFS_SEND_A_CLONE_LEN,
+
+ __BTRFS_SEND_A_MAX,
+};
+#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
+
+#ifdef __KERNEL__
+long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
+#endif
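
The structures above fully describe the stream framing: a btrfs_stream_header, followed by length-prefixed commands (btrfs_cmd_header), each carrying TLV attributes (btrfs_tlv_header). As a rough receiver-side sketch, assuming a userspace copy of these definitions is available:

#include <endian.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>

/* Validate the start of a send stream read from 'fd'. */
static int check_stream_header(int fd)
{
	struct btrfs_stream_header hdr;

	if (read(fd, &hdr, sizeof(hdr)) != (ssize_t)sizeof(hdr))
		return -EIO;
	/* the magic is written including its terminating NUL byte */
	if (memcmp(hdr.magic, BTRFS_SEND_STREAM_MAGIC, sizeof(hdr.magic)))
		return -EINVAL;
	if (le32toh(hdr.version) > BTRFS_SEND_STREAM_VERSION)
		return -EINVAL;
	return 0;
}
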
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c0f7ecaf1e7..b976597b072 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -17,15 +17,27 @@
*/
#include <linux/highmem.h>
+#include <asm/unaligned.h>
-/* this is some deeply nasty code. ctree.h has a different
- * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
+#include "ctree.h"
+
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
+/*
+ * this is some deeply nasty code.
*
* The end result is that anyone who #includes ctree.h gets a
- * declaration for the btrfs_set_foo functions and btrfs_foo functions
- *
- * This file declares the macros and then #includes ctree.h, which results
- * in cpp creating the function here based on the template below.
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions,
+ * which are wrappers of btrfs_set_token_#bits functions and
+ * btrfs_get_token_#bits functions, which are defined in this file.
*
* These setget functions do all the extent_buffer related mapping
* required to efficiently read and write specific fields in the extent
@@ -33,107 +45,98 @@
* an unsigned long offset into the extent buffer which has been
* cast to a specific type. This gives us all the gcc type checking.
*
- * The extent buffer api is used to do all the kmapping and page
- * spanning work required to get extent buffers in highmem and have
- * a metadata blocksize different from the page size.
- *
- * The macro starts with a simple function prototype declaration so that
- * sparse won't complain about it being static.
+ * The extent buffer api is used to do the page spanning work required to
+ * have a metadata blocksize different from the page size.
*/
-#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
-u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
-void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
-u##bits btrfs_##name(struct extent_buffer *eb, \
- type *s) \
+#define DEFINE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, \
+ struct btrfs_map_token *token) \
{ \
- unsigned long part_offset = (unsigned long)s; \
- unsigned long offset = part_offset + offsetof(type, member); \
- type *p; \
- /* ugly, but we want the fast path here */ \
- if (eb->map_token && offset >= eb->map_start && \
- offset + sizeof(((type *)0)->member) <= eb->map_start + \
- eb->map_len) { \
- p = (type *)(eb->kaddr + part_offset - eb->map_start); \
- return le##bits##_to_cpu(p->member); \
- } \
- { \
- int err; \
- char *map_token; \
- char *kaddr; \
- int unmap_on_exit = (eb->map_token == NULL); \
- unsigned long map_start; \
- unsigned long map_len; \
- u##bits res; \
- err = map_extent_buffer(eb, offset, \
- sizeof(((type *)0)->member), \
- &map_token, &kaddr, \
- &map_start, &map_len, KM_USER1); \
- if (err) { \
- __le##bits leres; \
- read_eb_member(eb, s, type, member, &leres); \
- return le##bits##_to_cpu(leres); \
- } \
- p = (type *)(kaddr + part_offset - map_start); \
- res = le##bits##_to_cpu(p->member); \
- if (unmap_on_exit) \
- unmap_extent_buffer(eb, map_token, KM_USER1); \
+ unsigned long part_offset = (unsigned long)ptr; \
+ unsigned long offset = part_offset + off; \
+ void *p; \
+ int err; \
+ char *kaddr; \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ int size = sizeof(u##bits); \
+ u##bits res; \
+ \
+ if (token && token->kaddr && token->offset <= offset && \
+ token->eb == eb && \
+ (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
+ kaddr = token->kaddr; \
+ p = kaddr + part_offset - token->offset; \
+ res = get_unaligned_le##bits(p + off); \
return res; \
} \
+ err = map_private_extent_buffer(eb, offset, size, \
+ &kaddr, &map_start, &map_len); \
+ if (err) { \
+ __le##bits leres; \
+ \
+ read_extent_buffer(eb, &leres, offset, size); \
+ return le##bits##_to_cpu(leres); \
+ } \
+ p = kaddr + part_offset - map_start; \
+ res = get_unaligned_le##bits(p + off); \
+ if (token) { \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+ token->eb = eb; \
+ } \
+ return res; \
} \
-void btrfs_set_##name(struct extent_buffer *eb, \
- type *s, u##bits val) \
+void btrfs_set_token_##bits(struct extent_buffer *eb, \
+ void *ptr, unsigned long off, u##bits val, \
+ struct btrfs_map_token *token) \
{ \
- unsigned long part_offset = (unsigned long)s; \
- unsigned long offset = part_offset + offsetof(type, member); \
- type *p; \
- /* ugly, but we want the fast path here */ \
- if (eb->map_token && offset >= eb->map_start && \
- offset + sizeof(((type *)0)->member) <= eb->map_start + \
- eb->map_len) { \
- p = (type *)(eb->kaddr + part_offset - eb->map_start); \
- p->member = cpu_to_le##bits(val); \
+ unsigned long part_offset = (unsigned long)ptr; \
+ unsigned long offset = part_offset + off; \
+ void *p; \
+ int err; \
+ char *kaddr; \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ int size = sizeof(u##bits); \
+ \
+ if (token && token->kaddr && token->offset <= offset && \
+ token->eb == eb && \
+ (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
+ kaddr = token->kaddr; \
+ p = kaddr + part_offset - token->offset; \
+ put_unaligned_le##bits(val, p + off); \
+ return; \
+ } \
+ err = map_private_extent_buffer(eb, offset, size, \
+ &kaddr, &map_start, &map_len); \
+ if (err) { \
+ __le##bits val2; \
+ \
+ val2 = cpu_to_le##bits(val); \
+ write_extent_buffer(eb, &val2, offset, size); \
return; \
} \
- { \
- int err; \
- char *map_token; \
- char *kaddr; \
- int unmap_on_exit = (eb->map_token == NULL); \
- unsigned long map_start; \
- unsigned long map_len; \
- err = map_extent_buffer(eb, offset, \
- sizeof(((type *)0)->member), \
- &map_token, &kaddr, \
- &map_start, &map_len, KM_USER1); \
- if (err) { \
- __le##bits val2; \
- val2 = cpu_to_le##bits(val); \
- write_eb_member(eb, s, type, member, &val2); \
- return; \
- } \
- p = (type *)(kaddr + part_offset - map_start); \
- p->member = cpu_to_le##bits(val); \
- if (unmap_on_exit) \
- unmap_extent_buffer(eb, map_token, KM_USER1); \
+ p = kaddr + part_offset - map_start; \
+ put_unaligned_le##bits(val, p + off); \
+ if (token) { \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+ token->eb = eb; \
} \
}
-#include "ctree.h"
+DEFINE_BTRFS_SETGET_BITS(8)
+DEFINE_BTRFS_SETGET_BITS(16)
+DEFINE_BTRFS_SETGET_BITS(32)
+DEFINE_BTRFS_SETGET_BITS(64)
void btrfs_node_key(struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr = btrfs_node_key_ptr_offset(nr);
- if (eb->map_token && ptr >= eb->map_start &&
- ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
- memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
- sizeof(*disk_key));
- return;
- } else if (eb->map_token) {
- unmap_extent_buffer(eb, eb->map_token, KM_USER1);
- eb->map_token = NULL;
- }
read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
struct btrfs_key_ptr, key, disk_key);
}
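
The per-width btrfs_get_token_##bits()/btrfs_set_token_##bits() helpers defined above are what the per-field accessors in ctree.h build on. A hedged sketch of such a wrapper follows; the real BTRFS_SETGET_FUNCS macro lives in ctree.h and may differ in detail:

/* Illustrative only: how a ctree.h-style per-field accessor can wrap the
 * token helpers from struct-funcs.c (a NULL token skips the page cache). */
#define BTRFS_SETGET_FUNCS_SKETCH(name, type, member, bits)		\
static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s)	\
{									\
	return btrfs_get_token_##bits(eb, s, offsetof(type, member),	\
				      NULL);				\
}									\
static inline void btrfs_set_##name(struct extent_buffer *eb, type *s,	\
				    u##bits val)			\
{									\
	btrfs_set_token_##bits(eb, s, offsetof(type, member), val,	\
			       NULL);					\
}
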
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d39a9895d93..8e16bca69c5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,25 +39,38 @@
#include <linux/miscdevice.h>
#include <linux/magic.h>
#include <linux/slab.h>
-#include "compat.h"
+#include <linux/cleancache.h>
+#include <linux/ratelimit.h>
+#include <linux/btrfs.h>
+#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
+#include "hash.h"
+#include "props.h"
#include "xattr.h"
#include "volumes.h"
-#include "version.h"
#include "export.h"
#include "compression.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "free-space-cache.h"
+#include "backref.h"
+#include "tests/btrfs-tests.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
+static struct file_system_type btrfs_fs_type;
+
+static int btrfs_remount(struct super_block *sb, int *flags, char *data);
-static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
- char nbuf[16])
+static const char *btrfs_decode_error(int errno)
{
- char *errstr = NULL;
+ char *errstr = "unknown";
switch (errno) {
case -EIO:
@@ -69,33 +82,27 @@ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
case -EROFS:
errstr = "Readonly filesystem";
break;
- default:
- if (nbuf) {
- if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
- errstr = nbuf;
- }
+ case -EEXIST:
+ errstr = "Object already exists";
+ break;
+ case -ENOSPC:
+ errstr = "No space left";
+ break;
+ case -ENOENT:
+ errstr = "No such entry";
break;
}
return errstr;
}
-static void __save_error_info(struct btrfs_fs_info *fs_info)
+static void save_error_info(struct btrfs_fs_info *fs_info)
{
/*
* today we only save the error info into ram. Long term we'll
* also send it down to the disk
*/
- fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
-}
-
-/* NOTE:
- * We move write_super stuff at umount in order to avoid deadlock
- * for umount hold all lock.
- */
-static void save_error_info(struct btrfs_fs_info *fs_info)
-{
- __save_error_info(fs_info);
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
}
/* btrfs handle error by forcing the filesystem readonly */
@@ -106,21 +113,31 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
if (sb->s_flags & MS_RDONLY)
return;
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
sb->s_flags |= MS_RDONLY;
- printk(KERN_INFO "btrfs is forced readonly\n");
+ btrfs_info(fs_info, "forced readonly");
+ /*
+ * Note that a running device replace operation is not
+ * canceled here although there is no way to update
+ * the progress. It would add the risk of a deadlock,
+	 * therefore the canceling is omitted. The only penalty
+	 * is that some I/O remains active until the procedure
+	 * completes. The next time the filesystem is
+	 * mounted writable again, the device replace
+ * operation continues.
+ */
}
}
+#ifdef CONFIG_PRINTK
/*
* __btrfs_std_error decodes expected errors from the caller and
 * invokes the appropriate error response.
*/
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno)
+ unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
- char nbuf[16];
const char *errstr;
/*
@@ -128,25 +145,175 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
* under MS_RDONLY, then it is safe here.
*/
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
- return;
+ return;
+
+ errstr = btrfs_decode_error(errno);
+ if (fmt) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_CRIT
+ "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, function, line, errno, errstr, &vaf);
+ va_end(args);
+ } else {
+ printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
+ sb->s_id, function, line, errno, errstr);
+ }
- errstr = btrfs_decode_error(fs_info, errno, nbuf);
- printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
- sb->s_id, function, line, errstr);
+ /* Don't go through full error handling during mount */
save_error_info(fs_info);
+ if (sb->s_flags & MS_BORN)
+ btrfs_handle_error(fs_info);
+}
+
+static const char * const logtypes[] = {
+ "emergency",
+ "alert",
+ "critical",
+ "error",
+ "warning",
+ "notice",
+ "info",
+ "debug",
+};
- btrfs_handle_error(fs_info);
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+ struct super_block *sb = fs_info->sb;
+ char lvl[4];
+ struct va_format vaf;
+ va_list args;
+ const char *type = logtypes[4];
+ int kern_level;
+
+ va_start(args, fmt);
+
+ kern_level = printk_get_level(fmt);
+ if (kern_level) {
+ size_t size = printk_skip_level(fmt) - fmt;
+ memcpy(lvl, fmt, size);
+ lvl[size] = '\0';
+ fmt += size;
+ type = logtypes[kern_level - '0'];
+ } else
+ *lvl = '\0';
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
+
+ va_end(args);
}
-static void btrfs_put_super(struct super_block *sb)
+#else
+
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
{
- struct btrfs_root *root = btrfs_sb(sb);
- int ret;
+ struct super_block *sb = fs_info->sb;
- ret = close_ctree(root);
- sb->s_fs_info = NULL;
+ /*
+ * Special case: if the error is EROFS, and we're already
+ * under MS_RDONLY, then it is safe here.
+ */
+ if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+ return;
- (void)ret; /* FIXME: need to fix VFS to return error? */
+ /* Don't go through full error handling during mount */
+ if (sb->s_flags & MS_BORN) {
+ save_error_info(fs_info);
+ btrfs_handle_error(fs_info);
+ }
+}
+#endif
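
The btrfs_info()/btrfs_warn()/btrfs_crit() calls used throughout this patch funnel into btrfs_printk() above. A sketch of how such a wrapper macro is typically declared (the actual definitions live in ctree.h and may differ):

/* Illustrative only; the real macros are declared in ctree.h. */
#define btrfs_crit_sketch(fs_info, fmt, args...) \
	btrfs_printk(fs_info, KERN_CRIT fmt, ##args)

/* btrfs_crit_sketch(fs_info, "bad tree block %llu", start) then prints
 *   BTRFS critical (device sda1): bad tree block 12345
 * because printk_get_level() recovers the KERN_CRIT prefix and
 * logtypes[] supplies the "critical" severity string. */
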
+
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *function,
+ unsigned int line, int errno)
+{
+ /*
+ * Report first abort since mount
+ */
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
+ &root->fs_info->fs_state)) {
+ WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
+ errno);
+ }
+ trans->aborted = errno;
+ /* Nothing used. The other threads that have joined this
+ * transaction may be able to continue. */
+ if (!trans->blocks_used) {
+ const char *errstr;
+
+ errstr = btrfs_decode_error(errno);
+ btrfs_warn(root->fs_info,
+ "%s:%d: Aborting unused transaction(%s).",
+ function, line, errstr);
+ return;
+ }
+ ACCESS_ONCE(trans->transaction->aborted) = errno;
+ /* Wake up anybody who may be waiting on this transaction */
+ wake_up(&root->fs_info->transaction_wait);
+ wake_up(&root->fs_info->transaction_blocked_wait);
+ __btrfs_std_error(root->fs_info, function, line, errno, NULL);
+}
+/*
+ * __btrfs_panic decodes unexpected, fatal errors from the caller,
+ * issues an alert, and either panics or BUGs, depending on mount options.
+ */
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ char *s_id = "<unknown>";
+ const char *errstr;
+ struct va_format vaf = { .fmt = fmt };
+ va_list args;
+
+ if (fs_info)
+ s_id = fs_info->sb->s_id;
+
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ errstr = btrfs_decode_error(errno);
+ if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
+ panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+ s_id, function, line, &vaf, errno, errstr);
+
+ btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
+ function, line, &vaf, errno, errstr);
+ va_end(args);
+ /* Caller calls BUG() */
+}
+
+static void btrfs_put_super(struct super_block *sb)
+{
+ (void)close_ctree(btrfs_sb(sb)->tree_root);
+ /* FIXME: need to fix VFS to return error? */
+ /* AV: return it _where_? ->put_super() can be triggered by any number
+ * of async events, up to and including delivery of SIGKILL to the
+ * last process that kept it busy. Or segfault in the aforementioned
+ * process... Whom would you report that to?
+ */
}
enum {
@@ -156,17 +323,27 @@ enum {
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_err,
+ Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+ Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+ Opt_check_integrity, Opt_check_integrity_including_extent_data,
+ Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
+ Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
+ Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
+ Opt_datasum, Opt_treelog, Opt_noinode_cache,
+ Opt_err,
};
static match_table_t tokens = {
{Opt_degraded, "degraded"},
{Opt_subvol, "subvol=%s"},
- {Opt_subvolid, "subvolid=%d"},
+ {Opt_subvolid, "subvolid=%s"},
{Opt_device, "device=%s"},
{Opt_nodatasum, "nodatasum"},
+ {Opt_datasum, "datasum"},
{Opt_nodatacow, "nodatacow"},
+ {Opt_datacow, "datacow"},
{Opt_nobarrier, "nobarrier"},
+ {Opt_barrier, "barrier"},
{Opt_max_inline, "max_inline=%s"},
{Opt_alloc_start, "alloc_start=%s"},
{Opt_thread_pool, "thread_pool=%d"},
@@ -177,34 +354,60 @@ static match_table_t tokens = {
{Opt_ssd, "ssd"},
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd, "nossd"},
+ {Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_notreelog, "notreelog"},
+ {Opt_treelog, "treelog"},
{Opt_flushoncommit, "flushoncommit"},
+ {Opt_noflushoncommit, "noflushoncommit"},
{Opt_ratio, "metadata_ratio=%d"},
{Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
{Opt_space_cache, "space_cache"},
{Opt_clear_cache, "clear_cache"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
{Opt_enospc_debug, "enospc_debug"},
+ {Opt_noenospc_debug, "noenospc_debug"},
+ {Opt_subvolrootid, "subvolrootid=%d"},
+ {Opt_defrag, "autodefrag"},
+ {Opt_nodefrag, "noautodefrag"},
+ {Opt_inode_cache, "inode_cache"},
+ {Opt_noinode_cache, "noinode_cache"},
+ {Opt_no_space_cache, "nospace_cache"},
+ {Opt_recovery, "recovery"},
+ {Opt_skip_balance, "skip_balance"},
+ {Opt_check_integrity, "check_int"},
+ {Opt_check_integrity_including_extent_data, "check_int_data"},
+ {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
+ {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
+ {Opt_fatal_errors, "fatal_errors=%s"},
+ {Opt_commit_interval, "commit=%d"},
{Opt_err, NULL},
};
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
+ * XXX JDM: This needs to be cleaned up for remount.
*/
int btrfs_parse_options(struct btrfs_root *root, char *options)
{
struct btrfs_fs_info *info = root->fs_info;
substring_t args[MAX_OPT_ARGS];
- char *p, *num, *orig;
+ char *p, *num, *orig = NULL;
+ u64 cache_gen;
int intarg;
int ret = 0;
char *compress_type;
bool compress_force = false;
+ bool compress = false;
+
+ cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+ if (cache_gen)
+ btrfs_set_opt(info->mount_opt, SPACE_CACHE);
if (!options)
- return 0;
+ goto out;
/*
* strsep changes the string, duplicate it because parse_options
@@ -224,11 +427,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
token = match_token(p, tokens, args);
switch (token) {
case Opt_degraded:
- printk(KERN_INFO "btrfs: allowing degraded mounts\n");
+ btrfs_info(root->fs_info, "allowing degraded mounts");
btrfs_set_opt(info->mount_opt, DEGRADED);
break;
case Opt_subvol:
case Opt_subvolid:
+ case Opt_subvolrootid:
case Opt_device:
/*
* These are parsed by btrfs_parse_early_options
@@ -236,69 +440,112 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
*/
break;
case Opt_nodatasum:
- printk(KERN_INFO "btrfs: setting nodatasum\n");
- btrfs_set_opt(info->mount_opt, NODATASUM);
+ btrfs_set_and_info(root, NODATASUM,
+ "setting nodatasum");
+ break;
+ case Opt_datasum:
+ if (btrfs_test_opt(root, NODATASUM)) {
+ if (btrfs_test_opt(root, NODATACOW))
+ btrfs_info(root->fs_info, "setting datasum, datacow enabled");
+ else
+ btrfs_info(root->fs_info, "setting datasum");
+ }
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
break;
case Opt_nodatacow:
- printk(KERN_INFO "btrfs: setting nodatacow\n");
+ if (!btrfs_test_opt(root, NODATACOW)) {
+ if (!btrfs_test_opt(root, COMPRESS) ||
+ !btrfs_test_opt(root, FORCE_COMPRESS)) {
+ btrfs_info(root->fs_info,
+ "setting nodatacow, compression disabled");
+ } else {
+ btrfs_info(root->fs_info, "setting nodatacow");
+ }
+ }
+ btrfs_clear_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
+ case Opt_datacow:
+ btrfs_clear_and_info(root, NODATACOW,
+ "setting datacow");
+ break;
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
+ /* Fallthrough */
case Opt_compress:
case Opt_compress_type:
+ compress = true;
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
compress_type = "zlib";
info->compress_type = BTRFS_COMPRESS_ZLIB;
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
} else if (strcmp(args[0].from, "lzo") == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ btrfs_set_fs_incompat(info, COMPRESS_LZO);
+ } else if (strncmp(args[0].from, "no", 2) == 0) {
+ compress_type = "no";
+ btrfs_clear_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+ compress_force = false;
} else {
ret = -EINVAL;
goto out;
}
- btrfs_set_opt(info->mount_opt, COMPRESS);
if (compress_force) {
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
- pr_info("btrfs: force %s compression\n",
- compress_type);
- } else
- pr_info("btrfs: use %s compression\n",
- compress_type);
+ btrfs_set_and_info(root, FORCE_COMPRESS,
+ "force %s compression",
+ compress_type);
+ } else if (compress) {
+ if (!btrfs_test_opt(root, COMPRESS))
+ btrfs_info(root->fs_info,
+						"use %s compression",
+ compress_type);
+ }
break;
case Opt_ssd:
- printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
- btrfs_set_opt(info->mount_opt, SSD);
+ btrfs_set_and_info(root, SSD,
+ "use ssd allocation scheme");
break;
case Opt_ssd_spread:
- printk(KERN_INFO "btrfs: use spread ssd "
- "allocation scheme\n");
+ btrfs_set_and_info(root, SSD_SPREAD,
+ "use spread ssd allocation scheme");
btrfs_set_opt(info->mount_opt, SSD);
- btrfs_set_opt(info->mount_opt, SSD_SPREAD);
break;
case Opt_nossd:
- printk(KERN_INFO "btrfs: not using ssd allocation "
- "scheme\n");
- btrfs_set_opt(info->mount_opt, NOSSD);
+ btrfs_set_and_info(root, NOSSD,
+ "not using ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, SSD);
- btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
+ break;
+ case Opt_barrier:
+ btrfs_clear_and_info(root, NOBARRIER,
+ "turning on barriers");
break;
case Opt_nobarrier:
- printk(KERN_INFO "btrfs: turning off barriers\n");
- btrfs_set_opt(info->mount_opt, NOBARRIER);
+ btrfs_set_and_info(root, NOBARRIER,
+ "turning off barriers");
break;
case Opt_thread_pool:
- intarg = 0;
- match_int(&args[0], &intarg);
- if (intarg) {
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ goto out;
+ } else if (intarg > 0) {
info->thread_pool_size = intarg;
- printk(KERN_INFO "btrfs: thread pool %d\n",
- info->thread_pool_size);
+ } else {
+ ret = -EINVAL;
+ goto out;
}
break;
case Opt_max_inline:
@@ -308,54 +555,103 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
kfree(num);
if (info->max_inline) {
- info->max_inline = max_t(u64,
+ info->max_inline = min_t(u64,
info->max_inline,
root->sectorsize);
}
- printk(KERN_INFO "btrfs: max_inline at %llu\n",
- (unsigned long long)info->max_inline);
+ btrfs_info(root->fs_info, "max_inline at %llu",
+ info->max_inline);
+ } else {
+ ret = -ENOMEM;
+ goto out;
}
break;
case Opt_alloc_start:
num = match_strdup(&args[0]);
if (num) {
+ mutex_lock(&info->chunk_mutex);
info->alloc_start = memparse(num, NULL);
+ mutex_unlock(&info->chunk_mutex);
kfree(num);
- printk(KERN_INFO
- "btrfs: allocations start at %llu\n",
- (unsigned long long)info->alloc_start);
+ btrfs_info(root->fs_info, "allocations start at %llu",
+ info->alloc_start);
+ } else {
+ ret = -ENOMEM;
+ goto out;
}
break;
+ case Opt_acl:
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ root->fs_info->sb->s_flags |= MS_POSIXACL;
+ break;
+#else
+ btrfs_err(root->fs_info,
+ "support for ACL not compiled in!");
+ ret = -EINVAL;
+ goto out;
+#endif
case Opt_noacl:
root->fs_info->sb->s_flags &= ~MS_POSIXACL;
break;
case Opt_notreelog:
- printk(KERN_INFO "btrfs: disabling tree log\n");
- btrfs_set_opt(info->mount_opt, NOTREELOG);
+ btrfs_set_and_info(root, NOTREELOG,
+ "disabling tree log");
+ break;
+ case Opt_treelog:
+ btrfs_clear_and_info(root, NOTREELOG,
+ "enabling tree log");
break;
case Opt_flushoncommit:
- printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
- btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
+ btrfs_set_and_info(root, FLUSHONCOMMIT,
+ "turning on flush-on-commit");
+ break;
+ case Opt_noflushoncommit:
+ btrfs_clear_and_info(root, FLUSHONCOMMIT,
+ "turning off flush-on-commit");
break;
case Opt_ratio:
- intarg = 0;
- match_int(&args[0], &intarg);
- if (intarg) {
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ goto out;
+ } else if (intarg >= 0) {
info->metadata_ratio = intarg;
- printk(KERN_INFO "btrfs: metadata ratio %d\n",
+ btrfs_info(root->fs_info, "metadata ratio %d",
info->metadata_ratio);
+ } else {
+ ret = -EINVAL;
+ goto out;
}
break;
case Opt_discard:
- btrfs_set_opt(info->mount_opt, DISCARD);
+ btrfs_set_and_info(root, DISCARD,
+ "turning on discard");
+ break;
+ case Opt_nodiscard:
+ btrfs_clear_and_info(root, DISCARD,
+ "turning off discard");
break;
case Opt_space_cache:
- printk(KERN_INFO "btrfs: enabling disk space caching\n");
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ btrfs_set_and_info(root, SPACE_CACHE,
+ "enabling disk space caching");
+ break;
+ case Opt_rescan_uuid_tree:
+ btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
+ break;
+ case Opt_no_space_cache:
+ btrfs_clear_and_info(root, SPACE_CACHE,
+ "disabling disk space caching");
+ break;
+ case Opt_inode_cache:
+ btrfs_set_and_info(root, CHANGE_INODE_CACHE,
+ "enabling inode map caching");
+ break;
+ case Opt_noinode_cache:
+ btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ "disabling inode map caching");
break;
case Opt_clear_cache:
- printk(KERN_INFO "btrfs: force clearing of disk cache\n");
- btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
+ btrfs_set_and_info(root, CLEAR_CACHE,
+ "force clearing of disk cache");
break;
case Opt_user_subvol_rm_allowed:
btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
@@ -363,9 +659,92 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_enospc_debug:
btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
break;
+ case Opt_noenospc_debug:
+ btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_defrag:
+ btrfs_set_and_info(root, AUTO_DEFRAG,
+ "enabling auto defrag");
+ break;
+ case Opt_nodefrag:
+ btrfs_clear_and_info(root, AUTO_DEFRAG,
+ "disabling auto defrag");
+ break;
+ case Opt_recovery:
+ btrfs_info(root->fs_info, "enabling auto recovery");
+ btrfs_set_opt(info->mount_opt, RECOVERY);
+ break;
+ case Opt_skip_balance:
+ btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ case Opt_check_integrity_including_extent_data:
+ btrfs_info(root->fs_info,
+ "enabling check integrity including extent data");
+ btrfs_set_opt(info->mount_opt,
+ CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity:
+ btrfs_info(root->fs_info, "enabling check integrity");
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity_print_mask:
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ goto out;
+ } else if (intarg >= 0) {
+ info->check_integrity_print_mask = intarg;
+ btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x",
+ info->check_integrity_print_mask);
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+#else
+ case Opt_check_integrity_including_extent_data:
+ case Opt_check_integrity:
+ case Opt_check_integrity_print_mask:
+ btrfs_err(root->fs_info,
+ "support for check_integrity* not compiled in!");
+ ret = -EINVAL;
+ goto out;
+#endif
+ case Opt_fatal_errors:
+ if (strcmp(args[0].from, "panic") == 0)
+ btrfs_set_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ else if (strcmp(args[0].from, "bug") == 0)
+ btrfs_clear_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case Opt_commit_interval:
+ intarg = 0;
+ ret = match_int(&args[0], &intarg);
+ if (ret < 0) {
+ btrfs_err(root->fs_info, "invalid commit interval");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (intarg > 0) {
+ if (intarg > 300) {
+ btrfs_warn(root->fs_info, "excessive commit interval %d",
+ intarg);
+ }
+ info->commit_interval = intarg;
+ } else {
+ btrfs_info(root->fs_info, "using default commit interval %ds",
+ BTRFS_DEFAULT_COMMIT_INTERVAL);
+ info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ }
+ break;
case Opt_err:
- printk(KERN_INFO "btrfs: unrecognized mount option "
- "'%s'\n", p);
+ btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
goto out;
default:
@@ -373,6 +752,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
}
out:
+ if (!ret && btrfs_test_opt(root, SPACE_CACHE))
+ btrfs_info(root->fs_info, "disk space caching is enabled");
kfree(orig);
return ret;
}
@@ -388,12 +769,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
struct btrfs_fs_devices **fs_devices)
{
substring_t args[MAX_OPT_ARGS];
- char *opts, *orig, *p;
+ char *device_name, *opts, *orig, *p;
+ char *num = NULL;
int error = 0;
- int intarg;
if (!options)
- goto out;
+ return 0;
/*
* strsep changes the string, duplicate it because parse_options
@@ -412,51 +793,59 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
token = match_token(p, tokens, args);
switch (token) {
case Opt_subvol:
+ kfree(*subvol_name);
*subvol_name = match_strdup(&args[0]);
+ if (!*subvol_name) {
+ error = -ENOMEM;
+ goto out;
+ }
break;
case Opt_subvolid:
- intarg = 0;
- error = match_int(&args[0], &intarg);
- if (!error) {
+ num = match_strdup(&args[0]);
+ if (num) {
+ *subvol_objectid = memparse(num, NULL);
+ kfree(num);
/* we want the original fs_tree */
- if (!intarg)
+ if (!*subvol_objectid)
*subvol_objectid =
BTRFS_FS_TREE_OBJECTID;
- else
- *subvol_objectid = intarg;
+ } else {
+ error = -EINVAL;
+ goto out;
}
break;
+ case Opt_subvolrootid:
+ printk(KERN_WARNING
+ "BTRFS: 'subvolrootid' mount option is deprecated and has "
+ "no effect\n");
+ break;
case Opt_device:
- error = btrfs_scan_one_device(match_strdup(&args[0]),
+ device_name = match_strdup(&args[0]);
+ if (!device_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+ error = btrfs_scan_one_device(device_name,
flags, holder, fs_devices);
+ kfree(device_name);
if (error)
- goto out_free_opts;
+ goto out;
break;
default:
break;
}
}
- out_free_opts:
+out:
kfree(orig);
- out:
- /*
- * If no subvolume name is specified we use the default one. Allocate
- * a copy of the string "." here so that code later in the
- * mount path doesn't care if it's the default volume or another one.
- */
- if (!*subvol_name) {
- *subvol_name = kstrdup(".", GFP_KERNEL);
- if (!*subvol_name)
- return -ENOMEM;
- }
return error;
}
static struct dentry *get_default_root(struct super_block *sb,
u64 subvol_objectid)
{
- struct btrfs_root *root = sb->s_fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
struct btrfs_path *path;
@@ -487,10 +876,12 @@ static struct dentry *get_default_root(struct super_block *sb,
* will mount by default if we haven't been given a specific subvolume
* to mount.
*/
- dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
- if (IS_ERR(di))
+ if (IS_ERR(di)) {
+ btrfs_free_path(path);
return ERR_CAST(di);
+ }
if (!di) {
/*
* Ok the default dir item isn't there. This is weird since
@@ -499,7 +890,7 @@ static struct dentry *get_default_root(struct super_block *sb,
*/
btrfs_free_path(path);
dir_id = BTRFS_FIRST_FREE_OBJECTID;
- new_root = root->fs_info->fs_root;
+ new_root = fs_info->fs_root;
goto setup_root;
}
@@ -507,13 +898,10 @@ static struct dentry *get_default_root(struct super_block *sb,
btrfs_free_path(path);
find_root:
- new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ new_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (IS_ERR(new_root))
return ERR_CAST(new_root);
- if (btrfs_root_refs(&new_root->root_item) == 0)
- return ERR_PTR(-ENOENT);
-
dir_id = btrfs_root_dirid(&new_root->root_item);
setup_root:
location.objectid = dir_id;
@@ -534,28 +922,12 @@ setup_root:
return dget(sb->s_root);
}
- if (new) {
- const struct qstr name = { .name = "/", .len = 1 };
-
- /*
- * New inode, we need to make the dentry a sibling of s_root so
- * everything gets cleaned up properly on unmount.
- */
- dentry = d_alloc(sb->s_root, &name);
- if (!dentry) {
- iput(inode);
- return ERR_PTR(-ENOMEM);
- }
- d_splice_alias(inode, dentry);
- } else {
- /*
- * We found the inode in cache, just find a dentry for it and
- * put the reference to the inode we just got.
- */
- dentry = d_find_alias(inode);
- iput(inode);
+ dentry = d_obtain_alias(inode);
+ if (!IS_ERR(dentry)) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_DISCONNECTED;
+ spin_unlock(&dentry->d_lock);
}
-
return dentry;
}
@@ -564,8 +936,7 @@ static int btrfs_fill_super(struct super_block *sb,
void *data, int silent)
{
struct inode *inode;
- struct dentry *root_dentry;
- struct btrfs_root *tree_root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_key key;
int err;
@@ -579,66 +950,68 @@ static int btrfs_fill_super(struct super_block *sb,
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
-
- tree_root = open_ctree(sb, fs_devices, (char *)data);
-
- if (IS_ERR(tree_root)) {
- printk("btrfs: open_ctree failed\n");
- return PTR_ERR(tree_root);
+ sb->s_flags |= MS_I_VERSION;
+ err = open_ctree(sb, fs_devices, (char *)data);
+ if (err) {
+ printk(KERN_ERR "BTRFS: open_ctree failed\n");
+ return err;
}
- sb->s_fs_info = tree_root;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
+ inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
}
- root_dentry = d_alloc_root(inode);
- if (!root_dentry) {
- iput(inode);
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
err = -ENOMEM;
goto fail_close;
}
- sb->s_root = root_dentry;
-
save_mount_options(sb, data);
+ cleancache_init_fs(sb);
+ sb->s_flags |= MS_ACTIVE;
return 0;
fail_close:
- close_ctree(tree_root);
+ close_ctree(fs_info->tree_root);
return err;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
{
struct btrfs_trans_handle *trans;
- struct btrfs_root *root = btrfs_sb(sb);
- int ret;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
+
+ trace_btrfs_sync_fs(wait);
if (!wait) {
- filemap_flush(root->fs_info->btree_inode->i_mapping);
+ filemap_flush(fs_info->btree_inode->i_mapping);
return 0;
}
- btrfs_start_delalloc_inodes(root, 0);
- btrfs_wait_ordered_extents(root, 0, 0);
+ btrfs_wait_ordered_roots(fs_info, -1);
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ /* no transaction, don't bother */
+ if (PTR_ERR(trans) == -ENOENT)
+ return 0;
return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans, root);
- return ret;
+ }
+ return btrfs_commit_transaction(trans, root);
}
-static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
- struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
- struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+ struct btrfs_root *root = info->tree_root;
+ char *compress_type;
if (btrfs_test_opt(root, DEGRADED))
seq_puts(seq, ",degraded");
@@ -649,16 +1022,22 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (btrfs_test_opt(root, NOBARRIER))
seq_puts(seq, ",nobarrier");
if (info->max_inline != 8192 * 1024)
- seq_printf(seq, ",max_inline=%llu",
- (unsigned long long)info->max_inline);
+ seq_printf(seq, ",max_inline=%llu", info->max_inline);
if (info->alloc_start != 0)
- seq_printf(seq, ",alloc_start=%llu",
- (unsigned long long)info->alloc_start);
+ seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
- if (btrfs_test_opt(root, COMPRESS))
- seq_puts(seq, ",compress");
+ if (btrfs_test_opt(root, COMPRESS)) {
+ if (info->compress_type == BTRFS_COMPRESS_ZLIB)
+ compress_type = "zlib";
+ else
+ compress_type = "lzo";
+ if (btrfs_test_opt(root, FORCE_COMPRESS))
+ seq_printf(seq, ",compress-force=%s", compress_type);
+ else
+ seq_printf(seq, ",compress=%s", compress_type);
+ }
if (btrfs_test_opt(root, NOSSD))
seq_puts(seq, ",nossd");
if (btrfs_test_opt(root, SSD_SPREAD))
@@ -673,30 +1052,175 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",discard");
if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
seq_puts(seq, ",noacl");
+ if (btrfs_test_opt(root, SPACE_CACHE))
+ seq_puts(seq, ",space_cache");
+ else
+ seq_puts(seq, ",nospace_cache");
+ if (btrfs_test_opt(root, RESCAN_UUID_TREE))
+ seq_puts(seq, ",rescan_uuid_tree");
+ if (btrfs_test_opt(root, CLEAR_CACHE))
+ seq_puts(seq, ",clear_cache");
+ if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+ seq_puts(seq, ",user_subvol_rm_allowed");
+ if (btrfs_test_opt(root, ENOSPC_DEBUG))
+ seq_puts(seq, ",enospc_debug");
+ if (btrfs_test_opt(root, AUTO_DEFRAG))
+ seq_puts(seq, ",autodefrag");
+ if (btrfs_test_opt(root, INODE_MAP_CACHE))
+ seq_puts(seq, ",inode_cache");
+ if (btrfs_test_opt(root, SKIP_BALANCE))
+ seq_puts(seq, ",skip_balance");
+ if (btrfs_test_opt(root, RECOVERY))
+ seq_puts(seq, ",recovery");
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+ seq_puts(seq, ",check_int_data");
+ else if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ seq_puts(seq, ",check_int");
+ if (info->check_integrity_print_mask)
+ seq_printf(seq, ",check_int_print_mask=%d",
+ info->check_integrity_print_mask);
+#endif
+ if (info->metadata_ratio)
+ seq_printf(seq, ",metadata_ratio=%d",
+ info->metadata_ratio);
+ if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+ seq_puts(seq, ",fatal_errors=panic");
+ if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
+ seq_printf(seq, ",commit=%d", info->commit_interval);
return 0;
}
static int btrfs_test_super(struct super_block *s, void *data)
{
- struct btrfs_root *test_root = data;
- struct btrfs_root *root = btrfs_sb(s);
+ struct btrfs_fs_info *p = data;
+ struct btrfs_fs_info *fs_info = btrfs_sb(s);
- /*
- * If this super block is going away, return false as it
- * can't match as an existing super block.
- */
- if (!atomic_read(&s->s_active))
- return 0;
- return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
+ return fs_info->fs_devices == p->fs_devices;
}
static int btrfs_set_super(struct super_block *s, void *data)
{
- s->s_fs_info = data;
+ int err = set_anon_super(s, data);
+ if (!err)
+ s->s_fs_info = data;
+ return err;
+}
+
+/*
+ * subvolumes are identified by ino 256
+ */
+static inline int is_subvolume_inode(struct inode *inode)
+{
+ if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+/*
+ * This will strip out the subvol=%s argument for an argument string and add
+ * subvolid=0 to make sure we get the actual tree root for path walking to the
+ * subvol we want.
+ */
+static char *setup_root_args(char *args)
+{
+ unsigned len = strlen(args) + 2 + 1;
+ char *src, *dst, *buf;
- return set_anon_super(s, data);
+ /*
+ * We need the same args as before, but with this substitution:
+ * s!subvol=[^,]+!subvolid=0!
+ *
+ * Since the replacement string is up to 2 bytes longer than the
+ * original, allocate strlen(args) + 2 + 1 bytes.
+ */
+
+ src = strstr(args, "subvol=");
+ /* This shouldn't happen, but just in case.. */
+ if (!src)
+ return NULL;
+
+ buf = dst = kmalloc(len, GFP_NOFS);
+ if (!buf)
+ return NULL;
+
+ /*
+ * If the subvol= arg is not at the start of the string,
+ * copy whatever precedes it into buf.
+ */
+ if (src != args) {
+ *src++ = '\0';
+ strcpy(buf, args);
+ dst += strlen(args);
+ }
+
+ strcpy(dst, "subvolid=0");
+ dst += strlen("subvolid=0");
+
+ /*
+ * If there is a "," after the original subvol=... string,
+ * copy that suffix into our buffer. Otherwise, we're done.
+ */
+ src = strchr(src, ',');
+ if (src)
+ strcpy(dst, src);
+
+ return buf;
}
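
To make the substitution described above concrete, here is a purely illustrative before/after of what setup_root_args() produces (not part of the patch):

/* Example rewrite performed by setup_root_args():
 *
 *   "noatime,subvol=snapshots/daily,compress=lzo"
 * becomes
 *   "noatime,subvolid=0,compress=lzo"
 *
 * so the temporary vfs_kern_mount() in mount_subvol() below mounts the
 * tree root, and the requested subvolume is then reached by passing the
 * original subvol name to mount_subtree(). */
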
+static struct dentry *mount_subvol(const char *subvol_name, int flags,
+ const char *device_name, char *data)
+{
+ struct dentry *root;
+ struct vfsmount *mnt;
+ char *newargs;
+
+ newargs = setup_root_args(data);
+ if (!newargs)
+ return ERR_PTR(-ENOMEM);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
+ newargs);
+
+ if (PTR_RET(mnt) == -EBUSY) {
+ if (flags & MS_RDONLY) {
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
+ newargs);
+ } else {
+ int r;
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
+ newargs);
+ if (IS_ERR(mnt)) {
+ kfree(newargs);
+ return ERR_CAST(mnt);
+ }
+
+ r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+ if (r < 0) {
+ /* FIXME: release vfsmount mnt ??*/
+ kfree(newargs);
+ return ERR_PTR(r);
+ }
+ }
+ }
+
+ kfree(newargs);
+
+ if (IS_ERR(mnt))
+ return ERR_CAST(mnt);
+
+ root = mount_subtree(mnt, subvol_name);
+
+ if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+ struct super_block *s = root->d_sb;
+ dput(root);
+ root = ERR_PTR(-EINVAL);
+ deactivate_locked_super(s);
+ printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
+ subvol_name);
+ }
+
+ return root;
+}
/*
* Find a superblock for the given device / mount point.
@@ -705,13 +1229,12 @@ static int btrfs_set_super(struct super_block *s, void *data)
* for multiple device setup. Make sure to keep it in sync.
*/
static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+ const char *device_name, void *data)
{
struct block_device *bdev = NULL;
struct super_block *s;
struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
- struct btrfs_root *tree_root = NULL;
struct btrfs_fs_info *fs_info = NULL;
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
@@ -724,21 +1247,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_parse_early_options(data, mode, fs_type,
&subvol_name, &subvol_objectid,
&fs_devices);
- if (error)
+ if (error) {
+ kfree(subvol_name);
return ERR_PTR(error);
+ }
- error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
- if (error)
- goto error_free_subvol_name;
+ if (subvol_name) {
+ root = mount_subvol(subvol_name, flags, device_name, data);
+ kfree(subvol_name);
+ return root;
+ }
- error = btrfs_open_devices(fs_devices, mode, fs_type);
+ error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
if (error)
- goto error_free_subvol_name;
-
- if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
- error = -EACCES;
- goto error_close_devices;
- }
+ return ERR_PTR(error);
/*
* Setup a dummy root and fs_info for test/set super. This is because
@@ -747,129 +1269,281 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
* then open_ctree will properly initialize everything later.
*/
fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
- tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- if (!fs_info || !tree_root) {
+ if (!fs_info)
+ return ERR_PTR(-ENOMEM);
+
+ fs_info->fs_devices = fs_devices;
+
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
+ goto error_fs_info;
+ }
+
+ error = btrfs_open_devices(fs_devices, mode, fs_type);
+ if (error)
+ goto error_fs_info;
+
+ if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+ error = -EACCES;
goto error_close_devices;
}
- fs_info->tree_root = tree_root;
- fs_info->fs_devices = fs_devices;
- tree_root->fs_info = fs_info;
bdev = fs_devices->latest_bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
- if (IS_ERR(s))
- goto error_s;
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC,
+ fs_info);
+ if (IS_ERR(s)) {
+ error = PTR_ERR(s);
+ goto error_close_devices;
+ }
if (s->s_root) {
- if ((flags ^ s->s_flags) & MS_RDONLY) {
- deactivate_locked_super(s);
- error = -EBUSY;
- goto error_close_devices;
- }
-
btrfs_close_devices(fs_devices);
- kfree(fs_info);
- kfree(tree_root);
+ free_fs_info(fs_info);
+ if ((flags ^ s->s_flags) & MS_RDONLY)
+ error = -EBUSY;
} else {
char b[BDEVNAME_SIZE];
- s->s_flags = flags;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- goto error_free_subvol_name;
- }
-
- btrfs_sb(s)->fs_info->bdev_holder = fs_type;
- s->s_flags |= MS_ACTIVE;
}
- root = get_default_root(s, subvol_objectid);
- if (IS_ERR(root)) {
- error = PTR_ERR(root);
+ root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+ if (IS_ERR(root))
deactivate_locked_super(s);
- goto error_free_subvol_name;
- }
- /* if they gave us a subvolume name bind mount into that */
- if (strcmp(subvol_name, ".")) {
- struct dentry *new_root;
- mutex_lock(&root->d_inode->i_mutex);
- new_root = lookup_one_len(subvol_name, root,
- strlen(subvol_name));
- mutex_unlock(&root->d_inode->i_mutex);
-
- if (IS_ERR(new_root)) {
- dput(root);
- deactivate_locked_super(s);
- error = PTR_ERR(new_root);
- goto error_free_subvol_name;
- }
- if (!new_root->d_inode) {
- dput(root);
- dput(new_root);
- deactivate_locked_super(s);
- error = -ENXIO;
- goto error_free_subvol_name;
- }
- dput(root);
- root = new_root;
- }
- kfree(subvol_name);
return root;
-error_s:
- error = PTR_ERR(s);
error_close_devices:
btrfs_close_devices(fs_devices);
- kfree(fs_info);
- kfree(tree_root);
-error_free_subvol_name:
- kfree(subvol_name);
+error_fs_info:
+ free_fs_info(fs_info);
return ERR_PTR(error);
}
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+ int new_pool_size, int old_pool_size)
+{
+ if (new_pool_size == old_pool_size)
+ return;
+
+ fs_info->thread_pool_size = new_pool_size;
+
+ btrfs_info(fs_info, "resize thread pool %d -> %d",
+ old_pool_size, new_pool_size);
+
+ btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
+ new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
+ new_pool_size);
+}
+
+static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
+{
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
+static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts, int flags)
+{
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+ (flags & MS_RDONLY))) {
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+ if (flags & MS_RDONLY)
+ sync_filesystem(fs_info->sb);
+ }
+}
+
+static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts)
+{
+ /*
+	 * We need to clean up all defraggable inodes if autodefrag is
+	 * disabled or the fs is R/O.
+ */
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+ (fs_info->sb->s_flags & MS_RDONLY))) {
+ btrfs_cleanup_defrag_inodes(fs_info);
+ }
+
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
- struct btrfs_root *root = btrfs_sb(sb);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
+ unsigned old_flags = sb->s_flags;
+ unsigned long old_opts = fs_info->mount_opt;
+ unsigned long old_compress_type = fs_info->compress_type;
+ u64 old_max_inline = fs_info->max_inline;
+ u64 old_alloc_start = fs_info->alloc_start;
+ int old_thread_pool_size = fs_info->thread_pool_size;
+ unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
+ sync_filesystem(sb);
+ btrfs_remount_prepare(fs_info);
+
ret = btrfs_parse_options(root, data);
- if (ret)
- return -EINVAL;
+ if (ret) {
+ ret = -EINVAL;
+ goto restore;
+ }
+
+ btrfs_remount_begin(fs_info, old_opts, *flags);
+ btrfs_resize_thread_pool(fs_info,
+ fs_info->thread_pool_size, old_thread_pool_size);
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
- return 0;
+ goto out;
if (*flags & MS_RDONLY) {
+ /*
+ * this also happens on 'umount -rf' or on shutdown, when
+ * the filesystem is busy.
+ */
+ cancel_work_sync(&fs_info->async_reclaim_work);
+
+ /* wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+		/* avoid complaints from lockdep et al. */
+ up(&fs_info->uuid_tree_rescan_sem);
+
sb->s_flags |= MS_RDONLY;
- ret = btrfs_commit_super(root);
- WARN_ON(ret);
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+ btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
+
+ ret = btrfs_commit_super(root);
+ if (ret)
+ goto restore;
} else {
- if (root->fs_info->fs_devices->rw_devices == 0)
- return -EACCES;
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ btrfs_err(fs_info,
+ "Remounting read-write after error is not allowed");
+ ret = -EINVAL;
+ goto restore;
+ }
+ if (fs_info->fs_devices->rw_devices == 0) {
+ ret = -EACCES;
+ goto restore;
+ }
+
+ if (fs_info->fs_devices->missing_devices >
+ fs_info->num_tolerated_disk_barrier_failures &&
+ !(*flags & MS_RDONLY)) {
+ btrfs_warn(fs_info,
+ "too many missing devices, writeable remount is not allowed");
+ ret = -EACCES;
+ goto restore;
+ }
- if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
- return -EINVAL;
+ if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ ret = -EINVAL;
+ goto restore;
+ }
- ret = btrfs_cleanup_fs_roots(root->fs_info);
- WARN_ON(ret);
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto restore;
/* recover relocation */
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(root);
- WARN_ON(ret);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret)
+ goto restore;
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret)
+ goto restore;
+
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info, "failed to resume dev_replace");
+ goto restore;
+ }
+ if (!fs_info->uuid_root) {
+ btrfs_info(fs_info, "creating UUID tree");
+ ret = btrfs_create_uuid_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info, "failed to create the UUID tree %d", ret);
+ goto restore;
+ }
+ }
sb->s_flags &= ~MS_RDONLY;
}
+out:
+ wake_up_process(fs_info->transaction_kthread);
+ btrfs_remount_cleanup(fs_info, old_opts);
+ return 0;
+
+restore:
+ /* We've hit an error - don't reset MS_RDONLY */
+ if (sb->s_flags & MS_RDONLY)
+ old_flags |= MS_RDONLY;
+ sb->s_flags = old_flags;
+ fs_info->mount_opt = old_opts;
+ fs_info->compress_type = old_compress_type;
+ fs_info->max_inline = old_max_inline;
+ mutex_lock(&fs_info->chunk_mutex);
+ fs_info->alloc_start = old_alloc_start;
+ mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_resize_thread_pool(fs_info,
+ old_thread_pool_size, fs_info->thread_pool_size);
+ fs_info->metadata_ratio = old_metadata_ratio;
+ btrfs_remount_cleanup(fs_info, old_opts);
+ return ret;
+}
+/* Used to sort the devices by max_avail (descending sort) */
+static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+ const void *dev_info2)
+{
+ if (((struct btrfs_device_info *)dev_info1)->max_avail >
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return -1;
+ else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return 1;
+ else
return 0;
}
/*
+ * Sort the devices by max_avail, which holds the maximum free extent size of
+ * each device (descending sort).
+ */
+static inline void btrfs_descending_sort_devices(
+ struct btrfs_device_info *devices,
+ size_t nr_devices)
+{
+ sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_free_bytes, NULL);
+}
+
+/*
 * Helper to calculate the free space on the devices that can be used to store
 * file data.
*/
@@ -884,34 +1558,39 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
u64 avail_space;
u64 used_space;
u64 min_stripe_size;
- int min_stripes = 1;
+ int min_stripes = 1, num_stripes = 1;
int i = 0, nr_devices;
int ret;
- nr_devices = fs_info->fs_devices->rw_devices;
+ nr_devices = fs_info->fs_devices->open_devices;
BUG_ON(!nr_devices);
- devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+ devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
GFP_NOFS);
if (!devices_info)
return -ENOMEM;
 /* calc min stripe number for data space allocation */
type = btrfs_get_alloc_profile(root, 1);
- if (type & BTRFS_BLOCK_GROUP_RAID0)
+ if (type & BTRFS_BLOCK_GROUP_RAID0) {
min_stripes = 2;
- else if (type & BTRFS_BLOCK_GROUP_RAID1)
+ num_stripes = nr_devices;
+ } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
min_stripes = 2;
- else if (type & BTRFS_BLOCK_GROUP_RAID10)
+ num_stripes = 2;
+ } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
min_stripes = 4;
+ num_stripes = 4;
+ }
if (type & BTRFS_BLOCK_GROUP_DUP)
min_stripe_size = 2 * BTRFS_STRIPE_LEN;
else
min_stripe_size = BTRFS_STRIPE_LEN;
- list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
- if (!device->in_fs_metadata)
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->in_fs_metadata || !device->bdev ||
+ device->is_tgtdev_for_dev_replace)
continue;
avail_space = device->total_bytes - device->bytes_used;
@@ -972,13 +1651,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
i = nr_devices - 1;
avail_space = 0;
while (nr_devices >= min_stripes) {
+ if (num_stripes > nr_devices)
+ num_stripes = nr_devices;
+
if (devices_info[i].max_avail >= min_stripe_size) {
int j;
u64 alloc_size;
- avail_space += devices_info[i].max_avail * min_stripes;
+ avail_space += devices_info[i].max_avail * num_stripes;
alloc_size = devices_info[i].max_avail;
- for (j = i + 1 - min_stripes; j <= i; j++)
+ for (j = i + 1 - num_stripes; j <= i; j++)
devices_info[j].max_avail -= alloc_size;
}
i--;
@@ -992,18 +1674,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct btrfs_root *root = btrfs_sb(dentry->d_sb);
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
- struct list_head *head = &root->fs_info->space_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
- __be32 *fsid = (__be32 *)root->fs_info->fsid;
+ __be32 *fsid = (__be32 *)fs_info->fsid;
int ret;
 /* holding chunk_mutex to avoid allocating new chunks */
- mutex_lock(&root->fs_info->chunk_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1022,14 +1704,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bavail = total_free_data;
- ret = btrfs_calc_avail_data_space(root, &total_free_data);
+ ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
if (ret) {
- mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
buf->f_bavail += total_free_data;
buf->f_bavail = buf->f_bavail >> bits;
- mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
@@ -1043,13 +1725,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+static void btrfs_kill_super(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ kill_anon_super(sb);
+ free_fs_info(fs_info);
+}
+
static struct file_system_type btrfs_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
.mount = btrfs_mount,
- .kill_sb = kill_anon_super,
+ .kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("btrfs");
/*
* used by btrfsctl to scan devices when no FS is mounted
@@ -1073,6 +1763,13 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
ret = btrfs_scan_one_device(vol->name, FMODE_READ,
&btrfs_fs_type, &fs_devices);
break;
+ case BTRFS_IOC_DEVICES_READY:
+ ret = btrfs_scan_one_device(vol->name, FMODE_READ,
+ &btrfs_fs_type, &fs_devices);
+ if (ret)
+ break;
+ ret = !(fs_devices->num_devices == fs_devices->total_devices);
+ break;
}
kfree(vol);
@@ -1081,17 +1778,56 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
static int btrfs_freeze(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb);
- mutex_lock(&root->fs_info->transaction_kthread_mutex);
- mutex_lock(&root->fs_info->cleaner_mutex);
- return 0;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
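+ /* Rather than holding mutexes as the old code did, commit any running transaction so the frozen fs is consistent on disk. */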
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ /* no transaction, don't bother */
+ if (PTR_ERR(trans) == -ENOENT)
+ return 0;
+ return PTR_ERR(trans);
+ }
+ return btrfs_commit_transaction(trans, root);
}
static int btrfs_unfreeze(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb);
- mutex_unlock(&root->fs_info->cleaner_mutex);
- mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ return 0;
+}
+
+static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
+ struct btrfs_fs_devices *cur_devices;
+ struct btrfs_device *dev, *first_dev = NULL;
+ struct list_head *head;
+ struct rcu_string *name;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ cur_devices = fs_info->fs_devices;
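+ /* Walk all device lists, including seed devices, and report the present device with the lowest devid. */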
+ while (cur_devices) {
+ head = &cur_devices->devices;
+ list_for_each_entry(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
+ if (!dev->name)
+ continue;
+ if (!first_dev || dev->devid < first_dev->devid)
+ first_dev = dev;
+ }
+ cur_devices = cur_devices->seed;
+ }
+
+ if (first_dev) {
+ rcu_read_lock();
+ name = rcu_dereference(first_dev->name);
+ seq_escape(m, name->str, " \t\n\\");
+ rcu_read_unlock();
+ } else {
+ WARN_ON(1);
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return 0;
}
@@ -1101,8 +1837,8 @@ static const struct super_operations btrfs_super_ops = {
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
+ .show_devname = btrfs_show_devname,
.write_inode = btrfs_write_inode,
- .dirty_inode = btrfs_dirty_inode,
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_destroy_inode,
.statfs = btrfs_statfs,
@@ -1135,20 +1871,65 @@ static int btrfs_interface_init(void)
static void btrfs_interface_exit(void)
{
if (misc_deregister(&btrfs_misc) < 0)
- printk(KERN_INFO "misc_deregister failed for control device");
+ printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
+}
+
+static void btrfs_print_info(void)
+{
+ printk(KERN_INFO "Btrfs loaded"
+#ifdef CONFIG_BTRFS_DEBUG
+ ", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_ASSERT
+ ", assert=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ ", integrity-checker=on"
+#endif
+ "\n");
+}
+
+static int btrfs_run_sanity_tests(void)
+{
+ int ret;
+
+ ret = btrfs_init_test_fs();
+ if (ret)
+ return ret;
+
+ ret = btrfs_test_free_space_cache();
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_buffer_operations();
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_io();
+ if (ret)
+ goto out;
+ ret = btrfs_test_inodes();
+ if (ret)
+ goto out;
+ ret = btrfs_test_qgroups();
+out:
+ btrfs_destroy_test_fs();
+ return ret;
}
static int __init init_btrfs_fs(void)
{
int err;
- err = btrfs_init_sysfs();
+ err = btrfs_hash_init();
if (err)
return err;
- err = btrfs_init_compress();
+ btrfs_props_init();
+
+ err = btrfs_init_sysfs();
if (err)
- goto free_sysfs;
+ goto free_hash;
+
+ btrfs_init_compress();
err = btrfs_init_cachep();
if (err)
@@ -1162,19 +1943,56 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_extent_io;
- err = btrfs_interface_init();
+ err = ordered_data_init();
if (err)
goto free_extent_map;
+ err = btrfs_delayed_inode_init();
+ if (err)
+ goto free_ordered_data;
+
+ err = btrfs_auto_defrag_init();
+ if (err)
+ goto free_delayed_inode;
+
+ err = btrfs_delayed_ref_init();
+ if (err)
+ goto free_auto_defrag;
+
+ err = btrfs_prelim_ref_init();
+ if (err)
+ goto free_prelim_ref;
+
+ err = btrfs_interface_init();
+ if (err)
+ goto free_delayed_ref;
+
+ btrfs_init_lockdep();
+
+ btrfs_print_info();
+
+ err = btrfs_run_sanity_tests();
+ if (err)
+ goto unregister_ioctl;
+
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
- printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
return 0;
unregister_ioctl:
btrfs_interface_exit();
+free_prelim_ref:
+ btrfs_prelim_ref_exit();
+free_delayed_ref:
+ btrfs_delayed_ref_exit();
+free_auto_defrag:
+ btrfs_auto_defrag_exit();
+free_delayed_inode:
+ btrfs_delayed_inode_exit();
+free_ordered_data:
+ ordered_data_exit();
free_extent_map:
extent_map_exit();
free_extent_io:
@@ -1183,14 +2001,20 @@ free_cachep:
btrfs_destroy_cachep();
free_compress:
btrfs_exit_compress();
-free_sysfs:
btrfs_exit_sysfs();
+free_hash:
+ btrfs_hash_exit();
return err;
}
static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
+ btrfs_delayed_ref_exit();
+ btrfs_auto_defrag_exit();
+ btrfs_delayed_inode_exit();
+ btrfs_prelim_ref_exit();
+ ordered_data_exit();
extent_map_exit();
extent_io_exit();
btrfs_interface_exit();
@@ -1198,9 +2022,10 @@ static void __exit exit_btrfs_fs(void)
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
btrfs_exit_compress();
+ btrfs_hash_exit();
}
-module_init(init_btrfs_fs)
+late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4ce16ef702a..78699364f53 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,249 +21,733 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
-#include <linux/module.h>
#include <linux/kobject.h>
+#include <linux/bug.h>
+#include <linux/genhd.h>
+#include <linux/debugfs.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
+#include "sysfs.h"
+#include "volumes.h"
-static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+
+static u64 get_features(struct btrfs_fs_info *fs_info,
+ enum btrfs_feature_set set)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)btrfs_root_used(&root->root_item));
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ if (set == FEAT_COMPAT)
+ return btrfs_super_compat_flags(disk_super);
+ else if (set == FEAT_COMPAT_RO)
+ return btrfs_super_compat_ro_flags(disk_super);
+ else
+ return btrfs_super_incompat_flags(disk_super);
}
-static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
+static void set_features(struct btrfs_fs_info *fs_info,
+ enum btrfs_feature_set set, u64 features)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)btrfs_root_limit(&root->root_item));
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ if (set == FEAT_COMPAT)
+ btrfs_set_super_compat_flags(disk_super, features);
+ else if (set == FEAT_COMPAT_RO)
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ else
+ btrfs_set_super_incompat_flags(disk_super, features);
}
-static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
+static int can_modify_feature(struct btrfs_feature_attr *fa)
{
+ int val = 0;
+ u64 set, clear;
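+ /* Return value: bit 0 set if the feature may be enabled online, bit 1 set if it may be cleared online. */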
+ switch (fa->feature_set) {
+ case FEAT_COMPAT:
+ set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+ break;
+ case FEAT_COMPAT_RO:
+ set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+ break;
+ case FEAT_INCOMPAT:
+ set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+ break;
+ default:
+ printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n",
+ fa->feature_set);
+ return 0;
+ }
+
+ if (set & fa->feature_bit)
+ val |= 1;
+ if (clear & fa->feature_bit)
+ val |= 2;
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
+ return val;
}
-static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
+static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
+ int val = 0;
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+ if (fs_info) {
+ u64 features = get_features(fs_info, fa->feature_set);
+ if (features & fa->feature_bit)
+ val = 1;
+ } else
+ val = can_modify_feature(fa);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", val);
}
-static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
+static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t count)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+ struct btrfs_trans_handle *trans;
+ u64 features, set, clear;
+ unsigned long val;
+ int ret;
+
+ fs_info = to_fs_info(kobj);
+ if (!fs_info)
+ return -EPERM;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &val);
+ if (ret)
+ return ret;
+
+ if (fa->feature_set == FEAT_COMPAT) {
+ set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+ } else if (fa->feature_set == FEAT_COMPAT_RO) {
+ set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+ } else {
+ set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+ }
+
+ features = get_features(fs_info, fa->feature_set);
+
+ /* Nothing to do */
+ if ((val && (features & fa->feature_bit)) ||
+ (!val && !(features & fa->feature_bit)))
+ return count;
+
+ if ((val && !(set & fa->feature_bit)) ||
+ (!val && !(clear & fa->feature_bit))) {
+ btrfs_info(fs_info,
+ "%sabling feature %s on mounted fs is not supported.",
+ val ? "En" : "Dis", fa->kobj_attr.attr.name);
+ return -EPERM;
+ }
+
+ btrfs_info(fs_info, "%s %s feature flag",
+ val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
+
+ trans = btrfs_start_transaction(fs_info->fs_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ spin_lock(&fs_info->super_lock);
+ features = get_features(fs_info, fa->feature_set);
+ if (val)
+ features |= fa->feature_bit;
+ else
+ features &= ~fa->feature_bit;
+ set_features(fs_info, fa->feature_set, features);
+ spin_unlock(&fs_info->super_lock);
+
+ ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+ if (ret)
+ return ret;
+
+ return count;
}
-/* this is for root attrs (subvols/snapshots) */
-struct btrfs_root_attr {
- struct attribute attr;
- ssize_t (*show)(struct btrfs_root *, char *);
- ssize_t (*store)(struct btrfs_root *, const char *, size_t);
-};
+static umode_t btrfs_feature_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ umode_t mode = attr->mode;
-#define ROOT_ATTR(name, mode, show, store) \
-static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
- show, store)
+ if (fs_info) {
+ struct btrfs_feature_attr *fa;
+ u64 features;
-ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
-ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
+ fa = attr_to_btrfs_feature_attr(attr);
+ features = get_features(fs_info, fa->feature_set);
-static struct attribute *btrfs_root_attrs[] = {
- &btrfs_root_attr_blocks_used.attr,
- &btrfs_root_attr_block_limit.attr,
- NULL,
+ if (can_modify_feature(fa))
+ mode |= S_IWUSR;
+ else if (!(features & fa->feature_bit))
+ mode = 0;
+ }
+
+ return mode;
+}
+
+BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
+BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
+BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
+BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
+BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
+BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
+BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
+BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
+BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+
+static struct attribute *btrfs_supported_feature_attrs[] = {
+ BTRFS_FEAT_ATTR_PTR(mixed_backref),
+ BTRFS_FEAT_ATTR_PTR(default_subvol),
+ BTRFS_FEAT_ATTR_PTR(mixed_groups),
+ BTRFS_FEAT_ATTR_PTR(compress_lzo),
+ BTRFS_FEAT_ATTR_PTR(big_metadata),
+ BTRFS_FEAT_ATTR_PTR(extended_iref),
+ BTRFS_FEAT_ATTR_PTR(raid56),
+ BTRFS_FEAT_ATTR_PTR(skinny_metadata),
+ BTRFS_FEAT_ATTR_PTR(no_holes),
+ NULL
};
-/* this is for super attrs (actual full fs) */
-struct btrfs_super_attr {
- struct attribute attr;
- ssize_t (*show)(struct btrfs_fs_info *, char *);
- ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
+static const struct attribute_group btrfs_feature_attr_group = {
+ .name = "features",
+ .is_visible = btrfs_feature_visible,
+ .attrs = btrfs_supported_feature_attrs,
};
-#define SUPER_ATTR(name, mode, show, store) \
-static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
- show, store)
+static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
+{
+ u64 val;
+ if (lock)
+ spin_lock(lock);
+ val = *value_ptr;
+ if (lock)
+ spin_unlock(lock);
+ return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
-SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
-SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
-SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
+static ssize_t global_rsv_size_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show);
-static struct attribute *btrfs_super_attrs[] = {
- &btrfs_super_attr_blocks_used.attr,
- &btrfs_super_attr_total_blocks.attr,
- &btrfs_super_attr_blocksize.attr,
- NULL,
+static ssize_t global_rsv_reserved_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
+
+#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
+#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf);
+BTRFS_RAID_ATTR(total_bytes, raid_bytes_show);
+BTRFS_RAID_ATTR(used_bytes, raid_bytes_show);
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
+ struct btrfs_block_group_cache *block_group;
+ int index = to_raid_kobj(kobj)->raid_type;
+ u64 val = 0;
+
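+ /* Sum either the total size or the used bytes of all block groups of this RAID type. */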
+ down_read(&sinfo->groups_sem);
+ list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
+ if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes))
+ val += block_group->key.offset;
+ else
+ val += btrfs_block_group_used(&block_group->item);
+ }
+ up_read(&sinfo->groups_sem);
+ return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
+
+static struct attribute *raid_attributes[] = {
+ BTRFS_RAID_ATTR_PTR(total_bytes),
+ BTRFS_RAID_ATTR_PTR(used_bytes),
+ NULL
};
-static ssize_t btrfs_super_attr_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
+static void release_raid_kobj(struct kobject *kobj)
{
- struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
- super_kobj);
- struct btrfs_super_attr *a = container_of(attr,
- struct btrfs_super_attr,
- attr);
+ kfree(to_raid_kobj(kobj));
+}
- return a->show ? a->show(fs, buf) : 0;
+struct kobj_type btrfs_raid_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = release_raid_kobj,
+ .default_attrs = raid_attributes,
+};
+
+#define SPACE_INFO_ATTR(field) \
+static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_space_info *sinfo = to_space_info(kobj); \
+ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
+} \
+BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field)
+
+static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+ s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
+ return snprintf(buf, PAGE_SIZE, "%lld\n", val);
}
-static ssize_t btrfs_super_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf, size_t len)
+SPACE_INFO_ATTR(flags);
+SPACE_INFO_ATTR(total_bytes);
+SPACE_INFO_ATTR(bytes_used);
+SPACE_INFO_ATTR(bytes_pinned);
+SPACE_INFO_ATTR(bytes_reserved);
+SPACE_INFO_ATTR(bytes_may_use);
+SPACE_INFO_ATTR(disk_used);
+SPACE_INFO_ATTR(disk_total);
+BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned);
+
+static struct attribute *space_info_attrs[] = {
+ BTRFS_ATTR_PTR(flags),
+ BTRFS_ATTR_PTR(total_bytes),
+ BTRFS_ATTR_PTR(bytes_used),
+ BTRFS_ATTR_PTR(bytes_pinned),
+ BTRFS_ATTR_PTR(bytes_reserved),
+ BTRFS_ATTR_PTR(bytes_may_use),
+ BTRFS_ATTR_PTR(disk_used),
+ BTRFS_ATTR_PTR(disk_total),
+ BTRFS_ATTR_PTR(total_bytes_pinned),
+ NULL,
+};
+
+static void space_info_release(struct kobject *kobj)
{
- struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
- super_kobj);
- struct btrfs_super_attr *a = container_of(attr,
- struct btrfs_super_attr,
- attr);
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+ percpu_counter_destroy(&sinfo->total_bytes_pinned);
+ kfree(sinfo);
+}
+
+struct kobj_type space_info_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = space_info_release,
+ .default_attrs = space_info_attrs,
+};
- return a->store ? a->store(fs, buf, len) : 0;
+static const struct attribute *allocation_attrs[] = {
+ BTRFS_ATTR_PTR(global_rsv_reserved),
+ BTRFS_ATTR_PTR(global_rsv_size),
+ NULL,
+};
+
+static ssize_t btrfs_label_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label);
}
-static ssize_t btrfs_root_attr_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
+static ssize_t btrfs_label_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
{
- struct btrfs_root *root = container_of(kobj, struct btrfs_root,
- root_kobj);
- struct btrfs_root_attr *a = container_of(attr,
- struct btrfs_root_attr,
- attr);
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = fs_info->fs_root;
+ int ret;
+
+ if (len >= BTRFS_LABEL_SIZE)
+ return -EINVAL;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ spin_lock(&root->fs_info->super_lock);
+ strcpy(fs_info->super_copy->label, buf);
+ spin_unlock(&root->fs_info->super_lock);
+ ret = btrfs_commit_transaction(trans, root);
+
+ if (!ret)
+ return len;
+
+ return ret;
+}
+BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
- return a->show ? a->show(root, buf) : 0;
+static ssize_t btrfs_no_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ return -EPERM;
}
-static ssize_t btrfs_root_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf, size_t len)
+static ssize_t btrfs_nodesize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
{
- struct btrfs_root *root = container_of(kobj, struct btrfs_root,
- root_kobj);
- struct btrfs_root_attr *a = container_of(attr,
- struct btrfs_root_attr,
- attr);
- return a->store ? a->store(root, buf, len) : 0;
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
}
-static void btrfs_super_release(struct kobject *kobj)
+BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store);
+
+static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
{
- struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
- super_kobj);
- complete(&fs->kobj_unregister);
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
-static void btrfs_root_release(struct kobject *kobj)
+BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store);
+
+static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
{
- struct btrfs_root *root = container_of(kobj, struct btrfs_root,
- root_kobj);
- complete(&root->kobj_unregister);
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
-static const struct sysfs_ops btrfs_super_attr_ops = {
- .show = btrfs_super_attr_show,
- .store = btrfs_super_attr_store,
-};
+BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store);
-static const struct sysfs_ops btrfs_root_attr_ops = {
- .show = btrfs_root_attr_show,
- .store = btrfs_root_attr_store,
+static struct attribute *btrfs_attrs[] = {
+ BTRFS_ATTR_PTR(label),
+ BTRFS_ATTR_PTR(nodesize),
+ BTRFS_ATTR_PTR(sectorsize),
+ BTRFS_ATTR_PTR(clone_alignment),
+ NULL,
};
-static struct kobj_type btrfs_root_ktype = {
- .default_attrs = btrfs_root_attrs,
- .sysfs_ops = &btrfs_root_attr_ops,
- .release = btrfs_root_release,
+static void btrfs_release_super_kobj(struct kobject *kobj)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ complete(&fs_info->kobj_unregister);
+}
+
+static struct kobj_type btrfs_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = btrfs_release_super_kobj,
+ .default_attrs = btrfs_attrs,
};
-static struct kobj_type btrfs_super_ktype = {
- .default_attrs = btrfs_super_attrs,
- .sysfs_ops = &btrfs_super_attr_ops,
- .release = btrfs_super_release,
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
+{
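+ /* The feature attributes are also exposed under the global btrfs kset; return NULL when kobj is not a per-fs kobject. */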
+ if (kobj->ktype != &btrfs_ktype)
+ return NULL;
+ return container_of(kobj, struct btrfs_fs_info, super_kobj);
+}
+
+#define NUM_FEATURE_BITS 64
+static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
+static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
+
+static u64 supported_feature_masks[3] = {
+ [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
+ [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
+ [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
};
-/* /sys/fs/btrfs/ entry */
-static struct kset *btrfs_kset;
+static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
+{
+ int set;
+
+ for (set = 0; set < FEAT_MAX; set++) {
+ int i;
+ struct attribute *attrs[2];
+ struct attribute_group agroup = {
+ .name = "features",
+ .attrs = attrs,
+ };
+ u64 features = get_features(fs_info, set);
+ features &= ~supported_feature_masks[set];
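+ /* Only feature bits unknown to this kernel remain set. */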
+
+ if (!features)
+ continue;
+
+ attrs[1] = NULL;
+ for (i = 0; i < NUM_FEATURE_BITS; i++) {
+ struct btrfs_feature_attr *fa;
+
+ if (!(features & (1ULL << i)))
+ continue;
+
+ fa = &btrfs_feature_attrs[set][i];
+ attrs[0] = &fa->kobj_attr.attr;
+ if (add) {
+ int ret;
+ ret = sysfs_merge_group(&fs_info->super_kobj,
+ &agroup);
+ if (ret)
+ return ret;
+ } else
+ sysfs_unmerge_group(&fs_info->super_kobj,
+ &agroup);
+ }
+
+ }
+ return 0;
+}
-int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
+static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
{
- int error;
- char *name;
- char c;
- int len = strlen(fs->sb->s_id) + 1;
+ kobject_del(&fs_info->super_kobj);
+ kobject_put(&fs_info->super_kobj);
+ wait_for_completion(&fs_info->kobj_unregister);
+}
+
+void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->space_info_kobj) {
+ sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
+ kobject_del(fs_info->space_info_kobj);
+ kobject_put(fs_info->space_info_kobj);
+ }
+ kobject_del(fs_info->device_dir_kobj);
+ kobject_put(fs_info->device_dir_kobj);
+ addrm_unknown_feature_attrs(fs_info, false);
+ sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
+ __btrfs_sysfs_remove_one(fs_info);
+}
+
+const char * const btrfs_feature_set_names[3] = {
+ [FEAT_COMPAT] = "compat",
+ [FEAT_COMPAT_RO] = "compat_ro",
+ [FEAT_INCOMPAT] = "incompat",
+};
+
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
+{
+ size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */
+ int len = 0;
int i;
+ char *str;
- name = kmalloc(len, GFP_NOFS);
- if (!name) {
- error = -ENOMEM;
- goto fail;
+ str = kmalloc(bufsize, GFP_KERNEL);
+ if (!str)
+ return str;
+
+ for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+ const char *name;
+
+ if (!(flags & (1ULL << i)))
+ continue;
+
+ name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
+ len += snprintf(str + len, bufsize - len, "%s%s",
+ len ? "," : "", name);
}
- for (i = 0; i < len; i++) {
- c = fs->sb->s_id[i];
- if (c == '/' || c == '\\')
- c = '!';
- name[i] = c;
+ return str;
+}
+
+static void init_feature_attrs(void)
+{
+ struct btrfs_feature_attr *fa;
+ int set, i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
+ ARRAY_SIZE(btrfs_feature_attrs));
+ BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
+ ARRAY_SIZE(btrfs_feature_attrs[0]));
+
+ memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
+ memset(btrfs_unknown_feature_names, 0,
+ sizeof(btrfs_unknown_feature_names));
+
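+ /* Copy the names of the supported features first, then synthesize "set:bit" names for the remaining unknown bits. */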
+ for (i = 0; btrfs_supported_feature_attrs[i]; i++) {
+ struct btrfs_feature_attr *sfa;
+ struct attribute *a = btrfs_supported_feature_attrs[i];
+ int bit;
+ sfa = attr_to_btrfs_feature_attr(a);
+ bit = ilog2(sfa->feature_bit);
+ fa = &btrfs_feature_attrs[sfa->feature_set][bit];
+
+ fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name;
}
- name[len] = '\0';
- fs->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
- NULL, "%s", name);
- kfree(name);
- if (error)
- goto fail;
+ for (set = 0; set < FEAT_MAX; set++) {
+ for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+ char *name = btrfs_unknown_feature_names[set][i];
+ fa = &btrfs_feature_attrs[set][i];
+
+ if (fa->kobj_attr.attr.name)
+ continue;
+
+ snprintf(name, 13, "%s:%u",
+ btrfs_feature_set_names[set], i);
+
+ fa->kobj_attr.attr.name = name;
+ fa->kobj_attr.attr.mode = S_IRUGO;
+ fa->feature_set = set;
+ fa->feature_bit = 1ULL << i;
+ }
+ }
+}
+
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
+{
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!fs_info->device_dir_kobj)
+ return -EINVAL;
+
+ if (one_device) {
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_info->device_dir_kobj,
+ disk_kobj->name);
+ }
return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
+{
+ int error = 0;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *dev;
+
+ if (!fs_info->device_dir_kobj)
+ fs_info->device_dir_kobj = kobject_create_and_add("devices",
+ &fs_info->super_kobj);
+
+ if (!fs_info->device_dir_kobj)
+ return -ENOMEM;
+
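+ /* Create a symlink under devices/ for each block device, or only for one_device when it is given. */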
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!dev->bdev)
+ continue;
+
+ if (one_device && one_device != dev)
+ continue;
+
+ disk = dev->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ error = sysfs_create_link(fs_info->device_dir_kobj,
+ disk_kobj, disk_kobj->name);
+ if (error)
+ break;
+ }
-fail:
- printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
return error;
}
-int btrfs_sysfs_add_root(struct btrfs_root *root)
+/* /sys/fs/btrfs/ entry */
+static struct kset *btrfs_kset;
+
+/* /sys/kernel/debug/btrfs */
+static struct dentry *btrfs_debugfs_root_dentry;
+
+/* Debugging tunables and exported data */
+u64 btrfs_debugfs_test;
+
+int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
{
int error;
- error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
- &root->fs_info->super_kobj,
- "%s", root->name);
+ init_completion(&fs_info->kobj_unregister);
+ fs_info->super_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_info->fsid);
if (error)
- goto fail;
+ return error;
- return 0;
+ error = sysfs_create_group(&fs_info->super_kobj,
+ &btrfs_feature_attr_group);
+ if (error) {
+ __btrfs_sysfs_remove_one(fs_info);
+ return error;
+ }
+
+ error = addrm_unknown_feature_attrs(fs_info, true);
+ if (error)
+ goto failure;
+
+ error = btrfs_kobj_add_device(fs_info, NULL);
+ if (error)
+ goto failure;
-fail:
- printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
+ fs_info->space_info_kobj = kobject_create_and_add("allocation",
+ &fs_info->super_kobj);
+ if (!fs_info->space_info_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
+ if (error)
+ goto failure;
+
+ return 0;
+failure:
+ btrfs_sysfs_remove_one(fs_info);
return error;
}
-void btrfs_sysfs_del_root(struct btrfs_root *root)
+static int btrfs_init_debugfs(void)
{
- kobject_put(&root->root_kobj);
- wait_for_completion(&root->kobj_unregister);
-}
+#ifdef CONFIG_DEBUG_FS
+ btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
+ if (!btrfs_debugfs_root_dentry)
+ return -ENOMEM;
-void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
-{
- kobject_put(&fs->super_kobj);
- wait_for_completion(&fs->kobj_unregister);
+ debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
+ &btrfs_debugfs_test);
+#endif
+ return 0;
}
int btrfs_init_sysfs(void)
{
+ int ret;
+
btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
if (!btrfs_kset)
return -ENOMEM;
- return 0;
+
+ ret = btrfs_init_debugfs();
+ if (ret)
+ return ret;
+
+ init_feature_attrs();
+ ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+
+ return ret;
}
void btrfs_exit_sysfs(void)
{
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
new file mode 100644
index 00000000000..ac46df37504
--- /dev/null
+++ b/fs/btrfs/sysfs.h
@@ -0,0 +1,73 @@
+#ifndef _BTRFS_SYSFS_H_
+#define _BTRFS_SYSFS_H_
+
+/*
+ * Data exported through sysfs
+ */
+extern u64 btrfs_debugfs_test;
+
+enum btrfs_feature_set {
+ FEAT_COMPAT,
+ FEAT_COMPAT_RO,
+ FEAT_INCOMPAT,
+ FEAT_MAX
+};
+
+#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
+{ \
+ .attr = { .name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
+
+#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \
+static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, _mode, _show, _store)
+#define BTRFS_ATTR(_name, _mode, _show) \
+ BTRFS_ATTR_RW(_name, _mode, _show, NULL)
+#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
+
+#define BTRFS_RAID_ATTR(_name, _show) \
+static struct kobj_attribute btrfs_raid_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
+
+
+struct btrfs_feature_attr {
+ struct kobj_attribute kobj_attr;
+ enum btrfs_feature_set feature_set;
+ u64 feature_bit;
+};
+
+#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \
+static struct btrfs_feature_attr btrfs_attr_##_name = { \
+ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
+ btrfs_feature_attr_show, \
+ btrfs_feature_attr_store), \
+ .feature_set = _feature_set, \
+ .feature_bit = _prefix ##_## _feature_bit, \
+}
+#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr)
+
+#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
+
+/* convert from attribute */
+#define to_btrfs_feature_attr(a) \
+ container_of(a, struct btrfs_feature_attr, kobj_attr)
+#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr)
+#define attr_to_btrfs_feature_attr(a) \
+ to_btrfs_feature_attr(attr_to_btrfs_attr(a))
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
+extern const char * const btrfs_feature_set_names[3];
+extern struct kobj_type space_info_ktype;
+extern struct kobj_type btrfs_raid_ktype;
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
+#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
new file mode 100644
index 00000000000..9626252ee6b
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
+static struct vfsmount *test_mnt = NULL;
+
+static const struct super_operations btrfs_test_super_ops = {
+ .alloc_inode = btrfs_alloc_inode,
+ .destroy_inode = btrfs_test_destroy_inode,
+};
+
+static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
+{
+ return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
+ NULL, BTRFS_TEST_MAGIC);
+}
+
+static struct file_system_type test_type = {
+ .name = "btrfs_test_fs",
+ .mount = btrfs_test_mount,
+ .kill_sb = kill_anon_super,
+};
+
+struct inode *btrfs_new_test_inode(void)
+{
+ return new_inode(test_mnt->mnt_sb);
+}
+
+int btrfs_init_test_fs(void)
+{
+ int ret;
+
+ ret = register_filesystem(&test_type);
+ if (ret) {
+ printk(KERN_ERR "btrfs: cannot register test file system\n");
+ return ret;
+ }
+
+ test_mnt = kern_mount(&test_type);
+ if (IS_ERR(test_mnt)) {
+ printk(KERN_ERR "btrfs: cannot mount test file system\n");
+ unregister_filesystem(&test_type);
+ return PTR_ERR(test_mnt);
+ }
+ return 0;
+}
+
+void btrfs_destroy_test_fs(void)
+{
+ kern_unmount(test_mnt);
+ unregister_filesystem(&test_type);
+}
+
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
+{
+ struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+ GFP_NOFS);
+
+ if (!fs_info)
+ return fs_info;
+ fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+ GFP_NOFS);
+ if (!fs_info->fs_devices) {
+ kfree(fs_info);
+ return NULL;
+ }
+ fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
+ GFP_NOFS);
+ if (!fs_info->super_copy) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ if (init_srcu_struct(&fs_info->subvol_srcu)) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info->super_copy);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->qgroup_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ mutex_init(&fs_info->qgroup_rescan_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
+ fs_info->running_transaction = NULL;
+ fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_ulist = NULL;
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ return fs_info;
+}
+
+static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+
+ spin_lock(&fs_info->buffer_lock);
+restart:
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+ struct extent_buffer *eb;
+
+ eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+ if (!eb)
+ continue;
+ /* Shouldn't happen but that kind of thinking creates CVEs */
+ if (radix_tree_exception(eb)) {
+ if (radix_tree_deref_retry(eb))
+ goto restart;
+ continue;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+ free_extent_buffer_stale(eb);
+ spin_lock(&fs_info->buffer_lock);
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ btrfs_free_qgroup_config(fs_info);
+ btrfs_free_fs_roots(fs_info);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+}
+
+void btrfs_free_dummy_root(struct btrfs_root *root)
+{
+ if (!root)
+ return;
+ if (root->node)
+ free_extent_buffer(root->node);
+ if (root->fs_info)
+ btrfs_free_dummy_fs_info(root->fs_info);
+ kfree(root);
+}
+
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
new file mode 100644
index 00000000000..fd395422448
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_TESTS
+#define __BTRFS_TESTS
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+
+#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
+
+struct btrfs_root;
+
+int btrfs_test_free_space_cache(void);
+int btrfs_test_extent_buffer_operations(void);
+int btrfs_test_extent_io(void);
+int btrfs_test_inodes(void);
+int btrfs_test_qgroups(void);
+int btrfs_init_test_fs(void);
+void btrfs_destroy_test_fs(void);
+struct inode *btrfs_new_test_inode(void);
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+void btrfs_free_dummy_root(struct btrfs_root *root);
+#else
+static inline int btrfs_test_free_space_cache(void)
+{
+ return 0;
+}
+static inline int btrfs_test_extent_buffer_operations(void)
+{
+ return 0;
+}
+static inline int btrfs_init_test_fs(void)
+{
+ return 0;
+}
+static inline void btrfs_destroy_test_fs(void)
+{
+}
+static inline int btrfs_test_extent_io(void)
+{
+ return 0;
+}
+static inline int btrfs_test_inodes(void)
+{
+ return 0;
+}
+static inline int btrfs_test_qgroups(void)
+{
+ return 0;
+}
+#endif
+
+#endif
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
new file mode 100644
index 00000000000..cc286ce97d1
--- /dev/null
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../extent_io.h"
+#include "../disk-io.h"
+
+static int test_btrfs_split_item(void)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct extent_buffer *eb;
+ struct btrfs_item *item;
+ char *value = "mary had a little lamb";
+ char *split1 = "mary had a little";
+ char *split2 = " lamb";
+ char *split3 = "mary";
+ char *split4 = " had a little";
+ char buf[32];
+ struct btrfs_key key;
+ u32 value_len = strlen(value);
+ int ret = 0;
+
+ test_msg("Running btrfs_split_item tests\n");
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Could not allocate root\n");
+ return PTR_ERR(root);
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Could not allocate path\n");
+ kfree(root);
+ return -ENOMEM;
+ }
+
+ path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+ if (!eb) {
+ test_msg("Could not allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ path->slots[0] = 0;
+
+ key.objectid = 0;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = 0;
+
+ setup_items_for_insert(root, path, &key, &value_len, value_len,
+ value_len + sizeof(struct btrfs_item), 1);
+ item = btrfs_item_nr(0);
+ write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
+ value_len);
+
+ key.offset = 3;
+
+ /*
+ * Passing NULL trans here should be safe because we have plenty of
+ * space in this leaf to split the item without having to split the
+ * leaf.
+ */
+ ret = btrfs_split_item(NULL, root, path, &key, 17);
+ if (ret) {
+ test_msg("Split item failed %d\n", ret);
+ goto out;
+ }
+
+ /*
+ * Read the first slot; it should have the original key and contain only
+ * 'mary had a little'
+ */
+ btrfs_item_key_to_cpu(eb, &key, 0);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 0) {
+ test_msg("Invalid key at slot 0\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(0);
+ if (btrfs_item_size(eb, item) != strlen(split1)) {
+ test_msg("Invalid len in the first split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+ strlen(split1));
+ if (memcmp(buf, split1, strlen(split1))) {
+ test_msg("Data in the buffer doesn't match what it should be "
+ "in the first split, have='%.*s' want '%s'\n",
+ (int)strlen(split1), buf, split1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 1);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 3) {
+ test_msg("Invalid key at slot 1\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(1);
+ if (btrfs_item_size(eb, item) != strlen(split2)) {
+ test_msg("Invalid len in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+ strlen(split2));
+ if (memcmp(buf, split2, strlen(split2))) {
+ test_msg("Data in the buffer doesn't match what it should be "
+ "in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ key.offset = 1;
+ /* Do it again so we test memmoving the other items in the leaf */
+ ret = btrfs_split_item(NULL, root, path, &key, 4);
+ if (ret) {
+ test_msg("Second split item failed %d\n", ret);
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 0);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 0) {
+ test_msg("Invalid key at slot 0\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(0);
+ if (btrfs_item_size(eb, item) != strlen(split3)) {
+ test_msg("Invalid len in the first split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+ strlen(split3));
+ if (memcmp(buf, split3, strlen(split3))) {
+ test_msg("Data in the buffer doesn't match what it should be "
+ "in the third split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 1);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 1) {
+ test_msg("Invalid key at slot 1\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(1);
+ if (btrfs_item_size(eb, item) != strlen(split4)) {
+ test_msg("Invalid len in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+ strlen(split4));
+ if (memcmp(buf, split4, strlen(split4))) {
+ test_msg("Data in the buffer doesn't match what it should be "
+ "in the fourth split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 2);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 3) {
+ test_msg("Invalid key at slot 2\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(2);
+ if (btrfs_item_size(eb, item) != strlen(split2)) {
+ test_msg("Invalid len in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
+ strlen(split2));
+ if (memcmp(buf, split2, strlen(split2))) {
+ test_msg("Data in the buffer doesn't match what it should be "
+ "in the last chunk\n");
+ ret = -EINVAL;
+ goto out;
+ }
+out:
+ btrfs_free_path(path);
+ kfree(root);
+ return ret;
+}
+
+int btrfs_test_extent_buffer_operations(void)
+{
+ test_msg("Running extent buffer operation tests");
+ return test_btrfs_split_item();
+}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
new file mode 100644
index 00000000000..7e99c2f98dd
--- /dev/null
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include "btrfs-tests.h"
+#include "../extent_io.h"
+
+#define PROCESS_UNLOCK (1 << 0)
+#define PROCESS_RELEASE (1 << 1)
+#define PROCESS_TEST_LOCKED (1 << 2)
+
+static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
+ unsigned long flags)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int count = 0;
+ int loops = 0;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long, nr_pages,
+ ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+ if (flags & PROCESS_TEST_LOCKED &&
+ !PageLocked(pages[i]))
+ count++;
+ if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ if (flags & PROCESS_RELEASE)
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ loops++;
+ if (loops > 100000) {
+ printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+ break;
+ }
+ }
+ return count;
+}
+
+static int test_find_delalloc(void)
+{
+ struct inode *inode;
+ struct extent_io_tree tmp;
+ struct page *page;
+ struct page *locked_page = NULL;
+ unsigned long index = 0;
+ u64 total_dirty = 256 * 1024 * 1024;
+ u64 max_bytes = 128 * 1024 * 1024;
+ u64 start, end, test_start;
+ u64 found;
+ int ret = -EINVAL;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Failed to allocate test inode\n");
+ return -ENOMEM;
+ }
+
+ extent_io_tree_init(&tmp, &inode->i_data);
+
+ /*
+ * First go through and create and mark all of our pages dirty; we pin
+ * everything to make sure our pages don't get evicted and screw up our
+ * test.
+ */
+ for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ test_msg("Failed to allocate test page\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ SetPageDirty(page);
+ if (index) {
+ unlock_page(page);
+ } else {
+ page_cache_get(page);
+ locked_page = page;
+ }
+ }
+
+ /* Test this scenario
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+ start = 0;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Should have found at least one delalloc\n");
+ goto out_bits;
+ }
+ if (start != 0 || end != 4095) {
+ test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
+ start, end);
+ goto out_bits;
+ }
+ unlock_extent(&tmp, start, end);
+ unlock_page(locked_page);
+ page_cache_release(locked_page);
+
+ /*
+ * Test this scenario
+ *
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ test_start = 64 * 1024 * 1024;
+ locked_page = find_lock_page(inode->i_mapping,
+ test_start >> PAGE_CACHE_SHIFT);
+ if (!locked_page) {
+ test_msg("Couldn't find the locked page\n");
+ goto out_bits;
+ }
+ set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+ start = test_start;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Couldn't find delalloc in our range\n");
+ goto out_bits;
+ }
+ if (start != test_start || end != max_bytes - 1) {
+ test_msg("Expected start %Lu end %Lu, got start %Lu, end "
+ "%Lu\n", test_start, max_bytes - 1, start, end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end,
+ PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+ test_msg("There were unlocked pages in the range\n");
+ goto out_bits;
+ }
+ unlock_extent(&tmp, start, end);
+ /* locked_page was unlocked above */
+ page_cache_release(locked_page);
+
+ /*
+ * Test this scenario
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ test_start = max_bytes + 4096;
+ locked_page = find_lock_page(inode->i_mapping, test_start >>
+ PAGE_CACHE_SHIFT);
+ if (!locked_page) {
+ test_msg("Could'nt find the locked page\n");
+ goto out_bits;
+ }
+ start = test_start;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (found) {
+ test_msg("Found range when we shouldn't have\n");
+ goto out_bits;
+ }
+ if (end != (u64)-1) {
+ test_msg("Did not return the proper end offset\n");
+ goto out_bits;
+ }
+
+ /*
+ * Test this scenario
+ * [------- delalloc -------|
+ * [max_bytes]|-- search--|
+ *
+ * We are re-using our test_start from above since it works out well.
+ */
+ set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+ start = test_start;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Didn't find our range\n");
+ goto out_bits;
+ }
+ if (start != test_start || end != total_dirty - 1) {
+ test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+ test_start, total_dirty - 1, start, end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end,
+ PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+ test_msg("Pages in range were not all locked\n");
+ goto out_bits;
+ }
+ unlock_extent(&tmp, start, end);
+
+ /*
+ * Now to test where we run into a page that is no longer dirty in the
+ * range we want to find.
+ */
+ page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
+ >> PAGE_CACHE_SHIFT);
+ if (!page) {
+ test_msg("Couldn't find our page\n");
+ goto out_bits;
+ }
+ ClearPageDirty(page);
+ page_cache_release(page);
+
+ /* We unlocked it in the previous test */
+ lock_page(locked_page);
+ start = test_start;
+ end = 0;
+ /*
+ * Currently if we fail to find dirty pages in the delalloc range we
+ * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If
+ * this changes at any point in the future we will need to fix this
+ * test's expected behavior.
+ */
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Didn't find our range\n");
+ goto out_bits;
+ }
+ if (start != test_start || end != test_start + PAGE_CACHE_SIZE - 1) {
+ test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+ test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+ end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
+ PROCESS_UNLOCK)) {
+ test_msg("Pages in range were not all locked\n");
+ goto out_bits;
+ }
+ ret = 0;
+out_bits:
+ clear_extent_bits(&tmp, 0, total_dirty - 1,
+ (unsigned long)-1, GFP_NOFS);
+out:
+ if (locked_page)
+ page_cache_release(locked_page);
+ process_page_range(inode, 0, total_dirty - 1,
+ PROCESS_UNLOCK | PROCESS_RELEASE);
+ iput(inode);
+ return ret;
+}
+
+int btrfs_test_extent_io(void)
+{
+ test_msg("Running find delalloc tests\n");
+ return test_find_delalloc();
+}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
new file mode 100644
index 00000000000..c8d9ddf84c6
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../free-space-cache.h"
+
+#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
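+
+/*
+ * With a 4k sectorsize each bitmap entry covers BITS_PER_BITMAP * 4096 bytes
+ * (128MB).  init_test_block_group() fakes up a 1GB block group with that
+ * sectorsize for the extent and bitmap tests below to work against.
+ */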
+static struct btrfs_block_group_cache *init_test_block_group(void)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->key.objectid = 0;
+ cache->key.offset = 1024 * 1024 * 1024;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->sectorsize = 4096;
+
+ spin_lock_init(&cache->lock);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->new_bg_list);
+
+ btrfs_init_free_space_ctl(cache);
+
+ return cache;
+}
+
+/*
+ * This test just does basic sanity checking, making sure we can add an extent
+ * entry and remove space from either end and the middle, and make sure we can
+ * remove space that covers adjacent extent entries.
+ */
+static int test_extents(struct btrfs_block_group_cache *cache)
+{
+ int ret = 0;
+
+ test_msg("Running extent only tests\n");
+
+ /* First just make sure we can remove an entire entry */
+ ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error adding initial extents %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing extent %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ test_msg("Full remove left some lingering space\n");
+ return -1;
+ }
+
+ /* Ok edge and middle cases now */
+ ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error adding half extent %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing tail end %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing front end %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+ if (ret) {
+ test_msg("Error removing middle piece %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ test_msg("Still have space at the front\n");
+ return -1;
+ }
+
+ if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) {
+ test_msg("Still have space in the middle\n");
+ return -1;
+ }
+
+ if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+ test_msg("Still have space at the end\n");
+ return -1;
+ }
+
+ /* Cleanup */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ return 0;
+}
+
+static int test_bitmaps(struct btrfs_block_group_cache *cache)
+{
+ u64 next_bitmap_offset;
+ int ret;
+
+ test_msg("Running bitmap only tests\n");
+
+ ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't create a bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing bitmap full range %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ test_msg("Left some space in bitmap\n");
+ return -1;
+ }
+
+ ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add to our bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove middle chunk %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * The first bitmap we have starts at offset 0 so the next one is just
+ * at the end of the first bitmap.
+ */
+ next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+
+ /* Test a bit straddling two bitmaps */
+ ret = test_add_free_space_entry(cache, next_bitmap_offset -
+ (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add space that straddles two bitmaps %d\n",
+ ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, next_bitmap_offset -
+ (1 * 1024 * 1024), 2 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove overlapping space %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
+ 2 * 1024 * 1024)) {
+ test_msg("Left some space when removing overlapping\n");
+ return -1;
+ }
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ return 0;
+}
+
+/* This is the high grade jackassery */
+static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
+{
+ u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+ int ret;
+
+ test_msg("Running bitmap and extent tests\n");
+
+ /*
+ * First let's do something simple, an extent at the same offset as the
+ * bitmap, but the free space completely in the extent and then
+ * completely in the bitmap.
+ */
+ ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't create bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove extent entry %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ test_msg("Left remnants after our remove\n");
+ return -1;
+ }
+
+ /* Now to add back the extent entry and remove from the bitmap */
+ ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't re-add extent entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove from bitmap %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+ test_msg("Left remnants in the bitmap\n");
+ return -1;
+ }
+
+ /*
+ * Ok so a little more evil, extent entry and bitmap at the same offset,
+ * removing an overlapping chunk.
+ */
+ ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add to a bitmap %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove overlapping space %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+ test_msg("Left over pieces after removing overlapping\n");
+ return -1;
+ }
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ /* Now with the extent entry offset into the bitmap */
+ ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add space to the bitmap %d\n", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent to the cache %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Problem removing overlapping space %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+ test_msg("Left something behind when removing space");
+ return -1;
+ }
+
+ /*
+ * This has blown up in the past, the extent entry starts before the
+ * bitmap entry, but we're trying to remove an offset that falls
+ * completely within the bitmap range and is in both the extent entry
+ * and the bitmap entry, looks like this
+ *
+ * [ extent ]
+ * [ bitmap ]
+ * [ del ]
+ */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
+ 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap %d\n", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
+ 5 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
+ 5 * 1024 * 1024);
+ if (ret) {
+ test_msg("Failed to free our space %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
+ 5 * 1024 * 1024)) {
+ test_msg("Left stuff over\n");
+ return -1;
+ }
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ /*
+ * This blew up before, we have part of the free space in a bitmap and
+ * then the entirety of the rest of the space in an extent. This used
+ * to return -EAGAIN back from btrfs_remove_extent, make sure this
+ * doesn't happen.
+ */
+ ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing bitmap and extent overlapping %d\n", ret);
+ return ret;
+ }
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ return 0;
+}
+
+int btrfs_test_free_space_cache(void)
+{
+ struct btrfs_block_group_cache *cache;
+ int ret;
+
+ test_msg("Running btrfs free space cache tests\n");
+
+ cache = init_test_block_group();
+ if (!cache) {
+ test_msg("Couldn't run the tests\n");
+ return 0;
+ }
+
+ ret = test_extents(cache);
+ if (ret)
+ goto out;
+ ret = test_bitmaps(cache);
+ if (ret)
+ goto out;
+ ret = test_bitmaps_and_extents(cache);
+ if (ret)
+ goto out;
+out:
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+ test_msg("Free space cache tests finished\n");
+ return ret;
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
new file mode 100644
index 00000000000..3ae0f5b8bb8
--- /dev/null
+++ b/fs/btrfs/tests/inode-tests.c
@@ -0,0 +1,928 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../btrfs_inode.h"
+#include "../disk-io.h"
+#include "../extent_io.h"
+#include "../volumes.h"
+
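+/*
+ * Stuff a file extent item straight into the dummy root's single leaf at the
+ * given slot, bypassing the allocator and transaction machinery.
+ */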
+static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
+ u64 ram_bytes, u64 offset, u64 disk_bytenr,
+ u64 disk_len, u32 type, u8 compression, int slot)
+{
+ struct btrfs_path path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf = root->node;
+ struct btrfs_key key;
+ u32 value_len = sizeof(struct btrfs_file_extent_item);
+
+ if (type == BTRFS_FILE_EXTENT_INLINE)
+ value_len += len;
+ memset(&path, 0, sizeof(path));
+
+ path.nodes[0] = leaf;
+ path.slots[0] = slot;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ setup_items_for_insert(root, &path, &key, &value_len, value_len,
+ value_len + sizeof(struct btrfs_item), 1);
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, 1);
+ btrfs_set_file_extent_type(leaf, fi, type);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len);
+ btrfs_set_file_extent_offset(leaf, fi, offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi, len);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+ btrfs_set_file_extent_compression(leaf, fi, compression);
+ btrfs_set_file_extent_encryption(leaf, fi, 0);
+ btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+}
+
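+/*
+ * Insert an empty inode item at slot 0 so btrfs_get_extent has an inode item
+ * to find in the dummy leaf (see test_hole_first below).
+ */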
+static void insert_inode_item_key(struct btrfs_root *root)
+{
+ struct btrfs_path path;
+ struct extent_buffer *leaf = root->node;
+ struct btrfs_key key;
+ u32 value_len = 0;
+
+ memset(&path, 0, sizeof(path));
+
+ path.nodes[0] = leaf;
+ path.slots[0] = 0;
+
+ key.objectid = BTRFS_INODE_ITEM_KEY;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ setup_items_for_insert(root, &path, &key, &value_len, value_len,
+ value_len + sizeof(struct btrfs_item), 1);
+}
+
+/*
+ * Build the most complicated map of extents the earth has ever seen. We want
+ * this so we can test all of the corner cases of btrfs_get_extent. Here is a
+ * diagram of how the extents will look. This layout may not be possible in
+ * practice, but we still want to make sure everything acts normally. (The
+ * last number in each range is not inclusive.)
+ *
+ * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288]
+ * [hole ][inline][ hole ][ regular ][regular1 split][ hole ]
+ *
+ * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056]
+ * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ]
+ *
+ * [45056 - 49152][49152-53248][53248-61440][61440-65536][65536 - 81920]
+ * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent]
+ *
+ * [81920-86016]
+ * [ regular ]
+ */
+static void setup_file_extents(struct btrfs_root *root)
+{
+ int slot = 0;
+ u64 disk_bytenr = 1 * 1024 * 1024;
+ u64 offset = 0;
+
+ /* First we want a hole */
+ insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
+ slot++;
+ offset += 5;
+
+ /*
+ * Now we want an inline extent, I don't think this is possible but hey
+ * why not? Also keep in mind if we have an inline extent it counts as
+ * the whole first page. If we were to expand it we would have to cow
+ * and we wouldn't have an inline extent anymore.
+ */
+ insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+ slot);
+ slot++;
+ offset = 4096;
+
+ /* Now another hole */
+ insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
+ slot++;
+ offset += 4;
+
+ /* Now for a regular extent */
+ insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ disk_bytenr += 4096;
+ offset += 4095;
+
+ /*
+ * Now for 3 extents that were split from a hole punch so we test
+ * offsets properly.
+ */
+ insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG,
+ 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 8192;
+ disk_bytenr += 16384;
+
+ /* Now for an unwritten prealloc extent */
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 4096;
+
+ /*
+ * We want to jack up disk_bytenr a little more so the em stuff doesn't
+ * merge our records.
+ */
+ disk_bytenr += 8192;
+
+ /*
+ * Now for a partially written prealloc extent, basically the same as
+ * the hole punch example above. Note that ram_bytes never changes when
+ * you mark extents written.
+ */
+ insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 8192;
+ disk_bytenr += 16384;
+
+ /* Now a normal compressed extent */
+ insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 8192;
+ /* No merges */
+ disk_bytenr += 8192;
+
+ /* Now a split compressed extent */
+ insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 8192;
+ disk_bytenr += 8192;
+
+ /* Now extents that have a hole but no hole extent */
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 16384;
+ disk_bytenr += 4096;
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+}
+
+static unsigned long prealloc_only = 0;
+static unsigned long compressed_only = 0;
+static unsigned long vacancy_only = 0;
+
+static noinline int test_btrfs_get_extent(void)
+{
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_map *em = NULL;
+ u64 orig_start;
+ u64 disk_bytenr;
+ u64 offset;
+ int ret = -ENOMEM;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Couldn't allocate inode\n");
+ return ret;
+ }
+
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.offset = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ goto out;
+ }
+
+ /*
+ * We do this since btrfs_get_extent wants to assign em->bdev to
+ * root->fs_info->fs_devices->latest_bdev.
+ */
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ goto out;
+ }
+
+ root->node = alloc_dummy_extent_buffer(0, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ goto out;
+ }
+
+ /*
+ * We will just free a dummy node if its ref count is 2, so we need an
+ * extra ref so our searches don't accidentally release our page.
+ */
+ extent_buffer_get(root->node);
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ ret = -EINVAL;
+
+ /* First with no extents */
+ BTRFS_I(inode)->root = root;
+ em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0);
+ if (IS_ERR(em)) {
+ em = NULL;
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+ test_msg("Vacancy flag wasn't set properly\n");
+ goto out;
+ }
+ free_extent_map(em);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+
+ /*
+ * All of the magic numbers are based on the mapping setup in
+ * setup_file_extents, so if you change anything there you need to
+ * update the comment and update the expected values below.
+ */
+ setup_file_extents(root);
+
+ em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != 0 || em->len != 5) {
+ test_msg("Unexpected extent wanted start 0 len 5, got start "
+ "%llu len %llu\n", em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_INLINE) {
+ test_msg("Expected an inline, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4091) {
+ test_msg("Unexpected extent wanted start %llu len 1, got start "
+ "%llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ /*
+ * We don't test anything else for inline since it doesn't get set
+ * unless we have a page for it to write into. Maybe we should change
+ * this?
+ */
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4) {
+ test_msg("Unexpected extent wanted start %llu len 4, got start "
+ "%llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Regular extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4095) {
+ test_msg("Unexpected extent wanted start %llu len 4095, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* The next 3 are split extents */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ orig_start, em->orig_start);
+ goto out;
+ }
+ disk_bytenr += (em->start - orig_start);
+ if (em->block_start != disk_bytenr) {
+ test_msg("Wrong block start, want %llu, have %llu\n",
+ disk_bytenr, em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Prealloc extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* The next 3 are a half written prealloc extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+ orig_start, em->orig_start);
+ goto out;
+ }
+ if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ disk_bytenr + (em->start - em->orig_start),
+ em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+ em->orig_start);
+ goto out;
+ }
+ if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ disk_bytenr + (em->start - em->orig_start),
+ em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Now for the compressed extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ em->start, em->orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_msg("Unexpected compress type, wanted %d, got %d\n",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Split compressed extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ em->start, em->orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_msg("Unexpected compress type, wanted %d, got %d\n",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != disk_bytenr) {
+ test_msg("Block start does not match, want %llu got %llu\n",
+ disk_bytenr, em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ orig_start, em->orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_msg("Unexpected compress type, wanted %d, got %d\n",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* A hole between regular extents but no hole extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ /*
+ * Currently we just return the length that we requested rather than the
+ * length of the actual hole; if this changes we'll have to update this
+ * test.
+ */
+ if (em->start != offset || em->len != 12288) {
+ test_msg("Unexpected extent wanted start %llu len 12288, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != vacancy_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ vacancy_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR(em))
+ free_extent_map(em);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
+static int test_hole_first(void)
+{
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_map *em = NULL;
+ int ret = -ENOMEM;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Couldn't allocate inode\n");
+ return ret;
+ }
+
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.offset = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ goto out;
+ }
+
+ root->node = alloc_dummy_extent_buffer(0, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ goto out;
+ }
+
+ extent_buffer_get(root->node);
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ BTRFS_I(inode)->root = root;
+ ret = -EINVAL;
+
+ /*
+ * Need a blank inode item here just so we don't confuse
+ * btrfs_get_extent.
+ */
+ insert_inode_item_key(root);
+ insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, 1);
+ em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != 0 || em->len != 4096) {
+ test_msg("Unexpected extent wanted start 0 len 4096, got start "
+ "%llu len %llu\n", em->start, em->len);
+ goto out;
+ }
+ if (em->flags != vacancy_only) {
+ test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+ em->flags);
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != 4096) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != 4096 || em->len != 4096) {
+ test_msg("Unexpected extent wanted start 4096 len 4096, got "
+ "start %llu len %llu\n", em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, wanted 0 got %lu\n",
+ em->flags);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR(em))
+ free_extent_map(em);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
+int btrfs_test_inodes(void)
+{
+ int ret;
+
+ set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+ set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
+ set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
+ test_msg("Running btrfs_get_extent tests\n");
+ ret = test_btrfs_get_extent();
+ if (ret)
+ return ret;
+ test_msg("Running hole first btrfs_get_extent test\n");
+ return test_hole_first();
+}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
new file mode 100644
index 00000000000..ec3dcb20235
--- /dev/null
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (C) 2013 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../transaction.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
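+/*
+ * Fake up just enough of a transaction handle for the tree operations these
+ * tests perform; nothing here ever reaches a real commit.
+ */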
+static void init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->transid = 1;
+ INIT_LIST_HEAD(&trans->qgroup_ref_list);
+ trans->type = __TRANS_DUMMY;
+}
+
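+/*
+ * Insert an EXTENT_ITEM for a tree block with a refcount of 1 and a single
+ * inline backref: a shared block ref if a parent is given, otherwise a tree
+ * block ref against root_objectid.
+ */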
+static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_tree_block_info *block_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ ins.objectid = bytenr;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
+ if (ret) {
+ test_msg("Couldn't insert ref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, item, 1);
+ btrfs_set_extent_generation(leaf, item, 1);
+ btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ block_info = (struct btrfs_tree_block_info *)(item + 1);
+ btrfs_set_tree_block_level(leaf, block_info, 1);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ if (parent > 0) {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_SHARED_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
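+/*
+ * Bump the refcount on an existing EXTENT_ITEM and insert the matching keyed
+ * backref (shared block ref or tree block ref).
+ */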
+static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs + 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
+ if (ret)
+ test_msg("Failed to insert backref\n");
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Didn't find our key %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs - 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Couldn't find backref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return ret;
+}
+
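+/*
+ * Simplest case: one root referencing a single tree block, so qgroup 5 should
+ * see the block as both referenced and exclusive, and both counts should drop
+ * back to zero once the extent is removed.
+ */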
+static int test_no_shared_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup basic add\n");
+ ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_item(root, 4096, 4096);
+ if (ret)
+ return -EINVAL;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't remove space from the qgroup %d\n", ret);
+ return -EINVAL;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Add a ref for two different roots to make sure the shared value comes out
+ * right, also remove one of the roots and make sure the exclusive count is
+ * adjusted properly.
+ */
+static int test_multiple_refs(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup multiple refs test\n");
+
+ /* We have 5 created already from the previous test */
+ ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = add_tree_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int btrfs_test_qgroups(void)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *tmp_root;
+ int ret = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ return PTR_ERR(root);
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Can't use bytenr 0, some things freak out
+ * *cough*backref walking code*cough*
+ */
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
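+ /*
+ * Move the dummy allocator's next bytenr past the root node we just
+ * placed at bytenr 4096.
+ */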
+ root->alloc_bytenr += 8192;
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 5;
+ root->fs_info->fs_root = tmp_root;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 256;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ /* We are using this root as our extent root */
+ root->fs_info->extent_root = root;
+
+ /*
+ * Some of the paths we test assume we have a filled out fs_info, so we
+ * just need to add the root in there so we don't panic.
+ */
+ root->fs_info->tree_root = root;
+ root->fs_info->quota_root = root;
+ root->fs_info->quota_enabled = 1;
+
+ test_msg("Running qgroup tests\n");
+ ret = test_no_shared_qgroup(root);
+ if (ret)
+ goto out;
+ ret = test_multiple_refs(root);
+out:
+ btrfs_free_dummy_root(root);
+ return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3d73c8d93bb..5f379affdf2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -22,74 +22,213 @@
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
+#include <linux/uuid.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
+#include "inode-map.h"
+#include "volumes.h"
+#include "dev-replace.h"
+#include "qgroup.h"
#define BTRFS_ROOT_TRANS_TAG 0
-static noinline void put_transaction(struct btrfs_transaction *transaction)
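+/*
+ * For each transaction state, the handle types that must not join a
+ * transaction while it is in that state.
+ */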
+static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+ [TRANS_STATE_RUNNING] = 0U,
+ [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START),
+ [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH),
+ [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN),
+ [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+ [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+};
+
+void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
- WARN_ON(transaction->use_count == 0);
- transaction->use_count--;
- if (transaction->use_count == 0) {
- list_del_init(&transaction->list);
- memset(transaction, 0, sizeof(*transaction));
+ WARN_ON(atomic_read(&transaction->use_count) == 0);
+ if (atomic_dec_and_test(&transaction->use_count)) {
+ BUG_ON(!list_empty(&transaction->list));
+ WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+ while (!list_empty(&transaction->pending_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&transaction->pending_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
-static noinline void switch_commit_root(struct btrfs_root *root)
+static noinline void switch_commit_roots(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root, *tmp;
+
+ down_write(&fs_info->commit_root_sem);
+ list_for_each_entry_safe(root, tmp, &trans->switch_commits,
+ dirty_list) {
+ list_del_init(&root->dirty_list);
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+ if (is_fstree(root->objectid))
+ btrfs_unpin_free_ino(root);
+ }
+ up_write(&fs_info->commit_root_sem);
+}
+
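+/*
+ * Helpers to track the number of open transaction handles whose type falls
+ * in TRANS_EXTWRITERS.
+ */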
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
- free_extent_buffer(root->commit_root);
- root->commit_root = btrfs_root_node(root);
+ return atomic_read(&trans->num_extwriters);
}
/*
* either allocate a new transaction or hop into the existing one
*/
-static noinline int join_transaction(struct btrfs_root *root)
+static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
{
struct btrfs_transaction *cur_trans;
- cur_trans = root->fs_info->running_transaction;
- if (!cur_trans) {
- cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
- GFP_NOFS);
- BUG_ON(!cur_trans);
- root->fs_info->generation++;
- cur_trans->num_writers = 1;
- cur_trans->num_joined = 0;
- cur_trans->transid = root->fs_info->generation;
- init_waitqueue_head(&cur_trans->writer_wait);
- init_waitqueue_head(&cur_trans->commit_wait);
- cur_trans->in_commit = 0;
- cur_trans->blocked = 0;
- cur_trans->use_count = 1;
- cur_trans->commit_done = 0;
- cur_trans->start_time = get_seconds();
-
- cur_trans->delayed_refs.root = RB_ROOT;
- cur_trans->delayed_refs.num_entries = 0;
- cur_trans->delayed_refs.num_heads_ready = 0;
- cur_trans->delayed_refs.num_heads = 0;
- cur_trans->delayed_refs.flushing = 0;
- cur_trans->delayed_refs.run_delayed_start = 0;
- spin_lock_init(&cur_trans->delayed_refs.lock);
-
- INIT_LIST_HEAD(&cur_trans->pending_snapshots);
- list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
- extent_io_tree_init(&cur_trans->dirty_pages,
- root->fs_info->btree_inode->i_mapping,
- GFP_NOFS);
- spin_lock(&root->fs_info->new_trans_lock);
- root->fs_info->running_transaction = cur_trans;
- spin_unlock(&root->fs_info->new_trans_lock);
- } else {
- cur_trans->num_writers++;
- cur_trans->num_joined++;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ spin_lock(&fs_info->trans_lock);
+loop:
+ /* The file system has been taken offline. No new transactions. */
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ spin_unlock(&fs_info->trans_lock);
+ return -EROFS;
+ }
+
+ cur_trans = fs_info->running_transaction;
+ if (cur_trans) {
+ if (cur_trans->aborted) {
+ spin_unlock(&fs_info->trans_lock);
+ return cur_trans->aborted;
+ }
+ if (btrfs_blocked_trans_types[cur_trans->state] & type) {
+ spin_unlock(&fs_info->trans_lock);
+ return -EBUSY;
+ }
+ atomic_inc(&cur_trans->use_count);
+ atomic_inc(&cur_trans->num_writers);
+ extwriter_counter_inc(cur_trans, type);
+ spin_unlock(&fs_info->trans_lock);
+ return 0;
}
+ spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * If we are ATTACH, we just want to catch the current transaction,
+ * and commit it. If there is no transaction, just return ENOENT.
+ */
+ if (type == TRANS_ATTACH)
+ return -ENOENT;
+
+ /*
+ * JOIN_NOLOCK only happens during the transaction commit, so
+ * it is impossible that ->running_transaction is NULL
+ */
+ BUG_ON(type == TRANS_JOIN_NOLOCK);
+
+ cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+ if (!cur_trans)
+ return -ENOMEM;
+
+ spin_lock(&fs_info->trans_lock);
+ if (fs_info->running_transaction) {
+ /*
+ * someone started a transaction after we unlocked. Make sure
+ * to redo the checks above
+ */
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ goto loop;
+ } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ spin_unlock(&fs_info->trans_lock);
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ return -EROFS;
+ }
+
+ atomic_set(&cur_trans->num_writers, 1);
+ extwriter_counter_init(cur_trans, type);
+ init_waitqueue_head(&cur_trans->writer_wait);
+ init_waitqueue_head(&cur_trans->commit_wait);
+ cur_trans->state = TRANS_STATE_RUNNING;
+ /*
+ * One for this trans handle, one so it will live on until we
+ * commit the transaction.
+ */
+ atomic_set(&cur_trans->use_count, 2);
+ cur_trans->start_time = get_seconds();
+
+ cur_trans->delayed_refs.href_root = RB_ROOT;
+ atomic_set(&cur_trans->delayed_refs.num_entries, 0);
+ cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.num_heads = 0;
+ cur_trans->delayed_refs.flushing = 0;
+ cur_trans->delayed_refs.run_delayed_start = 0;
+
+ /*
+ * although the tree mod log is per file system and not per transaction,
+ * the log must never go across transaction boundaries.
+ */
+ smp_mb();
+ if (!list_empty(&fs_info->tree_mod_seq_list))
+ WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when "
+ "creating a fresh transaction\n");
+ if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
+ WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when "
+ "creating a fresh transaction\n");
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+
+ spin_lock_init(&cur_trans->delayed_refs.lock);
+
+ INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+ INIT_LIST_HEAD(&cur_trans->ordered_operations);
+ INIT_LIST_HEAD(&cur_trans->pending_chunks);
+ INIT_LIST_HEAD(&cur_trans->switch_commits);
+ list_add_tail(&cur_trans->list, &fs_info->trans_list);
+ extent_io_tree_init(&cur_trans->dirty_pages,
+ fs_info->btree_inode->i_mapping);
+ fs_info->generation++;
+ cur_trans->transid = fs_info->generation;
+ fs_info->running_transaction = cur_trans;
+ cur_trans->aborted = 0;
+ spin_unlock(&fs_info->trans_lock);
return 0;
}
@@ -100,39 +239,93 @@ static noinline int join_transaction(struct btrfs_root *root)
* to make sure the old root from before we joined the transaction is deleted
* when the transaction commits
*/
-static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
- if (root->ref_cows && root->last_trans < trans->transid) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
WARN_ON(root->commit_root != root->node);
+ /*
+ * see below for IN_TRANS_SETUP usage rules
+ * we have the reloc mutex held now, so there
+ * is only one writer in this function
+ */
+ set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
+
+ /* make sure readers find IN_TRANS_SETUP before
+ * they find our root->last_trans update
+ */
+ smp_wmb();
+
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
+ if (root->last_trans == trans->transid) {
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ return 0;
+ }
radix_tree_tag_set(&root->fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
root->last_trans = trans->transid;
+
+ /* this is pretty tricky. We don't want to
+ * take the relocation lock in btrfs_record_root_in_trans
+ * unless we're really doing the first setup for this root in
+ * this transaction.
+ *
+ * Normally we'd use root->last_trans as a flag to decide
+ * if we want to take the expensive mutex.
+ *
+ * But, we have to set root->last_trans before we
+ * init the relocation root, otherwise, we trip over warnings
+ * in ctree.c. The solution used here is to flag ourselves
+ * with root IN_TRANS_SETUP. When this is 1, we're still
+ * fixing up the reloc trees and everyone must wait.
+ *
+ * When this is zero, they can trust root->last_trans and fly
+ * through btrfs_record_root_in_trans without having to take the
+ * lock. smp_wmb() makes sure that all the writes above are
+ * done before we pop in the zero below
+ */
btrfs_init_reloc_root(trans, root);
+ smp_mb__before_atomic();
+ clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
return 0;
}
+
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
- mutex_lock(&root->fs_info->trans_mutex);
- if (root->last_trans == trans->transid) {
- mutex_unlock(&root->fs_info->trans_mutex);
+ /*
+ * see record_root_in_trans for comments about IN_TRANS_SETUP usage
+ * and barriers
+ */
+ smp_rmb();
+ if (root->last_trans == trans->transid &&
+ !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
- }
+ mutex_lock(&root->fs_info->reloc_mutex);
record_root_in_trans(trans, root);
- mutex_unlock(&root->fs_info->trans_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+
return 0;
}
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+ return (trans->state >= TRANS_STATE_BLOCKED &&
+ trans->state < TRANS_STATE_UNBLOCKED &&
+ !trans->aborted);
+}
+
/* wait for commit against the current transaction to become unblocked
* when this is done, it is safe to start a new transaction, but the current
* transaction might not be fully on disk.
@@ -141,257 +334,339 @@ static void wait_current_trans(struct btrfs_root *root)
{
struct btrfs_transaction *cur_trans;
+ spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
- if (cur_trans && cur_trans->blocked) {
- DEFINE_WAIT(wait);
- cur_trans->use_count++;
- while (1) {
- prepare_to_wait(&root->fs_info->transaction_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (!cur_trans->blocked)
- break;
- mutex_unlock(&root->fs_info->trans_mutex);
- schedule();
- mutex_lock(&root->fs_info->trans_mutex);
- }
- finish_wait(&root->fs_info->transaction_wait, &wait);
- put_transaction(cur_trans);
+ if (cur_trans && is_transaction_blocked(cur_trans)) {
+ atomic_inc(&cur_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ wait_event(root->fs_info->transaction_wait,
+ cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+ cur_trans->aborted);
+ btrfs_put_transaction(cur_trans);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
}
}
-enum btrfs_trans_type {
- TRANS_START,
- TRANS_JOIN,
- TRANS_USERSPACE,
- TRANS_JOIN_NOLOCK,
-};
-
static int may_wait_transaction(struct btrfs_root *root, int type)
{
- if (!root->fs_info->log_root_recovering &&
- ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
- type == TRANS_USERSPACE))
+ if (root->fs_info->log_root_recovering)
+ return 0;
+
+ if (type == TRANS_USERSPACE)
+ return 1;
+
+ if (type == TRANS_START &&
+ !atomic_read(&root->fs_info->open_ioctl_trans))
return 1;
+
return 0;
}
-static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
- u64 num_items, int type)
+static inline bool need_reserve_reloc_root(struct btrfs_root *root)
+{
+ if (!root->fs_info->reloc_ctl ||
+ !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ root->reloc_root)
+ return false;
+
+ return true;
+}
+
+static struct btrfs_trans_handle *
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
+ enum btrfs_reserve_flush_enum flush)
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
+ u64 num_bytes = 0;
+ u64 qgroup_reserved = 0;
+ bool reloc_reserved = false;
int ret;
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ /* Send isn't supposed to start transactions. */
+ ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
+
+ if (current->journal_info) {
+ WARN_ON(type & TRANS_EXTWRITERS);
+ h = current->journal_info;
+ h->use_count++;
+ WARN_ON(h->use_count > 2);
+ h->orig_rsv = h->block_rsv;
+ h->block_rsv = NULL;
+ goto got_it;
+ }
+
+ /*
+ * Do the reservation before we join the transaction so we can do all
+ * the appropriate flushing if need be.
+ */
+ if (num_items > 0 && root != root->fs_info->chunk_root) {
+ if (root->fs_info->quota_enabled &&
+ is_fstree(root->root_key.objectid)) {
+ qgroup_reserved = num_items * root->leafsize;
+ ret = btrfs_qgroup_reserve(root, qgroup_reserved);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ /*
+ * Do the reservation for the relocation root creation
+ */
+ if (unlikely(need_reserve_reloc_root(root))) {
+ num_bytes += root->nodesize;
+ reloc_reserved = true;
+ }
+
+ ret = btrfs_block_rsv_add(root,
+ &root->fs_info->trans_block_rsv,
+ num_bytes, flush);
+ if (ret)
+ goto reserve_fail;
+ }
again:
h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
- if (!h)
- return ERR_PTR(-ENOMEM);
+ if (!h) {
+ ret = -ENOMEM;
+ goto alloc_fail;
+ }
+
+ /*
+ * If we are JOIN_NOLOCK we're already committing a transaction and
+ * waiting on this guy, so we don't need to do the sb_start_intwrite
+ * because we're already holding a ref. We need this because we could
+ * have raced in and done an fsync() on a file which can kick a commit
+ * and then we deadlock with somebody doing a freeze.
+ *
+ * If we are ATTACH, it means we just want to catch the current
+ * transaction and commit it, so we needn't do sb_start_intwrite().
+ */
+ if (type & __TRANS_FREEZABLE)
+ sb_start_intwrite(root->fs_info->sb);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_lock(&root->fs_info->trans_mutex);
if (may_wait_transaction(root, type))
wait_current_trans(root);
- ret = join_transaction(root);
- BUG_ON(ret);
+ do {
+ ret = join_transaction(root, type);
+ if (ret == -EBUSY) {
+ wait_current_trans(root);
+ if (unlikely(type == TRANS_ATTACH))
+ ret = -ENOENT;
+ }
+ } while (ret == -EBUSY);
+
+ if (ret < 0) {
+ /* We must get the transaction if we are JOIN_NOLOCK. */
+ BUG_ON(type == TRANS_JOIN_NOLOCK);
+ goto join_fail;
+ }
cur_trans = root->fs_info->running_transaction;
- cur_trans->use_count++;
- if (type != TRANS_JOIN_NOLOCK)
- mutex_unlock(&root->fs_info->trans_mutex);
h->transid = cur_trans->transid;
h->transaction = cur_trans;
h->blocks_used = 0;
- h->block_group = 0;
h->bytes_reserved = 0;
+ h->root = root;
h->delayed_ref_updates = 0;
+ h->use_count = 1;
+ h->adding_csums = 0;
h->block_rsv = NULL;
+ h->orig_rsv = NULL;
+ h->aborted = 0;
+ h->qgroup_reserved = 0;
+ h->delayed_ref_elem.seq = 0;
+ h->type = type;
+ h->allocating_chunk = false;
+ h->reloc_reserved = false;
+ h->sync = false;
+ INIT_LIST_HEAD(&h->qgroup_ref_list);
+ INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
- if (cur_trans->blocked && may_wait_transaction(root, type)) {
+ if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+ may_wait_transaction(root, type)) {
+ current->journal_info = h;
btrfs_commit_transaction(h, root);
goto again;
}
- if (num_items > 0) {
- ret = btrfs_trans_reserve_metadata(h, root, num_items);
- if (ret == -EAGAIN) {
- btrfs_commit_transaction(h, root);
- goto again;
- }
- if (ret < 0) {
- btrfs_end_transaction(h, root);
- return ERR_PTR(ret);
- }
+ if (num_bytes) {
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ h->transid, num_bytes, 1);
+ h->block_rsv = &root->fs_info->trans_block_rsv;
+ h->bytes_reserved = num_bytes;
+ h->reloc_reserved = reloc_reserved;
}
+ h->qgroup_reserved = qgroup_reserved;
- if (type != TRANS_JOIN_NOLOCK)
- mutex_lock(&root->fs_info->trans_mutex);
- record_root_in_trans(h, root);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_unlock(&root->fs_info->trans_mutex);
+got_it:
+ btrfs_record_root_in_trans(h, root);
if (!current->journal_info && type != TRANS_USERSPACE)
current->journal_info = h;
return h;
+
+join_fail:
+ if (type & __TRANS_FREEZABLE)
+ sb_end_intwrite(root->fs_info->sb);
+ kmem_cache_free(btrfs_trans_handle_cachep, h);
+alloc_fail:
+ if (num_bytes)
+ btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+ num_bytes);
+reserve_fail:
+ if (qgroup_reserved)
+ btrfs_qgroup_free(root, qgroup_reserved);
+ return ERR_PTR(ret);
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items)
{
- return start_transaction(root, num_items, TRANS_START);
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_ALL);
}
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
- int num_blocks)
+
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
+ struct btrfs_root *root, int num_items)
+{
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_LIMIT);
+}
+
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN);
+ return start_transaction(root, 0, TRANS_JOIN, 0);
}
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
- int num_blocks)
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+ return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
}
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
- int num_blocks)
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
- return start_transaction(r, 0, TRANS_USERSPACE);
+ return start_transaction(root, 0, TRANS_USERSPACE, 0);
+}
+
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current transaction, but
+ * don't want to start a new one.
+ *
+ * Note: If this function returns -ENOENT, it just means there is no
+ * running transaction. But it is possible that the inactive transaction
+ * is still in memory, not fully on disk. If you want to be sure there is
+ * no inactive transaction in the fs when -ENOENT is returned, you should
+ * invoke
+ *     btrfs_attach_transaction_barrier()
+ */
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_ATTACH, 0);
+}
+
+/*
+ * btrfs_attach_transaction_barrier() - catch the running transaction
+ *
+ * It is similar to the above function, the difference is that this one
+ * will wait for all the inactive transactions until they fully
+ * complete.
+ */
+struct btrfs_trans_handle *
+btrfs_attach_transaction_barrier(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+
+ trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+ if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
+ btrfs_wait_for_commit(root, 0);
+
+ return trans;
}
/* wait for a transaction commit to be fully complete */
-static noinline int wait_for_commit(struct btrfs_root *root,
+static noinline void wait_for_commit(struct btrfs_root *root,
struct btrfs_transaction *commit)
{
- DEFINE_WAIT(wait);
- mutex_lock(&root->fs_info->trans_mutex);
- while (!commit->commit_done) {
- prepare_to_wait(&commit->commit_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (commit->commit_done)
- break;
- mutex_unlock(&root->fs_info->trans_mutex);
- schedule();
- mutex_lock(&root->fs_info->trans_mutex);
- }
- mutex_unlock(&root->fs_info->trans_mutex);
- finish_wait(&commit->commit_wait, &wait);
- return 0;
+ wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
}
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
struct btrfs_transaction *cur_trans = NULL, *t;
- int ret;
-
- mutex_lock(&root->fs_info->trans_mutex);
+ int ret = 0;
- ret = 0;
if (transid) {
if (transid <= root->fs_info->last_trans_committed)
- goto out_unlock;
+ goto out;
+ ret = -EINVAL;
/* find specified transaction */
+ spin_lock(&root->fs_info->trans_lock);
list_for_each_entry(t, &root->fs_info->trans_list, list) {
if (t->transid == transid) {
cur_trans = t;
+ atomic_inc(&cur_trans->use_count);
+ ret = 0;
break;
}
- if (t->transid > transid)
+ if (t->transid > transid) {
+ ret = 0;
break;
+ }
}
- ret = -EINVAL;
+ spin_unlock(&root->fs_info->trans_lock);
+ /* The specified transaction doesn't exist */
if (!cur_trans)
- goto out_unlock; /* bad transid */
+ goto out;
} else {
/* find newest transaction that is committing | committed */
+ spin_lock(&root->fs_info->trans_lock);
list_for_each_entry_reverse(t, &root->fs_info->trans_list,
list) {
- if (t->in_commit) {
- if (t->commit_done)
- goto out_unlock;
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ if (t->state == TRANS_STATE_COMPLETED)
+ break;
cur_trans = t;
+ atomic_inc(&cur_trans->use_count);
break;
}
}
+ spin_unlock(&root->fs_info->trans_lock);
if (!cur_trans)
- goto out_unlock; /* nothing committing|committed */
+ goto out; /* nothing committing|committed */
}
- cur_trans->use_count++;
- mutex_unlock(&root->fs_info->trans_mutex);
-
wait_for_commit(root, cur_trans);
-
- mutex_lock(&root->fs_info->trans_mutex);
- put_transaction(cur_trans);
- ret = 0;
-out_unlock:
- mutex_unlock(&root->fs_info->trans_mutex);
+ btrfs_put_transaction(cur_trans);
+out:
return ret;
}
-#if 0
-/*
- * rate limit against the drop_snapshot code. This helps to slow down new
- * operations if the drop_snapshot code isn't able to keep up.
- */
-static void throttle_on_drops(struct btrfs_root *root)
-{
- struct btrfs_fs_info *info = root->fs_info;
- int harder_count = 0;
-
-harder:
- if (atomic_read(&info->throttles)) {
- DEFINE_WAIT(wait);
- int thr;
- thr = atomic_read(&info->throttle_gen);
-
- do {
- prepare_to_wait(&info->transaction_throttle,
- &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&info->throttles)) {
- finish_wait(&info->transaction_throttle, &wait);
- break;
- }
- schedule();
- finish_wait(&info->transaction_throttle, &wait);
- } while (thr == atomic_read(&info->throttle_gen));
- harder_count++;
-
- if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
- harder_count < 2)
- goto harder;
-
- if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
- harder_count < 10)
- goto harder;
-
- if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
- harder_count < 20)
- goto harder;
- }
-}
-#endif
-
void btrfs_throttle(struct btrfs_root *root)
{
- mutex_lock(&root->fs_info->trans_mutex);
- if (!root->fs_info->open_ioctl_trans)
+ if (!atomic_read(&root->fs_info->open_ioctl_trans))
wait_current_trans(root);
- mutex_unlock(&root->fs_info->trans_mutex);
}
static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int ret;
- ret = btrfs_block_rsv_check(trans, root,
- &root->fs_info->global_block_rsv, 0, 5);
- return ret ? 1 : 0;
+ if (root->fs_info->global_block_rsv.space_info->full &&
+ btrfs_check_space_for_delayed_refs(trans, root))
+ return 1;
+
+ return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
}
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -399,98 +674,136 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans = trans->transaction;
int updates;
+ int err;
- if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+ smp_mb();
+ if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+ cur_trans->delayed_refs.flushing)
return 1;
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
- if (updates)
- btrfs_run_delayed_refs(trans, root, updates);
+ if (updates) {
+ err = btrfs_run_delayed_refs(trans, root, updates);
+ if (err) /* Error code will also eval true */
+ return err;
+ }
return should_end_transaction(trans, root);
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int throttle, int lock)
+ struct btrfs_root *root, int throttle)
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
- int count = 0;
-
- while (count < 4) {
- unsigned long cur = trans->delayed_ref_updates;
- trans->delayed_ref_updates = 0;
- if (cur &&
- trans->transaction->delayed_refs.num_heads_ready > 64) {
- trans->delayed_ref_updates = 0;
-
- /*
- * do a full flush if the transaction is trying
- * to close
- */
- if (trans->transaction->delayed_refs.flushing)
- cur = 0;
- btrfs_run_delayed_refs(trans, root, cur);
- } else {
- break;
- }
- count++;
+ unsigned long cur = trans->delayed_ref_updates;
+ int lock = (trans->type != TRANS_JOIN_NOLOCK);
+ int err = 0;
+ int must_run_delayed_refs = 0;
+
+ if (trans->use_count > 1) {
+ trans->use_count--;
+ trans->block_rsv = trans->orig_rsv;
+ return 0;
}
btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
- if (lock && !root->fs_info->open_ioctl_trans &&
- should_end_transaction(trans, root))
- trans->transaction->blocked = 1;
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
- if (lock && cur_trans->blocked && !cur_trans->in_commit) {
+ trans->delayed_ref_updates = 0;
+ if (!trans->sync) {
+ must_run_delayed_refs =
+ btrfs_should_throttle_delayed_refs(trans, root);
+ cur = max_t(unsigned long, cur, 32);
+
+ /*
+ * don't make the caller wait if they are from a NOLOCK
+ * or ATTACH transaction, it will deadlock with commit
+ */
+ if (must_run_delayed_refs == 1 &&
+ (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
+ must_run_delayed_refs = 2;
+ }
+
+ if (trans->qgroup_reserved) {
+ /*
+ * the same root has to be passed here between start_transaction
+ * and end_transaction. Subvolume quota depends on this.
+ */
+ btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
+
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
+ if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
+ should_end_transaction(trans, root) &&
+ ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
+ spin_lock(&info->trans_lock);
+ if (cur_trans->state == TRANS_STATE_RUNNING)
+ cur_trans->state = TRANS_STATE_BLOCKED;
+ spin_unlock(&info->trans_lock);
+ }
+
+ if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
if (throttle)
return btrfs_commit_transaction(trans, root);
else
wake_up_process(info->transaction_kthread);
}
- if (lock)
- mutex_lock(&info->trans_mutex);
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(root->fs_info->sb);
+
WARN_ON(cur_trans != info->running_transaction);
- WARN_ON(cur_trans->num_writers < 1);
- cur_trans->num_writers--;
+ WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
+ atomic_dec(&cur_trans->num_writers);
+ extwriter_counter_dec(cur_trans, trans->type);
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
wake_up(&cur_trans->writer_wait);
- put_transaction(cur_trans);
- if (lock)
- mutex_unlock(&info->trans_mutex);
+ btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
current->journal_info = NULL;
- memset(trans, 0, sizeof(*trans));
- kmem_cache_free(btrfs_trans_handle_cachep, trans);
if (throttle)
btrfs_run_delayed_iputs(root);
- return 0;
+ if (trans->aborted ||
+ test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ wake_up_process(info->transaction_kthread);
+ err = -EIO;
+ }
+ assert_qgroups_uptodate(trans);
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ if (must_run_delayed_refs) {
+ btrfs_async_run_delayed_refs(root, cur,
+ must_run_delayed_refs == 1);
+ }
+ return err;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- return __btrfs_end_transaction(trans, root, 0, 1);
+ return __btrfs_end_transaction(trans, root, 0);
}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- return __btrfs_end_transaction(trans, root, 1, 1);
-}
-
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- return __btrfs_end_transaction(trans, root, 0, 0);
+ return __btrfs_end_transaction(trans, root, 1);
}
/*
@@ -501,50 +814,23 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark)
{
- int ret;
int err = 0;
int werr = 0;
- struct page *page;
- struct inode *btree_inode = root->fs_info->btree_inode;
+ struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+ struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
- unsigned long index;
-
- while (1) {
- ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
- if (ret)
- break;
- while (start <= end) {
- cond_resched();
-
- index = start >> PAGE_CACHE_SHIFT;
- start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
- page = find_get_page(btree_inode->i_mapping, index);
- if (!page)
- continue;
-
- btree_lock_page_hook(page);
- if (!page->mapping) {
- unlock_page(page);
- page_cache_release(page);
- continue;
- }
- if (PageWriteback(page)) {
- if (PageDirty(page))
- wait_on_page_writeback(page);
- else {
- unlock_page(page);
- page_cache_release(page);
- continue;
- }
- }
- err = write_one_page(page, 0);
- if (err)
- werr = err;
- page_cache_release(page);
- }
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, &cached_state)) {
+ convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+ mark, &cached_state, GFP_NOFS);
+ cached_state = NULL;
+ err = filemap_fdatawrite_range(mapping, start, end);
+ if (err)
+ werr = err;
+ cond_resched();
+ start = end + 1;
}
if (err)
werr = err;
@@ -560,39 +846,22 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark)
{
- int ret;
int err = 0;
int werr = 0;
- struct page *page;
- struct inode *btree_inode = root->fs_info->btree_inode;
+ struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+ struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
- unsigned long index;
-
- while (1) {
- ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
- if (ret)
- break;
- clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
- while (start <= end) {
- index = start >> PAGE_CACHE_SHIFT;
- start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
- page = find_get_page(btree_inode->i_mapping, index);
- if (!page)
- continue;
- if (PageDirty(page)) {
- btree_lock_page_hook(page);
- wait_on_page_writeback(page);
- err = write_one_page(page, 0);
- if (err)
- werr = err;
- }
- wait_on_page_writeback(page);
- page_cache_release(page);
- cond_resched();
- }
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_NEED_WAIT, &cached_state)) {
+ clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+ 0, 0, &cached_state, GFP_NOFS);
+ err = filemap_fdatawait_range(mapping, start, end);
+ if (err)
+ werr = err;
+ cond_resched();
+ start = end + 1;
}
if (err)
werr = err;
@@ -604,15 +873,23 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
* them in one of two extent_io trees. This is used to make sure all of
* those extents are on disk for transaction or log commit
*/
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark)
{
int ret;
int ret2;
+ struct blk_plug plug;
+ blk_start_plug(&plug);
ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+ blk_finish_plug(&plug);
ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
- return ret || ret2;
+
+ if (ret)
+ return ret;
+ if (ret2)
+ return ret2;
+ return 0;
}
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -659,21 +936,24 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
&root->root_item);
- BUG_ON(ret);
+ if (ret)
+ return ret;
old_root_used = btrfs_root_used(&root->root_item);
ret = btrfs_write_dirty_block_groups(trans, root);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
- if (root != root->fs_info->extent_root)
- switch_commit_root(root);
-
return 0;
}
/*
* update all the cowonly tree roots on disk
+ *
+ * The error handling in this function may not be obvious. Any of the
+ * failures will cause the file system to go offline. We still need
+ * to clean up the delayed refs.
*/
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -684,27 +964,53 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
eb = btrfs_lock_root_node(fs_info->tree_root);
- btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
+ ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
+ 0, &eb);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
+ if (ret)
+ return ret;
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
+
+ ret = btrfs_run_dev_stats(trans, root->fs_info);
+ if (ret)
+ return ret;
+ ret = btrfs_run_dev_replace(trans, root->fs_info);
+ if (ret)
+ return ret;
+ ret = btrfs_run_qgroups(trans, root->fs_info);
+ if (ret)
+ return ret;
+
+ /* run_qgroups might have added some more refs */
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
- update_cowonly_root(trans, root);
+ if (root != fs_info->extent_root)
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ ret = update_cowonly_root(trans, root);
+ if (ret)
+ return ret;
}
- down_write(&fs_info->extent_commit_sem);
- switch_commit_root(fs_info->extent_root);
- up_write(&fs_info->extent_commit_sem);
+ list_add_tail(&fs_info->extent_root->dirty_list,
+ &trans->transaction->switch_commits);
+ btrfs_after_dev_replace_commit(fs_info);
return 0;
}
@@ -714,12 +1020,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
* a dirty root struct and adds it into the list of dead roots that need to
* be deleted
*/
-int btrfs_add_dead_root(struct btrfs_root *root)
+void btrfs_add_dead_root(struct btrfs_root *root)
{
- mutex_lock(&root->fs_info->trans_mutex);
- list_add(&root->root_list, &root->fs_info->dead_roots);
- mutex_unlock(&root->fs_info->trans_mutex);
- return 0;
+ spin_lock(&root->fs_info->trans_lock);
+ if (list_empty(&root->root_list))
+ list_add_tail(&root->root_list, &root->fs_info->dead_roots);
+ spin_unlock(&root->fs_info->trans_lock);
}
/*
@@ -734,6 +1040,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
int ret;
int err = 0;
+ spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
(void **)gang, 0,
@@ -746,13 +1053,21 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
btrfs_orphan_commit_root(trans, root);
+ btrfs_save_ino_cache(root, trans);
+
+ /* see comments in should_cow_block() */
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
+
if (root->commit_root != root->node) {
- switch_commit_root(root);
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
btrfs_set_root_node(&root->root_item,
root->node);
}
@@ -760,25 +1075,26 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
err = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
+ spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
}
}
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return err;
}
/*
- * defrag a given btree. If cacheonly == 1, this won't read from the disk,
- * otherwise every leaf in the btree is read and defragged.
+ * defrag a given btree.
+ * Every leaf in the btree is read and defragged.
*/
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+int btrfs_defrag_root(struct btrfs_root *root)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_trans_handle *trans;
int ret;
- unsigned long nr;
- if (xchg(&root->defrag_running, 1))
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
return 0;
while (1) {
@@ -786,114 +1102,33 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_defrag_leaves(trans, root, cacheonly);
+ ret = btrfs_defrag_leaves(trans, root);
- nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(info->tree_root, nr);
+ btrfs_btree_balance_dirty(info->tree_root);
cond_resched();
- if (root->fs_info->closing || ret != -EAGAIN)
+ if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
break;
- }
- root->defrag_running = 0;
- return ret;
-}
-#if 0
-/*
- * when dropping snapshots, we generate a ton of delayed refs, and it makes
- * sense not to join the transaction while it is trying to flush the current
- * queue of delayed refs out.
- *
- * This is used by the drop snapshot code only
- */
-static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
-{
- DEFINE_WAIT(wait);
-
- mutex_lock(&info->trans_mutex);
- while (info->running_transaction &&
- info->running_transaction->delayed_refs.flushing) {
- prepare_to_wait(&info->transaction_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- mutex_unlock(&info->trans_mutex);
-
- schedule();
-
- mutex_lock(&info->trans_mutex);
- finish_wait(&info->transaction_wait, &wait);
- }
- mutex_unlock(&info->trans_mutex);
- return 0;
-}
-
-/*
- * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
- * all of them
- */
-int btrfs_drop_dead_root(struct btrfs_root *root)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *tree_root = root->fs_info->tree_root;
- unsigned long nr;
- int ret;
-
- while (1) {
- /*
- * we don't want to jump in and create a bunch of
- * delayed refs if the transaction is starting to close
- */
- wait_transaction_pre_flush(tree_root->fs_info);
- trans = btrfs_start_transaction(tree_root, 1);
-
- /*
- * we've joined a transaction, make sure it isn't
- * closing right now
- */
- if (trans->transaction->delayed_refs.flushing) {
- btrfs_end_transaction(trans, tree_root);
- continue;
- }
-
- ret = btrfs_drop_snapshot(trans, root);
- if (ret != -EAGAIN)
+ if (btrfs_defrag_cancelled(root->fs_info)) {
+ pr_debug("BTRFS: defrag_root cancelled\n");
+ ret = -EAGAIN;
break;
-
- ret = btrfs_update_root(trans, tree_root,
- &root->root_key,
- &root->root_item);
- if (ret)
- break;
-
- nr = trans->blocks_used;
- ret = btrfs_end_transaction(trans, tree_root);
- BUG_ON(ret);
-
- btrfs_btree_balance_dirty(tree_root, nr);
- cond_resched();
+ }
}
- BUG_ON(ret);
-
- ret = btrfs_del_root(trans, tree_root, &root->root_key);
- BUG_ON(ret);
-
- nr = trans->blocks_used;
- ret = btrfs_end_transaction(trans, tree_root);
- BUG_ON(ret);
-
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- kfree(root);
-
- btrfs_btree_balance_dirty(tree_root, nr);
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
return ret;
}
-#endif
/*
* new snapshots need to be created at a very specific time in the
- * transaction commit. This does the actual creation
+ * transaction commit. This does the actual creation.
+ *
+ * Note:
+ * If an error occurs that may affect the commit of the current transaction,
+ * we should return the error number. If the error only affects the creation
+ * of the pending snapshots, just return 0.
*/
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
@@ -904,50 +1139,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
+ struct btrfs_block_rsv *rsv;
struct inode *parent_inode;
- struct dentry *parent;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *dir_item;
struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
- int ret;
+ struct timespec cur_time = CURRENT_TIME;
+ int ret = 0;
u64 to_reserve = 0;
u64 index = 0;
u64 objectid;
u64 root_flags;
+ uuid_le new_uuid;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ pending->error = -ENOMEM;
+ return 0;
+ }
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
pending->error = -ENOMEM;
- goto fail;
+ goto root_item_alloc_fail;
}
- ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
- if (ret) {
- pending->error = ret;
- goto fail;
- }
+ pending->error = btrfs_find_free_objectid(tree_root, &objectid);
+ if (pending->error)
+ goto no_free_objectid;
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
- btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
if (to_reserve > 0) {
- ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
- to_reserve);
- if (ret) {
- pending->error = ret;
- goto fail;
- }
+ pending->error = btrfs_block_rsv_add(root,
+ &pending->block_rsv,
+ to_reserve,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (pending->error)
+ goto no_free_objectid;
}
key.objectid = objectid;
key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
+ trans->bytes_reserved = trans->block_rsv->reserved;
dentry = pending->dentry;
- parent = dget_parent(dentry);
- parent_inode = parent->d_inode;
+ parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root);
@@ -955,21 +1198,39 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert the directory item
*/
ret = btrfs_set_inode_index(parent_inode, &index);
- BUG_ON(ret);
- ret = btrfs_insert_dir_item(trans, parent_root,
- dentry->d_name.name, dentry->d_name.len,
- parent_inode->i_ino, &key,
- BTRFS_FT_DIR, index);
- BUG_ON(ret);
+ BUG_ON(ret); /* -ENOMEM */
+
+ /* check if there is a file/dir which has the same name. */
+ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
+ btrfs_ino(parent_inode),
+ dentry->d_name.name,
+ dentry->d_name.len, 0);
+ if (dir_item != NULL && !IS_ERR(dir_item)) {
+ pending->error = -EEXIST;
+ goto dir_item_existed;
+ } else if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+ btrfs_release_path(path);
- btrfs_i_size_write(parent_inode, parent_inode->i_size +
- dentry->d_name.len * 2);
- ret = btrfs_update_inode(trans, parent_root, parent_inode);
- BUG_ON(ret);
+ /*
+ * pull in the delayed directory update
+ * and the delayed inode item,
+ * otherwise we corrupt the FS during
+ * the snapshot
+ */
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret) { /* Transaction aborted */
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
record_root_in_trans(trans, root);
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+ btrfs_check_and_init_root_item(new_root_item);
root_flags = btrfs_root_flags(new_root_item);
if (pending->readonly)
@@ -978,13 +1239,65 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
btrfs_set_root_flags(new_root_item, root_flags);
+ btrfs_set_root_generation_v2(new_root_item,
+ trans->transid);
+ uuid_le_gen(&new_uuid);
+ memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ memcpy(new_root_item->parent_uuid, root->root_item.uuid,
+ BTRFS_UUID_SIZE);
+ if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
+ memset(new_root_item->received_uuid, 0,
+ sizeof(new_root_item->received_uuid));
+ memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
+ memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
+ btrfs_set_root_stransid(new_root_item, 0);
+ btrfs_set_root_rtransid(new_root_item, 0);
+ }
+ btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
+ btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
+ btrfs_set_root_otransid(new_root_item, trans->transid);
+
old = btrfs_lock_root_node(root);
- btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ if (ret) {
+ btrfs_tree_unlock(old);
+ free_extent_buffer(old);
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
btrfs_set_lock_blocking(old);
- btrfs_copy_root(trans, root, old, &tmp, objectid);
+ ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
+ /* clean up in any case */
btrfs_tree_unlock(old);
free_extent_buffer(old);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /*
+ * We need to flush delayed refs in order to make sure all of our quota
+ * operations have been done before we call btrfs_qgroup_inherit.
+ */
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /* see comments in should_cow_block() */
+ set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_wmb();
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */
@@ -992,28 +1305,88 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
/*
* insert root back/forward references
*/
ret = btrfs_add_root_ref(trans, tree_root, objectid,
parent_root->root_key.objectid,
- parent_inode->i_ino, index,
+ btrfs_ino(parent_inode), index,
dentry->d_name.name, dentry->d_name.len);
- BUG_ON(ret);
- dput(parent);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
key.offset = (u64)-1;
pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
- BUG_ON(IS_ERR(pending->snap));
+ if (IS_ERR(pending->snap)) {
+ ret = PTR_ERR(pending->snap);
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_reloc_post_snapshot(trans, pending);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_insert_dir_item(trans, parent_root,
+ dentry->d_name.name, dentry->d_name.len,
+ parent_inode, &key,
+ BTRFS_FT_DIR, index);
+ /* We have checked the name at the beginning, so it is impossible. */
+ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
- btrfs_reloc_post_snapshot(trans, pending);
- btrfs_orphan_post_snapshot(trans, pending);
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ dentry->d_name.len * 2);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+ ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b,
+ BTRFS_UUID_KEY_SUBVOL, objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+ if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
+ ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+ new_root_item->received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ objectid);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+ }
fail:
+ pending->error = ret;
+dir_item_existed:
+ trans->block_rsv = rsv;
+ trans->bytes_reserved = 0;
+no_free_objectid:
kfree(new_root_item);
- btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
- return 0;
+root_item_alloc_fail:
+ btrfs_free_path(path);
+ return ret;
}
/*
@@ -1022,15 +1395,17 @@ fail:
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
- struct btrfs_pending_snapshot *pending;
+ struct btrfs_pending_snapshot *pending, *next;
struct list_head *head = &trans->transaction->pending_snapshots;
- int ret;
+ int ret = 0;
- list_for_each_entry(pending, head, list) {
+ list_for_each_entry_safe(pending, next, head, list) {
+ list_del(&pending->list);
ret = create_pending_snapshot(trans, fs_info, pending);
- BUG_ON(ret);
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void update_super_roots(struct btrfs_root *root)
@@ -1038,7 +1413,7 @@ static void update_super_roots(struct btrfs_root *root)
struct btrfs_root_item *root_item;
struct btrfs_super_block *super;
- super = &root->fs_info->super_copy;
+ super = root->fs_info->super_copy;
root_item = &root->fs_info->chunk_root->root_item;
super->chunk_root = root_item->bytenr;
@@ -1049,27 +1424,35 @@ static void update_super_roots(struct btrfs_root *root)
super->root = root_item->bytenr;
super->generation = root_item->generation;
super->root_level = root_item->level;
- if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
+ if (btrfs_test_opt(root, SPACE_CACHE))
super->cache_generation = root_item->generation;
+ if (root->fs_info->update_uuid_tree_gen)
+ super->uuid_tree_generation = root_item->generation;
}
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
+ struct btrfs_transaction *trans;
int ret = 0;
- spin_lock(&info->new_trans_lock);
- if (info->running_transaction)
- ret = info->running_transaction->in_commit;
- spin_unlock(&info->new_trans_lock);
+
+ spin_lock(&info->trans_lock);
+ trans = info->running_transaction;
+ if (trans)
+ ret = (trans->state >= TRANS_STATE_COMMIT_START);
+ spin_unlock(&info->trans_lock);
return ret;
}
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
+ struct btrfs_transaction *trans;
int ret = 0;
- spin_lock(&info->new_trans_lock);
- if (info->running_transaction)
- ret = info->running_transaction->blocked;
- spin_unlock(&info->new_trans_lock);
+
+ spin_lock(&info->trans_lock);
+ trans = info->running_transaction;
+ if (trans)
+ ret = is_transaction_blocked(trans);
+ spin_unlock(&info->trans_lock);
return ret;
}
@@ -1080,24 +1463,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
static void wait_current_trans_commit_start(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
- DEFINE_WAIT(wait);
-
- if (trans->in_commit)
- return;
-
- while (1) {
- prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (trans->in_commit) {
- finish_wait(&root->fs_info->transaction_blocked_wait,
- &wait);
- break;
- }
- mutex_unlock(&root->fs_info->trans_mutex);
- schedule();
- mutex_lock(&root->fs_info->trans_mutex);
- finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
- }
+ wait_event(root->fs_info->transaction_blocked_wait,
+ trans->state >= TRANS_STATE_COMMIT_START ||
+ trans->aborted);
}
/*
@@ -1107,26 +1475,9 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
struct btrfs_transaction *trans)
{
- DEFINE_WAIT(wait);
-
- if (trans->commit_done || (trans->in_commit && !trans->blocked))
- return;
-
- while (1) {
- prepare_to_wait(&root->fs_info->transaction_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (trans->commit_done ||
- (trans->in_commit && !trans->blocked)) {
- finish_wait(&root->fs_info->transaction_wait,
- &wait);
- break;
- }
- mutex_unlock(&root->fs_info->trans_mutex);
- schedule();
- mutex_lock(&root->fs_info->trans_mutex);
- finish_wait(&root->fs_info->transaction_wait,
- &wait);
- }
+ wait_event(root->fs_info->transaction_wait,
+ trans->state >= TRANS_STATE_UNBLOCKED ||
+ trans->aborted);
}
/*
@@ -1136,13 +1487,24 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
struct btrfs_async_commit {
struct btrfs_trans_handle *newtrans;
struct btrfs_root *root;
- struct delayed_work work;
+ struct work_struct work;
};
static void do_async_commit(struct work_struct *work)
{
struct btrfs_async_commit *ac =
- container_of(work, struct btrfs_async_commit, work.work);
+ container_of(work, struct btrfs_async_commit, work);
+
+ /*
+ * We've got freeze protection passed with the transaction.
+ * Tell lockdep about it.
+ */
+ if (ac->newtrans->type & __TRANS_FREEZABLE)
+ rwsem_acquire_read(
+ &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
+
+ current->journal_info = ac->newtrans;
btrfs_commit_transaction(ac->newtrans, ac->root);
kfree(ac);
@@ -1156,11 +1518,12 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
struct btrfs_transaction *cur_trans;
ac = kmalloc(sizeof(*ac), GFP_NOFS);
- BUG_ON(!ac);
+ if (!ac)
+ return -ENOMEM;
- INIT_DELAYED_WORK(&ac->work, do_async_commit);
+ INIT_WORK(&ac->work, do_async_commit);
ac->root = root;
- ac->newtrans = btrfs_join_transaction(root, 0);
+ ac->newtrans = btrfs_join_transaction(root);
if (IS_ERR(ac->newtrans)) {
int err = PTR_ERR(ac->newtrans);
kfree(ac);
@@ -1168,145 +1531,291 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
}
/* take transaction reference */
- mutex_lock(&root->fs_info->trans_mutex);
cur_trans = trans->transaction;
- cur_trans->use_count++;
- mutex_unlock(&root->fs_info->trans_mutex);
+ atomic_inc(&cur_trans->use_count);
btrfs_end_transaction(trans, root);
- schedule_delayed_work(&ac->work, 0);
+
+ /*
+ * Tell lockdep we've released the freeze rwsem, since the
+ * async commit thread will be the one to unlock it.
+ */
+ if (ac->newtrans->type & __TRANS_FREEZABLE)
+ rwsem_release(
+ &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 1, _THIS_IP_);
+
+ schedule_work(&ac->work);
/* wait for transaction to start and unblock */
- mutex_lock(&root->fs_info->trans_mutex);
if (wait_for_unblock)
wait_current_trans_commit_start_and_unblock(root, cur_trans);
else
wait_current_trans_commit_start(root, cur_trans);
- put_transaction(cur_trans);
- mutex_unlock(&root->fs_info->trans_mutex);
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ btrfs_put_transaction(cur_trans);
return 0;
}
-/*
- * btrfs_transaction state sequence:
- * in_commit = 0, blocked = 0 (initial)
- * in_commit = 1, blocked = 1
- * blocked = 0
- * commit_done = 1
- */
+
+static void cleanup_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int err)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ DEFINE_WAIT(wait);
+
+ WARN_ON(trans->use_count > 1);
+
+ btrfs_abort_transaction(trans, root, err);
+
+ spin_lock(&root->fs_info->trans_lock);
+
+ /*
+ * If the transaction is removed from the list, it means this
+ * transaction has been committed successfully, so it is impossible
+ * to call the cleanup function.
+ */
+ BUG_ON(list_empty(&cur_trans->list));
+
+ list_del_init(&cur_trans->list);
+ if (cur_trans == root->fs_info->running_transaction) {
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&root->fs_info->trans_lock);
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+
+ spin_lock(&root->fs_info->trans_lock);
+ }
+ spin_unlock(&root->fs_info->trans_lock);
+
+ btrfs_cleanup_one_transaction(trans->transaction, root);
+
+ spin_lock(&root->fs_info->trans_lock);
+ if (cur_trans == root->fs_info->running_transaction)
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->trans_lock);
+
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(root->fs_info->sb);
+ btrfs_put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
+
+ trace_btrfs_transaction_commit(root);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+ btrfs_scrub_cancel(root->fs_info);
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+}
+
+static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret)
+ return ret;
+
+ /*
+ * rename doesn't use btrfs_join_transaction, so, once we
+ * set the transaction to blocked above, we aren't going
+ * to get any new ordered operations. We can safely run
+ * it here and know for sure that nothing new will be added
+ * to the list
+ */
+ ret = btrfs_run_ordered_operations(trans, root, 1);
+
+ return ret;
+}
+
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ return btrfs_start_delalloc_roots(fs_info, 1, -1);
+ return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ btrfs_wait_ordered_roots(fs_info, -1);
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- unsigned long joined = 0;
- struct btrfs_transaction *cur_trans;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
- DEFINE_WAIT(wait);
int ret;
- int should_grow = 0;
- unsigned long now = get_seconds();
- int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
- btrfs_run_ordered_operations(root, 0);
+ ret = btrfs_run_ordered_operations(trans, root, 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
+ /* Stop the commit early if ->aborted is set */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
ret = btrfs_run_delayed_refs(trans, root, 0);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+ if (trans->qgroup_reserved) {
+ btrfs_qgroup_free(root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
cur_trans = trans->transaction;
+
/*
* set the flushing flag so procs in this transaction have to
* start sending their work down.
*/
cur_trans->delayed_refs.flushing = 1;
+ smp_wmb();
- ret = btrfs_run_delayed_refs(trans, root, 0);
- BUG_ON(ret);
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
- mutex_lock(&root->fs_info->trans_mutex);
- if (cur_trans->in_commit) {
- cur_trans->use_count++;
- mutex_unlock(&root->fs_info->trans_mutex);
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ if (ret) {
btrfs_end_transaction(trans, root);
+ return ret;
+ }
- ret = wait_for_commit(root, cur_trans);
- BUG_ON(ret);
+ spin_lock(&root->fs_info->trans_lock);
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ spin_unlock(&root->fs_info->trans_lock);
+ atomic_inc(&cur_trans->use_count);
+ ret = btrfs_end_transaction(trans, root);
- mutex_lock(&root->fs_info->trans_mutex);
- put_transaction(cur_trans);
- mutex_unlock(&root->fs_info->trans_mutex);
+ wait_for_commit(root, cur_trans);
- return 0;
+ btrfs_put_transaction(cur_trans);
+
+ return ret;
}
- trans->transaction->in_commit = 1;
- trans->transaction->blocked = 1;
+ cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&root->fs_info->transaction_blocked_wait);
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
- if (!prev_trans->commit_done) {
- prev_trans->use_count++;
- mutex_unlock(&root->fs_info->trans_mutex);
+ if (prev_trans->state != TRANS_STATE_COMPLETED) {
+ atomic_inc(&prev_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
wait_for_commit(root, prev_trans);
- mutex_lock(&root->fs_info->trans_mutex);
- put_transaction(prev_trans);
+ btrfs_put_transaction(prev_trans);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
}
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
}
- if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
- should_grow = 1;
+ extwriter_counter_dec(cur_trans, trans->type);
- do {
- int snap_pending = 0;
- joined = cur_trans->num_joined;
- if (!list_empty(&trans->transaction->pending_snapshots))
- snap_pending = 1;
-
- WARN_ON(cur_trans != trans->transaction);
- mutex_unlock(&root->fs_info->trans_mutex);
-
- if (flush_on_commit || snap_pending) {
- btrfs_start_delalloc_inodes(root, 1);
- ret = btrfs_wait_ordered_extents(root, 0, 1);
- BUG_ON(ret);
- }
+ ret = btrfs_start_delalloc_flush(root->fs_info);
+ if (ret)
+ goto cleanup_transaction;
- /*
- * rename don't use btrfs_join_transaction, so, once we
- * set the transaction to blocked above, we aren't going
- * to get any new ordered operations. We can safely run
- * it here and no for sure that nothing new will be added
- * to the list
- */
- btrfs_run_ordered_operations(root, 1);
+ ret = btrfs_flush_all_pending_stuffs(trans, root);
+ if (ret)
+ goto cleanup_transaction;
+
+ wait_event(cur_trans->writer_wait,
+ extwriter_counter_read(cur_trans) == 0);
- prepare_to_wait(&cur_trans->writer_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ /* some pending stuffs might be added after the previous flush. */
+ ret = btrfs_flush_all_pending_stuffs(trans, root);
+ if (ret)
+ goto cleanup_transaction;
- smp_mb();
- if (cur_trans->num_writers > 1)
- schedule_timeout(MAX_SCHEDULE_TIMEOUT);
- else if (should_grow)
- schedule_timeout(1);
+ btrfs_wait_delalloc_flush(root->fs_info);
- mutex_lock(&root->fs_info->trans_mutex);
- finish_wait(&cur_trans->writer_wait, &wait);
- } while (cur_trans->num_writers > 1 ||
- (should_grow && cur_trans->num_joined != joined));
+ btrfs_scrub_pause(root);
+ /*
+ * Ok now we need to make sure to block out any other joins while we
+ * commit the transaction. We could have started a join before setting
+ * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
+ */
+ spin_lock(&root->fs_info->trans_lock);
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&root->fs_info->trans_lock);
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+
+ /* ->aborted might be set after the previous check, so check it */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ goto scrub_continue;
+ }
+ /*
+ * the reloc mutex makes sure that we stop
+ * the balancing code from coming in and moving
+ * extents around in the middle of the commit
+ */
+ mutex_lock(&root->fs_info->reloc_mutex);
+ /*
+ * We needn't worry about the delayed items because we will
+ * deal with them in create_pending_snapshot(), which is the
+ * core function of the snapshot creation.
+ */
ret = create_pending_snapshots(trans, root->fs_info);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * We insert the dir indexes of the snapshots and update the inode
+ * of the snapshots' parents after the snapshot creation, so there
+ * are some delayed items which are not dealt with. Now deal with
+ * them.
+ *
+ * We needn't worry that this operation will corrupt the snapshots,
+ * because all the trees which are snapshotted will be forced to COW
+ * the nodes and leaves.
+ */
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * make sure none of the code above managed to slip in a
+ * delayed item
+ */
+ btrfs_assert_delayed_root_empty(root);
WARN_ON(cur_trans != trans->transaction);
@@ -1326,7 +1835,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->tree_log_mutex);
ret = commit_fs_roots(trans, root);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * Since the transaction is done, we should set the inode map cache flag
+ * before any other coming transaction.
+ */
+ if (btrfs_test_opt(root, CHANGE_INODE_CACHE))
+ btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
+ else
+ btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
@@ -1334,41 +1856,68 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_free_log_root_tree(trans, root->fs_info);
ret = commit_cowonly_roots(trans, root);
- BUG_ON(ret);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * The tasks which save the space cache and inode cache may also
+ * update ->aborted, check it.
+ */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
btrfs_prepare_extent_commit(trans, root);
cur_trans = root->fs_info->running_transaction;
- spin_lock(&root->fs_info->new_trans_lock);
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->new_trans_lock);
btrfs_set_root_node(&root->fs_info->tree_root->root_item,
root->fs_info->tree_root->node);
- switch_commit_root(root->fs_info->tree_root);
+ list_add_tail(&root->fs_info->tree_root->dirty_list,
+ &cur_trans->switch_commits);
btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
root->fs_info->chunk_root->node);
- switch_commit_root(root->fs_info->chunk_root);
+ list_add_tail(&root->fs_info->chunk_root->dirty_list,
+ &cur_trans->switch_commits);
- update_super_roots(root);
+ switch_commit_roots(cur_trans, root->fs_info);
- if (!root->fs_info->log_root_recovering) {
- btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
- btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
- }
+ assert_qgroups_uptodate(trans);
+ update_super_roots(root);
- memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
- sizeof(root->fs_info->super_copy));
+ btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+ btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
+ memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
+ sizeof(*root->fs_info->super_copy));
- trans->transaction->blocked = 0;
+ spin_lock(&root->fs_info->trans_lock);
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->trans_lock);
+ mutex_unlock(&root->fs_info->reloc_mutex);
wake_up(&root->fs_info->transaction_wait);
- mutex_unlock(&root->fs_info->trans_mutex);
ret = btrfs_write_and_wait_transaction(trans, root);
- BUG_ON(ret);
- write_ctree_super(trans, root, 0);
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Error while writing out transaction");
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto scrub_continue;
+ }
+
+ ret = write_ctree_super(trans, root, 0);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto scrub_continue;
+ }
/*
* the super is written, we can safely allow the tree-loggers
@@ -1378,18 +1927,27 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
- mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->last_trans_committed = cur_trans->transid;
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_COMPLETED;
+ wake_up(&cur_trans->commit_wait);
- cur_trans->commit_done = 1;
+ spin_lock(&root->fs_info->trans_lock);
+ list_del_init(&cur_trans->list);
+ spin_unlock(&root->fs_info->trans_lock);
- root->fs_info->last_trans_committed = cur_trans->transid;
+ btrfs_put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
- wake_up(&cur_trans->commit_wait);
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(root->fs_info->sb);
- put_transaction(cur_trans);
- put_transaction(cur_trans);
+ trace_btrfs_transaction_commit(root);
- mutex_unlock(&root->fs_info->trans_mutex);
+ btrfs_scrub_continue(root);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -1400,29 +1958,61 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_run_delayed_iputs(root);
return ret;
+
+scrub_continue:
+ btrfs_scrub_continue(root);
+cleanup_transaction:
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+ if (trans->qgroup_reserved) {
+ btrfs_qgroup_free(root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
+ btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+ cleanup_transaction(trans, root, ret);
+
+ return ret;
}
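Since the commit path above now propagates errors instead of calling BUG_ON(), callers are expected to check its return value. The sketch below is illustrative only: do_some_metadata_work() is a made-up placeholder, while the btrfs_* calls follow the prototypes visible in this patch.

/* Illustrative caller: errors from the commit path now come back to the
 * caller instead of crashing inside the commit via BUG_ON(). */
static int example_update_and_commit(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	int ret;

	trans = btrfs_start_transaction(root, 1);	/* reserve one item */
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = do_some_metadata_work(trans, root);	/* made-up helper */
	if (ret) {
		btrfs_end_transaction(trans, root);
		return ret;
	}

	/* on failure the handle and transaction are cleaned up internally */
	return btrfs_commit_transaction(trans, root);
}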
/*
- * interface function to delete all the snapshots we have scheduled for deletion
+ * return < 0 if error
+ * 0 if there are no more dead_roots at the time of call
+ * 1 if there are more to be processed, call me again
+ *
+ * A return value of 1 means there are certainly more snapshots to delete,
+ * but if a new one arrives during processing we may still return 0. We don't
+ * mind, because btrfs_commit_super will poke the cleaner thread and it will
+ * process it a few seconds later.
*/
-int btrfs_clean_old_snapshots(struct btrfs_root *root)
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
{
- LIST_HEAD(list);
+ int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
- mutex_lock(&fs_info->trans_mutex);
- list_splice_init(&fs_info->dead_roots, &list);
- mutex_unlock(&fs_info->trans_mutex);
+ spin_lock(&fs_info->trans_lock);
+ if (list_empty(&fs_info->dead_roots)) {
+ spin_unlock(&fs_info->trans_lock);
+ return 0;
+ }
+ root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ list_del_init(&root->root_list);
+ spin_unlock(&fs_info->trans_lock);
- while (!list_empty(&list)) {
- root = list_entry(list.next, struct btrfs_root, root_list);
- list_del(&root->root_list);
+ pr_debug("BTRFS: cleaner removing %llu\n", root->objectid);
- if (btrfs_header_backref_rev(root->node) <
- BTRFS_MIXED_BACKREF_REV)
- btrfs_drop_snapshot(root, NULL, 0);
- else
- btrfs_drop_snapshot(root, NULL, 1);
- }
- return 0;
+ btrfs_kill_all_delayed_nodes(root);
+
+ if (btrfs_header_backref_rev(root->node) <
+ BTRFS_MIXED_BACKREF_REV)
+ ret = btrfs_drop_snapshot(root, NULL, 0, 0);
+ else
+ ret = btrfs_drop_snapshot(root, NULL, 1, 0);
+ /*
+ * If we encounter a transaction abort during snapshot cleaning, we
+ * don't want to crash here
+ */
+ return (ret < 0) ? 0 : 1;
}
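The 0/1 return convention documented above implies a simple driving loop on the caller's side. The sketch below is a simplified illustration of such a loop; the real cleaner thread also deals with mutexes, freezing and shutdown, none of which is shown.

/* Simplified consumer of btrfs_clean_one_deleted_snapshot(): keep calling
 * while it reports more work (1), stop on 0 or error. */
static void example_clean_dead_roots(struct btrfs_root *tree_root)
{
	int ret;

	for (;;) {
		ret = btrfs_clean_one_deleted_snapshot(tree_root);
		if (ret <= 0)
			break;		/* nothing left, or an error */
		cond_resched();		/* more to do, loop again politely */
	}
}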
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 229a594cacd..7dd558ed071 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -20,94 +20,138 @@
#define __BTRFS_TRANSACTION__
#include "btrfs_inode.h"
#include "delayed-ref.h"
+#include "ctree.h"
+
+enum btrfs_trans_state {
+ TRANS_STATE_RUNNING = 0,
+ TRANS_STATE_BLOCKED = 1,
+ TRANS_STATE_COMMIT_START = 2,
+ TRANS_STATE_COMMIT_DOING = 3,
+ TRANS_STATE_UNBLOCKED = 4,
+ TRANS_STATE_COMPLETED = 5,
+ TRANS_STATE_MAX = 6,
+};
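A note on the ordering: the states above are numbered so that commit progress can be tested with plain comparisons, as the checks elsewhere in this patch do (for example state >= TRANS_STATE_COMMIT_START). A hedged illustration follows, with hypothetical helper names.

/* Illustration only: the numeric ordering of the states makes "has the
 * commit reached at least stage X?" a single comparison. */
static inline int example_commit_started(struct btrfs_transaction *t)
{
	return t->state >= TRANS_STATE_COMMIT_START;
}

static inline int example_trans_blocked(struct btrfs_transaction *t)
{
	/* blocked for new joiners but not yet unblocked for them again */
	return t->state >= TRANS_STATE_BLOCKED &&
	       t->state < TRANS_STATE_UNBLOCKED;
}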
struct btrfs_transaction {
u64 transid;
/*
+ * total external writers (USERSPACE/START/ATTACH) in this
+ * transaction; it must be zero before the transaction can
+ * be committed
+ */
+ atomic_t num_extwriters;
+ /*
* total writers in this transaction, it must be zero before the
* transaction can end
*/
- unsigned long num_writers;
+ atomic_t num_writers;
+ atomic_t use_count;
- unsigned long num_joined;
- int in_commit;
- int use_count;
- int commit_done;
- int blocked;
+ /* Protected by fs_info->trans_lock when we want to change it. */
+ enum btrfs_trans_state state;
struct list_head list;
struct extent_io_tree dirty_pages;
unsigned long start_time;
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
+ struct list_head ordered_operations;
+ struct list_head pending_chunks;
+ struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs;
+ int aborted;
};
+#define __TRANS_FREEZABLE (1U << 0)
+
+#define __TRANS_USERSPACE (1U << 8)
+#define __TRANS_START (1U << 9)
+#define __TRANS_ATTACH (1U << 10)
+#define __TRANS_JOIN (1U << 11)
+#define __TRANS_JOIN_NOLOCK (1U << 12)
+#define __TRANS_DUMMY (1U << 13)
+
+#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
+#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH (__TRANS_ATTACH)
+#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
+
+#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
+ __TRANS_ATTACH)
+
+#define BTRFS_SEND_TRANS_STUB 1
+
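The handle's type field (declared below) carries these bits, so attribute checks on a handle reduce to bit tests. The following sketch mirrors the checks used in this patch; the helper names themselves are hypothetical.

/* Sketch: trans->type is a bitmask built from the TRANS_* values above. */
static inline int example_is_extwriter(unsigned int type)
{
	return (type & TRANS_EXTWRITERS) != 0;	/* USERSPACE, START or ATTACH */
}

static inline int example_is_freezable(unsigned int type)
{
	return (type & __TRANS_FREEZABLE) != 0;	/* pairs with sb_*_intwrite */
}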
struct btrfs_trans_handle {
u64 transid;
- u64 block_group;
u64 bytes_reserved;
+ u64 qgroup_reserved;
+ unsigned long use_count;
unsigned long blocks_reserved;
unsigned long blocks_used;
unsigned long delayed_ref_updates;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *orig_rsv;
+ short aborted;
+ short adding_csums;
+ bool allocating_chunk;
+ bool reloc_reserved;
+ bool sync;
+ unsigned int type;
+ /*
+ * this root is only needed to validate that the root passed to
+ * start_transaction is the same as the one passed to end_transaction.
+ * Subvolume quota depends on this
+ */
+ struct btrfs_root *root;
+ struct seq_list delayed_ref_elem;
+ struct list_head qgroup_ref_list;
+ struct list_head new_bgs;
};
struct btrfs_pending_snapshot {
struct dentry *dentry;
+ struct inode *dir;
struct btrfs_root *root;
struct btrfs_root *snap;
+ struct btrfs_qgroup_inherit *inherit;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
+ u64 qgroup_reserved;
 /* extra metadata reservation for relocation */
int error;
bool readonly;
struct list_head list;
};
-static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
- struct inode *inode)
-{
- trans->block_group = BTRFS_I(inode)->block_group;
-}
-
-static inline void btrfs_update_inode_block_group(
- struct btrfs_trans_handle *trans,
- struct inode *inode)
-{
- BTRFS_I(inode)->block_group = trans->block_group;
-}
-
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
{
BTRFS_I(inode)->last_trans = trans->transaction->transid;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items);
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
- int num_blocks);
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
- int num_blocks);
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
- int num_blocks);
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
+ struct btrfs_root *root, int num_items);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
+ struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-int btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_drop_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
-int btrfs_clean_old_snapshots(struct btrfs_root *root);
+void btrfs_add_dead_root(struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
@@ -120,12 +164,11 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
void btrfs_throttle(struct btrfs_root *root);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages, int mark);
int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void btrfs_put_transaction(struct btrfs_transaction *transaction);
#endif
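The join/attach constructors above now take only the root. A minimal hypothetical user of that API is sketched below, following the usual btrfs ERR_PTR convention for these constructors.

/* Hypothetical user of the reworked prototypes: join the running
 * transaction if one exists and end the handle when done. */
static int example_join_and_end(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/* ... modify metadata under this handle ... */

	return btrfs_end_transaction(trans, root);
}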
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 992ab425599..a63719cc957 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,27 +23,24 @@
#include "transaction.h"
#include "locking.h"
-/* defrag all the leaves in a given btree. If cache_only == 1, don't read
- * things from disk, otherwise read all the leaves and try to get key order to
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
* better reflect disk order
*/
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int cache_only)
+ struct btrfs_root *root)
{
struct btrfs_path *path = NULL;
struct btrfs_key key;
int ret = 0;
int wret;
int level;
- int is_extent = 0;
int next_key_ret = 0;
u64 last_ret = 0;
u64 min_trans = 0;
- if (cache_only)
- goto out;
-
if (root->fs_info->extent_root == root) {
/*
* there's recursion here right now in the tree locking,
@@ -52,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
- if (root->ref_cows == 0 && !is_extent)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
goto out;
if (btrfs_test_opt(root, SSD))
@@ -86,18 +83,15 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
}
path->keep_locks = 1;
- if (cache_only)
- min_trans = root->defrag_trans_start;
- ret = btrfs_search_forward(root, &key, NULL, path,
- cache_only, min_trans);
+ ret = btrfs_search_forward(root, &key, path, min_trans);
if (ret < 0)
goto out;
if (ret > 0) {
ret = 0;
goto out;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (wret < 0) {
@@ -109,11 +103,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
path->slots[1] = btrfs_header_nritems(path->nodes[1]);
- next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
min_trans);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
- cache_only, &last_ret,
+ &last_ret,
&root->defrag_progress);
if (ret) {
WARN_ON(ret == -EAGAIN);
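btrfs_defrag_leaves() above works incrementally through root->defrag_progress and reports -EAGAIN when it wants to be called again. Below is a simplified sketch of a driving loop in the spirit of btrfs_defrag_root(); transaction sizing and throttling are assumptions here.

/* Simplified driver in the spirit of btrfs_defrag_root(): restart the
 * defrag in small transactions until it stops asking to be called again. */
static int example_defrag_root(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	int ret;

	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans, root);
		if (ret != -EAGAIN)
			break;
	}
	/* treat any leftover positive status as success */
	return ret < 0 ? ret : 0;
}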
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a4bbb854dfd..9e1f2cd5e67 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,13 +18,14 @@
#include <linux/sched.h>
#include <linux/slab.h>
-#include "ctree.h"
-#include "transaction.h"
+#include <linux/blkdev.h>
+#include <linux/list_sort.h>
+#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
-#include "compat.h"
-#include "tree-log.h"
+#include "backref.h"
+#include "hash.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -89,7 +90,8 @@
*/
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
-#define LOG_WALK_REPLAY_ALL 2
+#define LOG_WALK_REPLAY_DIR_INDEX 2
+#define LOG_WALK_REPLAY_ALL 3
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
@@ -132,43 +134,61 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
* syncing the tree wait for us to finish
*/
static int start_log_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
{
+ int index;
int ret;
- int err = 0;
mutex_lock(&root->log_mutex);
if (root->log_root) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ ret = -EAGAIN;
+ goto out;
+ }
if (!root->log_start_pid) {
root->log_start_pid = current->pid;
- root->log_multiple_pids = false;
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
} else if (root->log_start_pid != current->pid) {
- root->log_multiple_pids = true;
+ set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
- root->log_batch++;
+ atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
mutex_unlock(&root->log_mutex);
return 0;
}
- root->log_multiple_pids = false;
- root->log_start_pid = current->pid;
+
+ ret = 0;
mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree) {
+ if (!root->fs_info->log_root_tree)
ret = btrfs_init_log_root_tree(trans, root->fs_info);
- if (ret)
- err = ret;
- }
- if (err == 0 && !root->log_root) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ if (ret)
+ goto out;
+
+ if (!root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
- err = ret;
+ goto out;
}
- mutex_unlock(&root->fs_info->tree_log_mutex);
- root->log_batch++;
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
+ atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
+out:
mutex_unlock(&root->log_mutex);
- return err;
+ return ret;
}
/*
@@ -212,14 +232,13 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
* indicate we're done making changes to the log tree
* and wake up anyone waiting to do a sync
*/
-int btrfs_end_log_trans(struct btrfs_root *root)
+void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
smp_mb();
if (waitqueue_active(&root->log_writer_wait))
wake_up(&root->log_writer_wait);
}
- return 0;
}
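The wake_up() above has a waiting counterpart in the log-sync path, which sleeps until every writer has dropped its reference. A simplified sketch of that side is shown here; the real code builds the wait from prepare_to_wait()/finish_wait().

/* Simplified waiting side paired with the wake_up() in
 * btrfs_end_log_trans(): sleep until all log writers are gone. */
static void example_wait_for_log_writers(struct btrfs_root *root)
{
	wait_event(root->log_writer_wait,
		   atomic_read(&root->log_writers) == 0);
}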
@@ -275,17 +294,31 @@ static int process_one_buffer(struct btrfs_root *log,
struct extent_buffer *eb,
struct walk_control *wc, u64 gen)
{
+ int ret = 0;
+
+ /*
+ * If this fs is mixed then we need to be able to process the leaves to
+ * pin down any logged extents, so we have to read the block.
+ */
+ if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+ ret = btrfs_read_buffer(eb, gen);
+ if (ret)
+ return ret;
+ }
+
if (wc->pin)
- btrfs_pin_extent(log->fs_info->extent_root,
- eb->start, eb->len, 0);
+ ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
+ eb->start, eb->len);
- if (btrfs_buffer_uptodate(eb, gen)) {
+ if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
+ if (wc->pin && btrfs_header_level(eb) == 0)
+ ret = btrfs_exclude_logged_extents(log, eb);
if (wc->write)
btrfs_write_tree_block(eb);
if (wc->wait)
btrfs_wait_tree_block_writeback(eb);
}
- return 0;
+ return ret;
}
/*
@@ -315,6 +348,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
unsigned long src_ptr;
unsigned long dst_ptr;
int overwrite_root = 0;
+ bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
overwrite_root = 1;
@@ -324,6 +358,9 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
/* look for the key in the destination tree */
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
if (ret == 0) {
char *src_copy;
char *dst_copy;
@@ -333,13 +370,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
goto insert;
if (item_size == 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return 0;
}
dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
if (!dst_copy || !src_copy) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
kfree(dst_copy);
kfree(src_copy);
return -ENOMEM;
@@ -361,13 +398,57 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
* sync
*/
if (ret == 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return 0;
}
+ /*
+ * We need to load the old nbytes into the inode so when we
+ * replay the extents we've logged we get the right nbytes.
+ */
+ if (inode_item) {
+ struct btrfs_inode_item *item;
+ u64 nbytes;
+ u32 mode;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ nbytes = btrfs_inode_nbytes(path->nodes[0], item);
+ item = btrfs_item_ptr(eb, slot,
+ struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(eb, item, nbytes);
+
+ /*
+ * If this is a directory we need to reset the i_size to
+ * 0 so that we can set it up properly when replaying
+ * the rest of the items in this log.
+ */
+ mode = btrfs_inode_mode(eb, item);
+ if (S_ISDIR(mode))
+ btrfs_set_inode_size(eb, item, 0);
+ }
+ } else if (inode_item) {
+ struct btrfs_inode_item *item;
+ u32 mode;
+
+ /*
+ * New inode, set nbytes to 0 so that the nbytes comes out
+ * properly when we replay the extents.
+ */
+ item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(eb, item, 0);
+
+ /*
+ * If this is a directory we need to reset the i_size to 0 so
+ * that we can set it up properly when replaying the rest of
+ * the items in this log.
+ */
+ mode = btrfs_inode_mode(eb, item);
+ if (S_ISDIR(mode))
+ btrfs_set_inode_size(eb, item, 0);
}
insert:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
/* try to insert the key into the destination tree */
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
@@ -377,13 +458,11 @@ insert:
u32 found_size;
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
- if (found_size > item_size) {
- btrfs_truncate_item(trans, root, path, item_size, 1);
- } else if (found_size < item_size) {
- ret = btrfs_extend_item(trans, root, path,
- item_size - found_size);
- BUG_ON(ret);
- }
+ if (found_size > item_size)
+ btrfs_truncate_item(root, path, item_size, 1);
+ else if (found_size < item_size)
+ btrfs_extend_item(root, path,
+ item_size - found_size);
} else if (ret) {
return ret;
}
@@ -438,7 +517,7 @@ insert:
}
no_copy:
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return 0;
}
@@ -484,11 +563,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *key)
{
int found_type;
- u64 mask = root->sectorsize - 1;
u64 extent_end;
- u64 alloc_hint;
u64 start = key->offset;
- u64 saved_nbytes;
+ u64 nbytes = 0;
struct btrfs_file_extent_item *item;
struct inode *inode = NULL;
unsigned long size;
@@ -498,11 +575,20 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
found_type = btrfs_file_extent_type(eb, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC)
- extent_end = start + btrfs_file_extent_num_bytes(eb, item);
- else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_inline_len(eb, item);
- extent_end = (start + size + mask) & ~mask;
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ nbytes = btrfs_file_extent_num_bytes(eb, item);
+ extent_end = start + nbytes;
+
+ /*
+ * We don't add to the inode's nbytes if we are prealloc or a
+ * hole.
+ */
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+ nbytes = 0;
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ size = btrfs_file_extent_inline_len(eb, slot, item);
+ nbytes = btrfs_file_extent_ram_bytes(eb, item);
+ extent_end = ALIGN(start + size, root->sectorsize);
} else {
ret = 0;
goto out;
@@ -519,7 +605,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
start, 0);
if (ret == 0 &&
@@ -544,17 +630,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* we don't have to do anything
*/
if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto out;
}
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- saved_nbytes = inode_get_bytes(inode);
/* drop any overlapping extents */
- ret = btrfs_drop_extents(trans, inode, start, extent_end,
- &alloc_hint, 1);
- BUG_ON(ret);
+ ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
+ if (ret)
+ goto out;
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -564,7 +649,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, key,
sizeof(*item));
- BUG_ON(ret);
+ if (ret)
+ goto out;
dest_offset = btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
copy_extent_buffer(path->nodes[0], eb, dest_offset,
@@ -589,7 +675,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root,
ins.objectid, ins.offset,
0, root->root_key.objectid,
- key->objectid, offset);
+ key->objectid, offset, 0);
+ if (ret)
+ goto out;
} else {
/*
* insert the extent pointer in the extent
@@ -598,9 +686,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_logged_file_extent(trans,
root, root->root_key.objectid,
key->objectid, offset, &ins);
- BUG_ON(ret);
+ if (ret)
+ goto out;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (btrfs_file_extent_compression(eb, item)) {
csum_start = ins.objectid;
@@ -614,31 +703,35 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(root->log_root,
csum_start, csum_end - 1,
- &ordered_sums);
- BUG_ON(ret);
+ &ordered_sums, 0);
+ if (ret)
+ goto out;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
- ret = btrfs_csum_file_blocks(trans,
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans,
root->fs_info->csum_root,
sums);
- BUG_ON(ret);
list_del(&sums->list);
kfree(sums);
}
+ if (ret)
+ goto out;
} else {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
}
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
/* inline extents are easy, we just overwrite them */
ret = overwrite_item(trans, root, path, eb, slot, key);
- BUG_ON(ret);
+ if (ret)
+ goto out;
}
- inode_set_bytes(inode, saved_nbytes);
- btrfs_update_inode(trans, root, inode);
+ inode_add_bytes(inode, nbytes);
+ ret = btrfs_update_inode(trans, root, inode);
out:
if (inode)
iput(inode);
@@ -675,18 +768,25 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
return -ENOMEM;
read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
inode = read_one_inode(root, location.objectid);
- BUG_ON(!inode);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
ret = link_to_fixup_dir(trans, root, path, location.objectid);
- BUG_ON(ret);
+ if (ret)
+ goto out;
ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
- BUG_ON(ret);
+ if (ret)
+ goto out;
+ else
+ ret = btrfs_run_delayed_items(trans, root);
+out:
kfree(name);
-
iput(inode);
return ret;
}
@@ -713,7 +813,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
goto out;
} else
goto out;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
if (di && !IS_ERR(di)) {
@@ -724,7 +824,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
goto out;
match = 1;
out:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return match;
}
@@ -740,6 +840,7 @@ out:
*/
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
+ u64 ref_objectid,
char *name, int namelen)
{
struct btrfs_path *path;
@@ -760,8 +861,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
if (ret != 0)
goto out;
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, namelen, NULL))
+ match = 1;
+
+ goto out;
+ }
+
+ item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
ref = (struct btrfs_inode_ref *)ptr;
@@ -782,84 +892,42 @@ out:
return match;
}
-
-/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function. (it should be released on return).
- */
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_root *log,
struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+ struct btrfs_root *log_root,
+ struct inode *dir, struct inode *inode,
+ struct extent_buffer *eb,
+ u64 inode_objectid, u64 parent_objectid,
+ u64 ref_index, char *name, int namelen,
+ int *search_done)
{
- struct inode *dir;
int ret;
- struct btrfs_inode_ref *ref;
+ char *victim_name;
+ int victim_name_len;
+ struct extent_buffer *leaf;
struct btrfs_dir_item *di;
- struct inode *inode;
- char *name;
- int namelen;
- unsigned long ref_ptr;
- unsigned long ref_end;
-
- /*
- * it is possible that we didn't log all the parent directories
- * for a given inode. If we don't find the dir, just don't
- * copy the back ref in. The link count fixup code will take
- * care of the rest
- */
- dir = read_one_inode(root, key->offset);
- if (!dir)
- return -ENOENT;
-
- inode = read_one_inode(root, key->objectid);
- BUG_ON(!inode);
-
- ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+ struct btrfs_key search_key;
+ struct btrfs_inode_extref *extref;
again:
- ref = (struct btrfs_inode_ref *)ref_ptr;
-
- namelen = btrfs_inode_ref_name_len(eb, ref);
- name = kmalloc(namelen, GFP_NOFS);
- BUG_ON(!name);
-
- read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
-
- /* if we already have a perfect match, we're done */
- if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
- btrfs_inode_ref_index(eb, ref),
- name, namelen)) {
- goto out;
- }
-
- /*
- * look for a conflicting back reference in the metadata.
- * if we find one we have to unlink that name of the file
- * before we add our new link. Later on, we overwrite any
- * existing back reference, and we don't want to create
- * dangling pointers in the directory.
- */
-conflict_again:
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ /* Search old style refs */
+ search_key.objectid = inode_objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = parent_objectid;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret == 0) {
- char *victim_name;
- int victim_name_len;
struct btrfs_inode_ref *victim_ref;
unsigned long ptr;
unsigned long ptr_end;
- struct extent_buffer *leaf = path->nodes[0];
+
+ leaf = path->nodes[0];
/* are we trying to overwrite a back ref for the root directory
* if so, just jump out, we're done
*/
- if (key->objectid == key->offset)
- goto out_nowrite;
+ if (search_key.objectid == search_key.offset)
+ return 1;
/* check all the names in this back reference to see
* if they are in the log. if so, we allow them to stay
@@ -872,116 +940,385 @@ conflict_again:
victim_name_len = btrfs_inode_ref_name_len(leaf,
victim_ref);
victim_name = kmalloc(victim_name_len, GFP_NOFS);
- BUG_ON(!victim_name);
+ if (!victim_name)
+ return -ENOMEM;
read_extent_buffer(leaf, victim_name,
(unsigned long)(victim_ref + 1),
victim_name_len);
- if (!backref_in_log(log, key, victim_name,
+ if (!backref_in_log(log_root, &search_key,
+ parent_objectid,
+ victim_name,
victim_name_len)) {
- btrfs_inc_nlink(inode);
- btrfs_release_path(root, path);
+ inc_nlink(inode);
+ btrfs_release_path(path);
ret = btrfs_unlink_inode(trans, root, dir,
inode, victim_name,
victim_name_len);
kfree(victim_name);
- btrfs_release_path(root, path);
- goto conflict_again;
+ if (ret)
+ return ret;
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret)
+ return ret;
+ *search_done = 1;
+ goto again;
}
kfree(victim_name);
+
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
}
- BUG_ON(ret);
+
+ /*
+ * NOTE: we have searched the root tree and checked the
+ * corresponding ref, so there is no need to check it again.
+ */
+ *search_done = 1;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
+
+ /* Same search but for extended refs */
+ extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+ inode_objectid, parent_objectid, 0,
+ 0);
+ if (!IS_ERR_OR_NULL(extref)) {
+ u32 item_size;
+ u32 cur_offset = 0;
+ unsigned long base;
+ struct inode *victim_parent;
+
+ leaf = path->nodes[0];
+
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *)base + cur_offset;
+
+ victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+ goto next;
+
+ victim_name = kmalloc(victim_name_len, GFP_NOFS);
+ if (!victim_name)
+ return -ENOMEM;
+ read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
+ victim_name_len);
+
+ search_key.objectid = inode_objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(parent_objectid,
+ victim_name,
+ victim_name_len);
+ ret = 0;
+ if (!backref_in_log(log_root, &search_key,
+ parent_objectid, victim_name,
+ victim_name_len)) {
+ ret = -ENOENT;
+ victim_parent = read_one_inode(root,
+ parent_objectid);
+ if (victim_parent) {
+ inc_nlink(inode);
+ btrfs_release_path(path);
+
+ ret = btrfs_unlink_inode(trans, root,
+ victim_parent,
+ inode,
+ victim_name,
+ victim_name_len);
+ if (!ret)
+ ret = btrfs_run_delayed_items(
+ trans, root);
+ }
+ iput(victim_parent);
+ kfree(victim_name);
+ if (ret)
+ return ret;
+ *search_done = 1;
+ goto again;
+ }
+ kfree(victim_name);
+ if (ret)
+ return ret;
+next:
+ cur_offset += victim_name_len + sizeof(*extref);
+ }
+ *search_done = 1;
+ }
+ btrfs_release_path(path);
/* look for a conflicting sequence number */
- di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
- btrfs_inode_ref_index(eb, ref),
- name, namelen, 0);
+ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+ ref_index, name, namelen, 0);
if (di && !IS_ERR(di)) {
ret = drop_one_dir_item(trans, root, path, dir, di);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
-
- /* look for a conflicting name */
- di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ /* look for a conflicting name */
+ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
if (di && !IS_ERR(di)) {
ret = drop_one_dir_item(trans, root, path, dir, di);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- /* insert our name */
- ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
- btrfs_inode_ref_index(eb, ref));
- BUG_ON(ret);
+ return 0;
+}
- btrfs_update_inode(trans, root, inode);
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index,
+ u64 *parent_objectid)
+{
+ struct btrfs_inode_extref *extref;
-out:
- ref_ptr = (unsigned long)(ref + 1) + namelen;
- kfree(name);
- if (ref_ptr < ref_end)
- goto again;
+ extref = (struct btrfs_inode_extref *)ref_ptr;
+
+ *namelen = btrfs_inode_extref_name_len(eb, extref);
+ *name = kmalloc(*namelen, GFP_NOFS);
+ if (*name == NULL)
+ return -ENOMEM;
+
+ read_extent_buffer(eb, *name, (unsigned long)&extref->name,
+ *namelen);
+
+ *index = btrfs_inode_extref_index(eb, extref);
+ if (parent_objectid)
+ *parent_objectid = btrfs_inode_extref_parent(eb, extref);
+
+ return 0;
+}
+
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index)
+{
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ref_ptr;
+
+ *namelen = btrfs_inode_ref_name_len(eb, ref);
+ *name = kmalloc(*namelen, GFP_NOFS);
+ if (*name == NULL)
+ return -ENOMEM;
+
+ read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+
+ *index = btrfs_inode_ref_index(eb, ref);
+
+ return 0;
+}
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function. (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ struct inode *dir = NULL;
+ struct inode *inode = NULL;
+ unsigned long ref_ptr;
+ unsigned long ref_end;
+ char *name = NULL;
+ int namelen;
+ int ret;
+ int search_done = 0;
+ int log_ref_ver = 0;
+ u64 parent_objectid;
+ u64 inode_objectid;
+ u64 ref_index = 0;
+ int ref_struct_size;
+
+ ref_ptr = btrfs_item_ptr_offset(eb, slot);
+ ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *r;
+
+ ref_struct_size = sizeof(struct btrfs_inode_extref);
+ log_ref_ver = 1;
+ r = (struct btrfs_inode_extref *)ref_ptr;
+ parent_objectid = btrfs_inode_extref_parent(eb, r);
+ } else {
+ ref_struct_size = sizeof(struct btrfs_inode_ref);
+ parent_objectid = key->offset;
+ }
+ inode_objectid = key->objectid;
+
+ /*
+ * it is possible that we didn't log all the parent directories
+ * for a given inode. If we don't find the dir, just don't
+ * copy the back ref in. The link count fixup code will take
+ * care of the rest
+ */
+ dir = read_one_inode(root, parent_objectid);
+ if (!dir) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ inode = read_one_inode(root, inode_objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ while (ref_ptr < ref_end) {
+ if (log_ref_ver) {
+ ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index, &parent_objectid);
+ /*
+ * parent object can change from one array
+ * item to another.
+ */
+ if (!dir)
+ dir = read_one_inode(root, parent_objectid);
+ if (!dir) {
+ ret = -ENOENT;
+ goto out;
+ }
+ } else {
+ ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index);
+ }
+ if (ret)
+ goto out;
+
+ /* if we already have a perfect match, we're done */
+ if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+ ref_index, name, namelen)) {
+ /*
+ * look for a conflicting back reference in the
+ * metadata. if we find one we have to unlink that name
+ * of the file before we add our new link. Later on, we
+ * overwrite any existing back reference, and we don't
+ * want to create dangling pointers in the directory.
+ */
+
+ if (!search_done) {
+ ret = __add_inode_ref(trans, root, path, log,
+ dir, inode, eb,
+ inode_objectid,
+ parent_objectid,
+ ref_index, name, namelen,
+ &search_done);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto out;
+ }
+ }
+
+ /* insert our name */
+ ret = btrfs_add_link(trans, dir, inode, name, namelen,
+ 0, ref_index);
+ if (ret)
+ goto out;
+
+ btrfs_update_inode(trans, root, inode);
+ }
+
+ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
+ kfree(name);
+ name = NULL;
+ if (log_ref_ver) {
+ iput(dir);
+ dir = NULL;
+ }
+ }
/* finally write the back reference in the inode */
ret = overwrite_item(trans, root, path, eb, slot, key);
- BUG_ON(ret);
-
-out_nowrite:
- btrfs_release_path(root, path);
+out:
+ btrfs_release_path(path);
+ kfree(name);
iput(dir);
iput(inode);
- return 0;
+ return ret;
}
static int insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
{
int ret;
- ret = btrfs_find_orphan_item(root, offset);
+ ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
+ offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
if (ret > 0)
ret = btrfs_insert_orphan_item(trans, root, offset);
return ret;
}
+static int count_inode_extrefs(struct btrfs_root *root,
+ struct inode *inode, struct btrfs_path *path)
+{
+ int ret = 0;
+ int name_len;
+ unsigned int nlink = 0;
+ u32 item_size;
+ u32 cur_offset = 0;
+ u64 inode_objectid = btrfs_ino(inode);
+ u64 offset = 0;
+ unsigned long ptr;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
-/*
- * There are a few corners where the link count of the file can't
- * be properly maintained during replay. So, instead of adding
- * lots of complexity to the log code, we just scan the backrefs
- * for any file that has been through replay.
- *
- * The scan will update the link count on the inode to reflect the
- * number of back refs found. If it goes down to zero, the iput
- * will free the inode.
- */
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+ while (1) {
+ ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
+ &extref, &offset);
+ if (ret)
+ break;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+ name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ nlink++;
+
+ cur_offset += name_len + sizeof(*extref);
+ }
+
+ offset++;
+ btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
+
+ if (ret < 0)
+ return ret;
+ return nlink;
+}
+
+static int count_inode_refs(struct btrfs_root *root,
+ struct inode *inode, struct btrfs_path *path)
{
- struct btrfs_path *path;
int ret;
struct btrfs_key key;
- u64 nlink = 0;
+ unsigned int nlink = 0;
unsigned long ptr;
unsigned long ptr_end;
int name_len;
+ u64 ino = btrfs_ino(inode);
- key.objectid = inode->i_ino;
+ key.objectid = ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -991,9 +1328,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
break;
path->slots[0]--;
}
+process_slot:
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
- if (key.objectid != inode->i_ino ||
+ if (key.objectid != ino ||
key.type != BTRFS_INODE_REF_KEY)
break;
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
@@ -1011,12 +1349,60 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (key.offset == 0)
break;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ goto process_slot;
+ }
key.offset--;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
+
+ return nlink;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay. So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found. If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ struct btrfs_path *path;
+ int ret;
+ u64 nlink = 0;
+ u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = count_inode_refs(root, inode, path);
+ if (ret < 0)
+ goto out;
+
+ nlink = ret;
+
+ ret = count_inode_extrefs(root, inode, path);
+ if (ret == -ENOENT)
+ ret = 0;
+
+ if (ret < 0)
+ goto out;
+
+ nlink += ret;
+
+ ret = 0;
+
if (nlink != inode->i_nlink) {
- inode->i_nlink = nlink;
+ set_nlink(inode, nlink);
btrfs_update_inode(trans, root, inode);
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1024,15 +1410,16 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (inode->i_nlink == 0) {
if (S_ISDIR(inode->i_mode)) {
ret = replay_dir_deletes(trans, root, NULL, path,
- inode->i_ino, 1);
- BUG_ON(ret);
+ ino, 1);
+ if (ret)
+ goto out;
}
- ret = insert_orphan_item(trans, root, inode->i_ino);
- BUG_ON(ret);
+ ret = insert_orphan_item(trans, root, ino);
}
- btrfs_free_path(path);
- return 0;
+out:
+ btrfs_free_path(path);
+ return ret;
}
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
@@ -1063,16 +1450,18 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
ret = btrfs_del_item(trans, root, path);
- BUG_ON(ret);
+ if (ret)
+ goto out;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- BUG_ON(!inode);
+ if (!inode)
+ return -EIO;
ret = fixup_inode_link_count(trans, root, inode);
- BUG_ON(ret);
-
iput(inode);
+ if (ret)
+ goto out;
/*
* fixup on a directory may create new entries,
@@ -1081,8 +1470,10 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
*/
key.offset = (u64)-1;
}
- btrfs_release_path(root, path);
- return 0;
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
}
@@ -1101,7 +1492,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct inode *inode;
inode = read_one_inode(root, objectid);
- BUG_ON(!inode);
+ if (!inode)
+ return -EIO;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
@@ -1109,14 +1501,17 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (ret == 0) {
- btrfs_inc_nlink(inode);
- btrfs_update_inode(trans, root, inode);
+ if (!inode->i_nlink)
+ set_nlink(inode, 1);
+ else
+ inc_nlink(inode);
+ ret = btrfs_update_inode(trans, root, inode);
} else if (ret == -EEXIST) {
ret = 0;
} else {
- BUG();
+ BUG(); /* Logic Error */
}
iput(inode);
@@ -1148,6 +1543,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
iput(inode);
return -EIO;
}
+
ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
/* FIXME, put inode into FIXUP list */
@@ -1185,15 +1581,19 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct inode *dir;
u8 log_type;
int exists;
- int ret;
+ int ret = 0;
+ bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
dir = read_one_inode(root, key->objectid);
- BUG_ON(!dir);
+ if (!dir)
+ return -EIO;
name_len = btrfs_dir_name_len(eb, di);
name = kmalloc(name_len, GFP_NOFS);
- if (!name)
- return -ENOMEM;
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
log_type = btrfs_dir_type(eb, di);
read_extent_buffer(eb, name, (unsigned long)(di + 1),
@@ -1205,7 +1605,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
exists = 1;
else
exists = 0;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (key->type == BTRFS_DIR_ITEM_KEY) {
dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1216,9 +1616,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
key->offset, name,
name_len, 1);
} else {
- BUG();
+ /* Corruption */
+ ret = -EINVAL;
+ goto out;
}
- if (!dst_di || IS_ERR(dst_di)) {
+ if (IS_ERR_OR_NULL(dst_di)) {
/* we need a sequence number to insert, so we only
* do inserts for the BTRFS_DIR_INDEX_KEY types
*/
@@ -1244,22 +1646,29 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
goto out;
ret = drop_one_dir_item(trans, root, path, dir, dst_di);
- BUG_ON(ret);
+ if (ret)
+ goto out;
if (key->type == BTRFS_DIR_INDEX_KEY)
goto insert;
out:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
+ if (!ret && update_size) {
+ btrfs_i_size_write(dir, dir->i_size + name_len * 2);
+ ret = btrfs_update_inode(trans, root, dir);
+ }
kfree(name);
iput(dir);
- return 0;
+ return ret;
insert:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = insert_one_name(trans, root, path, key->objectid, key->offset,
name, name_len, log_type, &log_key);
-
- BUG_ON(ret && ret != -ENOENT);
+ if (ret && ret != -ENOENT)
+ goto out;
+ update_size = false;
+ ret = 0;
goto out;
}
@@ -1286,9 +1695,12 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
+ if (verify_dir_item(root, eb, di))
+ return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
- BUG_ON(ret);
+ if (ret)
+ return ret;
ptr = (unsigned long)(di + 1);
ptr += name_len;
}
@@ -1374,7 +1786,7 @@ next:
*end_ret = found_end;
ret = 0;
out:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
@@ -1412,6 +1824,11 @@ again:
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
+ if (verify_dir_item(root, eb, di)) {
+ ret = -EIO;
+ goto out;
+ }
+
name_len = btrfs_dir_name_len(eb, di);
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
@@ -1432,22 +1849,33 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (!log_di || IS_ERR(log_di)) {
+ if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(root, path);
- btrfs_release_path(log, log_path);
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
inode = read_one_inode(root, location.objectid);
- BUG_ON(!inode);
+ if (!inode) {
+ kfree(name);
+ return -EIO;
+ }
ret = link_to_fixup_dir(trans, root,
path, location.objectid);
- BUG_ON(ret);
- btrfs_inc_nlink(inode);
+ if (ret) {
+ kfree(name);
+ iput(inode);
+ goto out;
+ }
+
+ inc_nlink(inode);
ret = btrfs_unlink_inode(trans, root, dir, inode,
name, name_len);
- BUG_ON(ret);
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans, root);
kfree(name);
iput(inode);
+ if (ret)
+ goto out;
/* there might still be more names under this key
* check and repeat if required
@@ -1458,8 +1886,11 @@ again:
goto again;
ret = 0;
goto out;
+ } else if (IS_ERR(log_di)) {
+ kfree(name);
+ return PTR_ERR(log_di);
}
- btrfs_release_path(log, log_path);
+ btrfs_release_path(log_path);
kfree(name);
ptr = (unsigned long)(di + 1);
@@ -1467,8 +1898,8 @@ again:
}
ret = 0;
out:
- btrfs_release_path(root, path);
- btrfs_release_path(log, log_path);
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
return ret;
}
@@ -1551,12 +1982,13 @@ again:
ret = check_item_in_log(trans, root, log, path,
log_path, dir,
&found_key);
- BUG_ON(ret);
+ if (ret)
+ goto out;
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
@@ -1567,11 +1999,11 @@ next_type:
if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
key_type = BTRFS_DIR_LOG_INDEX_KEY;
dir_key.type = BTRFS_DIR_INDEX_KEY;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
out:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
btrfs_free_path(log_path);
iput(dir);
return ret;
@@ -1599,7 +2031,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
int i;
int ret;
- btrfs_read_buffer(eb, gen);
+ ret = btrfs_read_buffer(eb, gen);
+ if (ret)
+ return ret;
level = btrfs_header_level(eb);
@@ -1607,7 +2041,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
return 0;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
nritems = btrfs_header_nritems(eb);
for (i = 0; i < nritems; i++) {
@@ -1625,11 +2060,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
root, log, path, key.objectid, 0);
- BUG_ON(ret);
+ if (ret)
+ break;
}
ret = overwrite_item(wc->trans, root, path,
eb, i, &key);
- BUG_ON(ret);
+ if (ret)
+ break;
/* for regular files, make sure corresponding
* orphan item exists. extents past the new EOF
@@ -1638,13 +2075,24 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (S_ISREG(mode)) {
ret = insert_orphan_item(wc->trans, root,
key.objectid);
- BUG_ON(ret);
+ if (ret)
+ break;
}
ret = link_to_fixup_dir(wc->trans, root,
path, key.objectid);
- BUG_ON(ret);
+ if (ret)
+ break;
+ }
+
+ if (key.type == BTRFS_DIR_INDEX_KEY &&
+ wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
+ ret = replay_one_dir_item(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
}
+
if (wc->stage < LOG_WALK_REPLAY_ALL)
continue;
@@ -1652,24 +2100,29 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (key.type == BTRFS_XATTR_ITEM_KEY) {
ret = overwrite_item(wc->trans, root, path,
eb, i, &key);
- BUG_ON(ret);
- } else if (key.type == BTRFS_INODE_REF_KEY) {
+ if (ret)
+ break;
+ } else if (key.type == BTRFS_INODE_REF_KEY ||
+ key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
- BUG_ON(ret && ret != -ENOENT);
+ if (ret && ret != -ENOENT)
+ break;
+ ret = 0;
} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
- BUG_ON(ret);
- } else if (key.type == BTRFS_DIR_ITEM_KEY ||
- key.type == BTRFS_DIR_INDEX_KEY) {
+ if (ret)
+ break;
+ } else if (key.type == BTRFS_DIR_ITEM_KEY) {
ret = replay_one_dir_item(wc->trans, root, path,
eb, i, &key);
- BUG_ON(ret);
+ if (ret)
+ break;
}
}
btrfs_free_path(path);
- return 0;
+ return ret;
}
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
@@ -1694,8 +2147,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level >= BTRFS_MAX_LEVEL);
cur = path->nodes[*level];
- if (btrfs_header_level(cur) != *level)
- WARN_ON(1);
+ WARN_ON(btrfs_header_level(cur) != *level);
if (path->slots[*level] >=
btrfs_header_nritems(cur))
@@ -1713,28 +2165,45 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return -ENOMEM;
if (*level == 1) {
- wc->process_func(root, next, wc, ptr_gen);
+ ret = wc->process_func(root, next, wc, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
path->slots[*level]++;
if (wc->free) {
- btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
- btrfs_tree_lock(next);
- clean_tree_block(trans, root, next);
- btrfs_set_lock_blocking(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, root, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ }
WARN_ON(root_owner !=
BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(root,
+ ret = btrfs_free_and_pin_reserved_extent(root,
bytenr, blocksize);
- BUG_ON(ret);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
}
free_extent_buffer(next);
continue;
}
- btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
WARN_ON(*level <= 0);
if (path->nodes[*level-1])
@@ -1778,24 +2247,30 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
parent = path->nodes[*level + 1];
root_owner = btrfs_header_owner(parent);
- wc->process_func(root, path->nodes[*level], wc,
+ ret = wc->process_func(root, path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]));
+ if (ret)
+ return ret;
+
if (wc->free) {
struct extent_buffer *next;
next = path->nodes[*level];
- btrfs_tree_lock(next);
- clean_tree_block(trans, root, next);
- btrfs_set_lock_blocking(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, root, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ }
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(root,
+ ret = btrfs_free_and_pin_reserved_extent(root,
path->nodes[*level]->start,
path->nodes[*level]->len);
- BUG_ON(ret);
+ if (ret)
+ return ret;
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
@@ -1817,11 +2292,11 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
int wret;
int level;
struct btrfs_path *path;
- int i;
int orig_level;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
level = btrfs_header_level(log->node);
orig_level = level;
@@ -1833,45 +2308,49 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
wret = walk_down_log_tree(trans, log, path, &level, wc);
if (wret > 0)
break;
- if (wret < 0)
+ if (wret < 0) {
ret = wret;
+ goto out;
+ }
wret = walk_up_log_tree(trans, log, path, &level, wc);
if (wret > 0)
break;
- if (wret < 0)
+ if (wret < 0) {
ret = wret;
+ goto out;
+ }
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
- wc->process_func(log, path->nodes[orig_level], wc,
+ ret = wc->process_func(log, path->nodes[orig_level], wc,
btrfs_header_generation(path->nodes[orig_level]));
+ if (ret)
+ goto out;
if (wc->free) {
struct extent_buffer *next;
next = path->nodes[orig_level];
- btrfs_tree_lock(next);
- clean_tree_block(trans, log, next);
- btrfs_set_lock_blocking(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, log, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ }
WARN_ON(log->root_key.objectid !=
BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(log, next->start,
+ ret = btrfs_free_and_pin_reserved_extent(log, next->start,
next->len);
- BUG_ON(ret);
+ if (ret)
+ goto out;
}
}
- for (i = 0; i <= orig_level; i++) {
- if (path->nodes[i]) {
- free_extent_buffer(path->nodes[i]);
- path->nodes[i] = NULL;
- }
- }
+out:
btrfs_free_path(path);
return ret;
}
@@ -1896,8 +2375,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static int wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long transid)
+static void wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -1912,33 +2391,61 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ if (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]))
schedule();
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
- } while (root->log_transid < transid + 2 &&
+ } while (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]));
- return 0;
}
-static int wait_for_writer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
DEFINE_WAIT(wait);
+
while (atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers))
+ if (atomic_read(&root->log_writers))
schedule();
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
}
- return 0;
+}
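
Both wait_log_commit() and wait_for_writer() above use the standard open-coded kernel wait loop: register on the waitqueue, drop the log mutex, re-check the condition, sleep, then reacquire the mutex and test again. A minimal sketch of that shape in isolation (the waitqueue, mutex and flag here are placeholders, not btrfs symbols):

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/atomic.h>

/* Sketch only: wait, with @lock held on entry and exit, until @flag drops
 * to zero; @wq must be woken by whoever clears @flag. */
static void wait_on_flag(wait_queue_head_t *wq, struct mutex *lock,
			 atomic_t *flag)
{
	DEFINE_WAIT(wait);

	while (atomic_read(flag)) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(lock);
		if (atomic_read(flag))	/* re-check with the lock dropped */
			schedule();
		finish_wait(wq, &wait);
		mutex_lock(lock);
	}
}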
+
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ if (!ctx)
+ return;
+
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx->list);
+ mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Called with the log mutex held, or when it is certain that no other
+ * task can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+ int index, int error)
+{
+ struct btrfs_log_ctx *ctx;
+
+ if (!error) {
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+ return;
+ }
+
+ list_for_each_entry(ctx, &root->log_ctxs[index], list)
+ ctx->log_ret = error;
+
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
}
/*
@@ -1954,7 +2461,7 @@ static int wait_for_writer(struct btrfs_trans_handle *trans,
* that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
int index1;
int index2;
@@ -1962,41 +2469,52 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
- unsigned long log_transid = 0;
+ int log_transid = 0;
+ struct btrfs_log_ctx root_log_ctx;
+ struct blk_plug plug;
mutex_lock(&root->log_mutex);
- index1 = root->log_transid % 2;
+ log_transid = ctx->log_transid;
+ if (root->log_transid_committed >= log_transid) {
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+
+ index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(trans, root, root->log_transid);
+ wait_log_commit(trans, root, log_transid);
mutex_unlock(&root->log_mutex);
- return 0;
+ return ctx->log_ret;
}
+ ASSERT(log_transid == root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(trans, root, root->log_transid - 1);
+ wait_log_commit(trans, root, log_transid - 1);
while (1) {
- unsigned long batch = root->log_batch;
- if (root->log_multiple_pids) {
+ int batch = atomic_read(&root->log_batch);
+ /* when we're on an ssd, just kick the log commit out */
+ if (!btrfs_test_opt(root, SSD) &&
+ test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
}
wait_for_writer(trans, root);
- if (batch == root->log_batch)
+ if (batch == atomic_read(&root->log_batch))
break;
}
/* bail out if we need to do a full commit */
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
ret = -EAGAIN;
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
goto out;
}
- log_transid = root->log_transid;
if (log_transid % 2 == 0)
mark = EXTENT_DIRTY;
else
@@ -2005,16 +2523,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
*/
+ blk_start_plug(&plug);
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
- BUG_ON(ret);
+ if (ret) {
+ blk_finish_plug(&plug);
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_free_logged_extents(log, log_transid);
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
btrfs_set_root_node(&log->root_item, log->node);
- root->log_batch = 0;
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
- smp_mb();
/*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
@@ -2022,9 +2546,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
+ btrfs_init_log_ctx(&root_log_ctx);
+
mutex_lock(&log_root_tree->log_mutex);
- log_root_tree->log_batch++;
+ atomic_inc(&log_root_tree->log_batch);
atomic_inc(&log_root_tree->log_writers);
+
+ index2 = log_root_tree->log_transid % 2;
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+ root_log_ctx.log_transid = log_root_tree->log_transid;
+
mutex_unlock(&log_root_tree->log_mutex);
ret = update_log_root(trans, log);
@@ -2037,28 +2568,47 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
- BUG_ON(ret != -ENOSPC);
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ if (!list_empty(&root_log_ctx.list))
+ list_del_init(&root_log_ctx.list);
+
+ blk_finish_plug(&plug);
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
+ if (ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out;
+ }
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out;
}
- index2 = log_root_tree->log_transid % 2;
+ if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+
+ index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
+ blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid);
+ root_log_ctx.log_transid);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = 0;
+ ret = root_log_ctx.log_ret;
goto out;
}
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid - 1);
+ root_log_ctx.log_transid - 1);
}
wait_for_writer(trans, log_root_tree);
@@ -2067,28 +2617,38 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
}
- ret = btrfs_write_and_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
- BUG_ON(ret);
+ ret = btrfs_write_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ blk_finish_plug(&plug);
+ if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_free_logged_extents(log, log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_NEW | EXTENT_DIRTY);
+ btrfs_wait_logged_extents(log, log_transid);
- btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+ btrfs_set_super_log_root(root->fs_info->super_for_commit,
log_root_tree->node->start);
- btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+ btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
btrfs_header_level(log_root_tree->node));
- log_root_tree->log_batch = 0;
log_root_tree->log_transid++;
- smp_mb();
-
mutex_unlock(&log_root_tree->log_mutex);
/*
@@ -2098,8 +2658,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* the running transaction open, so a full commit can't hop
* in and cause problems either.
*/
- write_ctree_super(trans, root->fs_info->tree_root, 1);
- ret = 0;
+ ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
+ if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_wake_log_root;
+ }
mutex_lock(&root->log_mutex);
if (root->last_log_commit < log_transid)
@@ -2107,13 +2671,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
out_wake_log_root:
+ /*
+ * We don't need to take log_mutex here because we know all the
+ * other tasks are blocked.
+ */
+ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ log_root_tree->log_transid_committed++;
atomic_set(&log_root_tree->log_commit[index2], 0);
- smp_mb();
+ mutex_unlock(&log_root_tree->log_mutex);
+
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
+ /* See above. */
+ btrfs_remove_all_log_ctxs(root, index1, ret);
+
+ mutex_lock(&root->log_mutex);
+ root->log_transid_committed++;
atomic_set(&root->log_commit[index1], 0);
- smp_mb();
+ mutex_unlock(&root->log_mutex);
+
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -2131,11 +2710,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
};
ret = walk_log_tree(trans, log, &wc);
- BUG_ON(ret);
+ /* I don't think this can happen but just in case */
+ if (ret)
+ btrfs_abort_transaction(trans, log, ret);
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
- 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+ 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+ NULL);
if (ret)
break;
@@ -2143,6 +2725,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
}
+ /*
+ * We may have short-circuited the log tree with the full commit logic
+ * and left ordered extents on our list, so clear these out to keep us
+ * from leaking inodes and memory.
+ */
+ btrfs_free_logged_extents(log, 0);
+ btrfs_free_logged_extents(log, 1);
+
free_extent_buffer(log->node);
kfree(log);
}
@@ -2202,6 +2792,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
int ret;
int err = 0;
int bytes_del = 0;
+ u64 dir_ino = btrfs_ino(dir);
if (BTRFS_I(dir)->logged_trans < trans->transid)
return 0;
@@ -2214,10 +2805,12 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
log = root->log_root;
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
- di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
+ di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
name, name_len, -1);
if (IS_ERR(di)) {
err = PTR_ERR(di);
@@ -2226,10 +2819,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
}
- btrfs_release_path(log, path);
- di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
+ btrfs_release_path(path);
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
index, name, name_len, -1);
if (IS_ERR(di)) {
err = PTR_ERR(di);
@@ -2238,7 +2834,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
}
/* update the directory size in the log to reflect the names
@@ -2247,10 +2846,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
if (bytes_del) {
struct btrfs_key key;
- key.objectid = dir->i_ino;
+ key.objectid = dir_ino;
key.offset = 0;
key.type = BTRFS_INODE_ITEM_KEY;
- btrfs_release_path(log, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
if (ret < 0) {
@@ -2272,15 +2871,18 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
} else
ret = 0;
- btrfs_release_path(log, path);
+ btrfs_release_path(path);
}
fail:
btrfs_free_path(path);
+out_unlock:
mutex_unlock(&BTRFS_I(dir)->log_mutex);
if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
- }
+ } else if (ret < 0)
+ btrfs_abort_transaction(trans, root, ret);
+
btrfs_end_log_trans(root);
return err;
@@ -2305,13 +2907,14 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
log = root->log_root;
mutex_lock(&BTRFS_I(inode)->log_mutex);
- ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
+ ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
- }
+ } else if (ret < 0 && ret != -ENOENT)
+ btrfs_abort_transaction(trans, root, ret);
btrfs_end_log_trans(root);
return ret;
@@ -2346,7 +2949,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_dir_log_item);
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(log, path);
+ btrfs_release_path(path);
return 0;
}
@@ -2362,7 +2965,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
- struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src;
int err = 0;
@@ -2371,37 +2973,33 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
int nritems;
u64 first_offset = min_offset;
u64 last_offset = (u64)-1;
+ u64 ino = btrfs_ino(inode);
log = root->log_root;
- max_key.objectid = inode->i_ino;
- max_key.offset = (u64)-1;
- max_key.type = key_type;
- min_key.objectid = inode->i_ino;
+ min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = min_offset;
path->keep_locks = 1;
- ret = btrfs_search_forward(root, &min_key, &max_key,
- path, 0, trans->transid);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
/*
* we didn't find anything from this transaction, see if there
* is anything at all
*/
- if (ret != 0 || min_key.objectid != inode->i_ino ||
- min_key.type != key_type) {
- min_key.objectid = inode->i_ino;
+ if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+ min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = (u64)-1;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret < 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
- ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, key_type);
/* if ret == 0 there are items for this type,
* create a range to tell us the last key of this type.
@@ -2419,7 +3017,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
/* go backward to find any previous key */
- ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, key_type);
if (ret == 0) {
struct btrfs_key tmp;
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
@@ -2434,14 +3032,12 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
}
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
/* find the first key from this transaction again */
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (ret != 0) {
- WARN_ON(1);
+ if (WARN_ON(ret != 0))
goto done;
- }
/*
* we have a block from this transaction, log every item in it
@@ -2454,8 +3050,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
for (i = path->slots[0]; i < nritems; i++) {
btrfs_item_key_to_cpu(src, &min_key, i);
- if (min_key.objectid != inode->i_ino ||
- min_key.type != key_type)
+ if (min_key.objectid != ino || min_key.type != key_type)
goto done;
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
@@ -2476,7 +3071,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
goto done;
}
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+ if (tmp.objectid != ino || tmp.type != key_type) {
last_offset = (u64)-1;
goto done;
}
@@ -2492,8 +3087,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
}
done:
- btrfs_release_path(root, path);
- btrfs_release_path(log, dst_path);
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
if (err == 0) {
*last_offset_ret = last_offset;
@@ -2502,8 +3097,7 @@ done:
* is valid
*/
ret = insert_dir_log_key(trans, log, path, key_type,
- inode->i_ino, first_offset,
- last_offset);
+ ino, first_offset, last_offset);
if (ret)
err = ret;
}
@@ -2567,6 +3161,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
+ int start_slot;
key.objectid = objectid;
key.type = max_key_type;
@@ -2574,7 +3169,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Logic error */
if (ret < 0)
break;
@@ -2588,30 +3183,123 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
if (found_key.objectid != objectid)
break;
- ret = btrfs_del_item(trans, log, path);
- BUG_ON(ret);
- btrfs_release_path(log, path);
+ found_key.offset = 0;
+ found_key.type = 0;
+ ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
+ &start_slot);
+
+ ret = btrfs_del_items(trans, log, path, start_slot,
+ path->slots[0] - start_slot + 1);
+ /*
+ * If start slot isn't 0 then we don't need to re-search, we've
+ * found the last guy with the objectid in this tree.
+ */
+ if (ret || start_slot != 0)
+ break;
+ btrfs_release_path(path);
}
- btrfs_release_path(log, path);
+ btrfs_release_path(path);
+ if (ret > 0)
+ ret = 0;
return ret;
}
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+ struct extent_buffer *leaf,
+ struct btrfs_inode_item *item,
+ struct inode *inode, int log_inode_only)
+{
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ if (log_inode_only) {
+ /* set the generation to zero so the recovery code
+ * can tell the difference between a logging
+ * just to say 'this inode exists' and a logging
+ * to say 'update this inode with these values'
+ */
+ btrfs_set_token_inode_generation(leaf, item, 0, &token);
+ btrfs_set_token_inode_size(leaf, item, 0, &token);
+ } else {
+ btrfs_set_token_inode_generation(leaf, item,
+ BTRFS_I(inode)->generation,
+ &token);
+ btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
+ }
+
+ btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+ btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+ btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+ btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_nsec, &token);
+
+ btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+ &token);
+
+ btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+ btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+ btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+ btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+}
+
+static int log_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, struct btrfs_path *path,
+ struct inode *inode)
+{
+ struct btrfs_inode_item *inode_item;
+ int ret;
+
+ ret = btrfs_insert_empty_item(trans, log, path,
+ &BTRFS_I(inode)->location,
+ sizeof(*inode_item));
+ if (ret && ret != -EEXIST)
+ return ret;
+ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+ btrfs_release_path(path);
+ return 0;
+}
+
static noinline int copy_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *log,
+ struct inode *inode,
struct btrfs_path *dst_path,
- struct extent_buffer *src,
+ struct btrfs_path *src_path, u64 *last_extent,
int start_slot, int nr, int inode_only)
{
unsigned long src_offset;
unsigned long dst_offset;
+ struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
struct btrfs_file_extent_item *extent;
struct btrfs_inode_item *inode_item;
+ struct extent_buffer *src = src_path->nodes[0];
+ struct btrfs_key first_key, last_key, key;
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
char *ins_data;
int i;
struct list_head ordered_sums;
+ int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ bool has_extents = false;
+ bool need_find_last_extent = (*last_extent == 0);
+ bool done = false;
INIT_LIST_HEAD(&ordered_sums);
@@ -2620,6 +3308,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
if (!ins_data)
return -ENOMEM;
+ first_key.objectid = (u64)-1;
+
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
@@ -2640,36 +3330,50 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset = btrfs_item_ptr_offset(src, start_slot + i);
- copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
- src_offset, ins_sizes[i]);
+ if ((i == (nr - 1)))
+ last_key = ins_keys[i];
- if (inode_only == LOG_INODE_EXISTS &&
- ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+ if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
inode_item = btrfs_item_ptr(dst_path->nodes[0],
dst_path->slots[0],
struct btrfs_inode_item);
- btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+ fill_inode_item(trans, dst_path->nodes[0], inode_item,
+ inode, inode_only == LOG_INODE_EXISTS);
+ } else {
+ copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+ src_offset, ins_sizes[i]);
+ }
- /* set the generation to zero so the recover code
- * can tell the difference between an logging
- * just to say 'this inode exists' and a logging
- * to say 'update this inode with these values'
- */
- btrfs_set_inode_generation(dst_path->nodes[0],
- inode_item, 0);
+ /*
+ * We set need_find_last_extent here in case we know we were
+ * processing other items and then walk into the first extent in
+ * the inode. If we don't hit an extent then nothing changes,
+ * we'll do the last search the next time around.
+ */
+ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
+ has_extents = true;
+ if (need_find_last_extent &&
+ first_key.objectid == (u64)-1)
+ first_key = ins_keys[i];
+ } else {
+ need_find_last_extent = false;
}
+
/* take a reference on file data extents so that truncates
* or deletes of this inode don't have to relog the inode
* again
*/
- if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+ if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+ !skip_csum) {
int found_type;
extent = btrfs_item_ptr(src, start_slot + i,
struct btrfs_file_extent_item);
+ if (btrfs_file_extent_generation(src, extent) < trans->transid)
+ continue;
+
found_type = btrfs_file_extent_type(src, extent);
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (found_type == BTRFS_FILE_EXTENT_REG) {
u64 ds, dl, cs, cl;
ds = btrfs_file_extent_disk_bytenr(src,
extent);
@@ -2691,14 +3395,18 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(
log->fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
- &ordered_sums);
- BUG_ON(ret);
+ &ordered_sums, 0);
+ if (ret) {
+ btrfs_release_path(dst_path);
+ kfree(ins_data);
+ return ret;
+ }
}
}
}
btrfs_mark_buffer_dirty(dst_path->nodes[0]);
- btrfs_release_path(log, dst_path);
+ btrfs_release_path(dst_path);
kfree(ins_data);
/*
@@ -2715,6 +3423,414 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
list_del(&sums->list);
kfree(sums);
}
+
+ if (!has_extents)
+ return ret;
+
+ /*
+ * Because we use btrfs_search_forward we could skip leaves that were
+ * not modified and then assume *last_extent is valid when it really
+ * isn't. So back up to the previous leaf and read the end of the last
+ * extent before we go and fill in holes.
+ */
+ if (need_find_last_extent) {
+ u64 len;
+
+ ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ goto fill_holes;
+ if (src_path->slots[0])
+ src_path->slots[0]--;
+ src = src_path->nodes[0];
+ btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ goto fill_holes;
+ extent = btrfs_item_ptr(src, src_path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(src, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(src,
+ src_path->slots[0],
+ extent);
+ *last_extent = ALIGN(key.offset + len,
+ log->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(src, extent);
+ *last_extent = key.offset + len;
+ }
+ }
+fill_holes:
+ /* So we did prev_leaf, now we need to move to the next leaf, but a few
+ * things could have happened
+ *
+ * 1) A merge could have happened, so we could currently be on a leaf
+ * that holds what we were copying in the first place.
+ * 2) A split could have happened, and now not all of the items we want
+ * are on the same leaf.
+ *
+ * So we need to adjust how we search for holes, we need to drop the
+ * path and re-search for the first extent key we found, and then walk
+ * forward until we hit the last one we copied.
+ */
+ if (need_find_last_extent) {
+ /* btrfs_prev_leaf could return 1 without releasing the path */
+ btrfs_release_path(src_path);
+ ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
+ src_path, 0, 0);
+ if (ret < 0)
+ return ret;
+ ASSERT(ret == 0);
+ src = src_path->nodes[0];
+ i = src_path->slots[0];
+ } else {
+ i = start_slot;
+ }
+
+ /*
+ * Now walk the range we copied and insert hole extents for any gaps,
+ * so areas that previously had extents end up with explicit holes in
+ * the log.
+ */
+ while (!done) {
+ u64 offset, len;
+ u64 extent_end;
+
+ if (i >= btrfs_header_nritems(src_path->nodes[0])) {
+ ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
+ if (ret < 0)
+ return ret;
+ ASSERT(ret == 0);
+ src = src_path->nodes[0];
+ i = 0;
+ }
+
+ btrfs_item_key_to_cpu(src, &key, i);
+ if (!btrfs_comp_cpu_keys(&key, &last_key))
+ done = true;
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ i++;
+ continue;
+ }
+ extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(src, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(src, i, extent);
+ extent_end = ALIGN(key.offset + len, log->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(src, extent);
+ extent_end = key.offset + len;
+ }
+ i++;
+
+ if (*last_extent == key.offset) {
+ *last_extent = extent_end;
+ continue;
+ }
+ offset = *last_extent;
+ len = key.offset - *last_extent;
+ ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
+ offset, 0, 0, len, 0, len, 0,
+ 0, 0);
+ if (ret)
+ break;
+ *last_extent = offset + len;
+ }
+ /*
+ * Need to let the callers know we dropped the path so they should
+ * re-search.
+ */
+ if (!ret && need_find_last_extent)
+ ret = 1;
+ return ret;
+}
+
+static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct extent_map *em1, *em2;
+
+ em1 = list_entry(a, struct extent_map, list);
+ em2 = list_entry(b, struct extent_map, list);
+
+ if (em1->start < em2->start)
+ return -1;
+ else if (em1->start > em2->start)
+ return 1;
+ return 0;
+}
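
extent_cmp() is the comparator handed to list_sort() further down in btrfs_log_changed_extents(); list_sort() only requires the usual negative/zero/positive ordering and is a stable merge sort, so extents with equal start offsets keep their original list order. For reference, the call shape from <linux/list_sort.h> in this kernel era is:

/* void list_sort(void *priv, struct list_head *head,
 *		  int (*cmp)(void *priv, struct list_head *a,
 *			     struct list_head *b));
 */
list_sort(NULL, &extents, extent_cmp);	/* orders the list by em->start */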
+
+static int log_one_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct btrfs_root *root,
+ struct extent_map *em, struct btrfs_path *path,
+ struct list_head *logged_list)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct btrfs_ordered_extent *ordered;
+ struct list_head ordered_sums;
+ struct btrfs_map_token token;
+ struct btrfs_key key;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
+ u64 csum_offset;
+ u64 csum_len;
+ u64 extent_offset = em->start - em->orig_start;
+ u64 block_len;
+ int ret;
+ bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ int extent_inserted = 0;
+
+ INIT_LIST_HEAD(&ordered_sums);
+ btrfs_init_map_token(&token);
+
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+ em->start + em->len, NULL, 0, 1,
+ sizeof(*fi), &extent_inserted);
+ if (ret)
+ return ret;
+
+ if (!extent_inserted) {
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = em->start;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(*fi));
+ if (ret)
+ return ret;
+ }
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+ &token);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ skip_csum = true;
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_PREALLOC,
+ &token);
+ } else {
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG,
+ &token);
+ if (em->block_start == EXTENT_MAP_HOLE)
+ skip_csum = true;
+ }
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start,
+ &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start -
+ extent_offset, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+ &token);
+ }
+
+ btrfs_set_token_file_extent_offset(leaf, fi,
+ em->start - em->orig_start,
+ &token);
+ btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+ btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
+ btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+ &token);
+ btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+ if (ret) {
+ return ret;
+ }
+
+ if (skip_csum)
+ return 0;
+
+ /*
+ * First check and see if our csums are on our outstanding ordered
+ * extents.
+ */
+ list_for_each_entry(ordered, logged_list, log_list) {
+ struct btrfs_ordered_sum *sum;
+
+ if (!mod_len)
+ break;
+
+ if (ordered->file_offset + ordered->len <= mod_start ||
+ mod_start + mod_len <= ordered->file_offset)
+ continue;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this
+ * ordered extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered->file_offset + ordered->len >=
+ mod_start + mod_len)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered->file_offset + ordered->len <
+ mod_start + mod_len) {
+ mod_len = (mod_start + mod_len) -
+ (ordered->file_offset + ordered->len);
+ mod_start = ordered->file_offset +
+ ordered->len;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
+ &ordered->flags))
+ continue;
+
+ if (ordered->csum_bytes_left) {
+ btrfs_start_ordered_extent(inode, ordered, 0);
+ wait_event(ordered->wait,
+ ordered->csum_bytes_left == 0);
+ }
+
+ list_for_each_entry(sum, &ordered->list, list) {
+ ret = btrfs_csum_file_blocks(trans, log, sum);
+ if (ret)
+ goto unlocked;
+ }
+
+ }
+unlocked:
+
+ if (!mod_len || ret)
+ return ret;
+
+ if (em->compress_type) {
+ csum_offset = 0;
+ csum_len = block_len;
+ } else {
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
+ }
+
+ /* block start is already adjusted for the file extent offset. */
+ ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
+ em->block_start + csum_offset,
+ em->block_start + csum_offset +
+ csum_len - 1, &ordered_sums, 0);
+ if (ret)
+ return ret;
+
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+
+ return ret;
+}
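
The mod_start/mod_len handling in log_one_extent() above is plain interval arithmetic: the range still needing checksums is trimmed by whatever part of it an overlapping ordered extent already covers, except when the ordered extent sits strictly inside the logged range, where the range is deliberately left alone (logging a few extra csums is harmless). A self-contained user-space restatement of that clamping, with a worked example (the function and variable names are illustrative, not btrfs symbols):

#include <stdio.h>
#include <stdint.h>

/* Trim [*mod_start, *mod_start + *mod_len) by the portion that an
 * overlapping ordered extent [ord_start, ord_start + ord_len) covers. */
static void clamp_logged_range(uint64_t *mod_start, uint64_t *mod_len,
			       uint64_t ord_start, uint64_t ord_len)
{
	uint64_t mod_end = *mod_start + *mod_len;
	uint64_t ord_end = ord_start + ord_len;

	if (ord_end <= *mod_start || mod_end <= ord_start)
		return;				/* no overlap at all */

	if (ord_start > *mod_start) {
		if (ord_end >= mod_end)
			*mod_len = ord_start - *mod_start;
		/* else: ordered extent strictly inside, leave range alone */
	} else if (ord_end < mod_end) {
		*mod_len = mod_end - ord_end;
		*mod_start = ord_end;
	} else {
		*mod_len = 0;			/* range fully covered */
	}
}

int main(void)
{
	uint64_t start = 4096, len = 8192;	/* logged range [4096, 12288) */

	clamp_logged_range(&start, &len, 0, 8192); /* ordered [0, 8192) */
	printf("start=%llu len=%llu\n",
	       (unsigned long long)start, (unsigned long long)len);
	/* prints start=8192 len=4096 */
	return 0;
}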
+
+static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct list_head *logged_list)
+{
+ struct extent_map *em, *n;
+ struct list_head extents;
+ struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+ u64 test_gen;
+ int ret = 0;
+ int num = 0;
+
+ INIT_LIST_HEAD(&extents);
+
+ write_lock(&tree->lock);
+ test_gen = root->fs_info->last_trans_committed;
+
+ list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+ list_del_init(&em->list);
+
+ /*
+ * Just an arbitrary cutoff: logging gets very CPU intensive once
+ * there are a lot of extents, and past a certain point simply
+ * committing the transaction is faster.
+ */
+ if (++num > 32768) {
+ list_del_init(&tree->modified_extents);
+ ret = -EFBIG;
+ goto process;
+ }
+
+ if (em->generation <= test_gen)
+ continue;
+ /* Need a ref to keep it from getting evicted from cache */
+ atomic_inc(&em->refs);
+ set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ list_add_tail(&em->list, &extents);
+ num++;
+ }
+
+ list_sort(NULL, &extents, extent_cmp);
+
+process:
+ while (!list_empty(&extents)) {
+ em = list_entry(extents.next, struct extent_map, list);
+
+ list_del_init(&em->list);
+
+ /*
+ * If we had an error we just need to delete everybody from our
+ * private list.
+ */
+ if (ret) {
+ clear_em_logging(tree, em);
+ free_extent_map(em);
+ continue;
+ }
+
+ write_unlock(&tree->lock);
+
+ ret = log_one_extent(trans, inode, root, em, path, logged_list);
+ write_lock(&tree->lock);
+ clear_em_logging(tree, em);
+ free_extent_map(em);
+ }
+ WARN_ON(!list_empty(&extents));
+ write_unlock(&tree->lock);
+
+ btrfs_release_path(path);
return ret;
}
@@ -2742,13 +3858,15 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
+ LIST_HEAD(logged_list);
+ u64 last_extent = 0;
int err = 0;
int ret;
int nritems;
int ins_start_slot = 0;
int ins_nr;
-
- log = root->log_root;
+ bool fast_search = false;
+ u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
@@ -2759,24 +3877,38 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- min_key.objectid = inode->i_ino;
+ min_key.objectid = ino;
min_key.type = BTRFS_INODE_ITEM_KEY;
min_key.offset = 0;
- max_key.objectid = inode->i_ino;
+ max_key.objectid = ino;
- /* today the code can only do partial logging of directories */
- if (!S_ISDIR(inode->i_mode))
- inode_only = LOG_INODE_ALL;
- if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+ /* today the code can only do partial logging of directories */
+ if (S_ISDIR(inode->i_mode) ||
+ (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags) &&
+ inode_only == LOG_INODE_EXISTS))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
+ /* Only run delayed items if we are a dir or a new file */
+ if (S_ISDIR(inode->i_mode) ||
+ BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+ ret = btrfs_commit_inode_delayed_items(trans, inode);
+ if (ret) {
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
+ }
+ }
+
mutex_lock(&BTRFS_I(inode)->log_mutex);
+ btrfs_get_logged_extents(inode, &logged_list);
+
/*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
@@ -2786,10 +3918,33 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path,
- inode->i_ino, max_key_type);
+ ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
- ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+ if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
+ ret = btrfs_truncate_inode_items(trans, log,
+ inode, 0, 0);
+ } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags) ||
+ inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_ALL)
+ fast_search = true;
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ ret = drop_objectid_items(trans, log, path, ino,
+ max_key.type);
+ } else {
+ if (inode_only == LOG_INODE_ALL)
+ fast_search = true;
+ ret = log_inode_item(trans, log, dst_path, inode);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+ goto log_extents;
+ }
+
}
if (ret) {
err = ret;
@@ -2799,13 +3954,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
while (1) {
ins_nr = 0;
- ret = btrfs_search_forward(root, &min_key, &max_key,
- path, 0, trans->transid);
+ ret = btrfs_search_forward(root, &min_key,
+ path, trans->transid);
if (ret != 0)
break;
again:
/* note, ins_nr might be > 0 here, cleanup outside the loop */
- if (min_key.objectid != inode->i_ino)
+ if (min_key.objectid != ino)
break;
if (min_key.type > max_key.type)
break;
@@ -2820,11 +3975,15 @@ again:
goto next_slot;
}
- ret = copy_items(trans, log, dst_path, src, ins_start_slot,
- ins_nr, inode_only);
- if (ret) {
+ ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ins_start_slot, ins_nr, inode_only);
+ if (ret < 0) {
err = ret;
goto out_unlock;
+ } if (ret) {
+ ins_nr = 0;
+ btrfs_release_path(path);
+ continue;
}
ins_nr = 1;
ins_start_slot = path->slots[0];
@@ -2838,40 +3997,59 @@ next_slot:
goto again;
}
if (ins_nr) {
- ret = copy_items(trans, log, dst_path, src,
- ins_start_slot,
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, ins_start_slot,
ins_nr, inode_only);
- if (ret) {
+ if (ret < 0) {
err = ret;
goto out_unlock;
}
+ ret = 0;
ins_nr = 0;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- if (min_key.offset < (u64)-1)
+ if (min_key.offset < (u64)-1) {
min_key.offset++;
- else if (min_key.type < (u8)-1)
+ } else if (min_key.type < max_key.type) {
min_key.type++;
- else if (min_key.objectid < (u64)-1)
- min_key.objectid++;
- else
+ min_key.offset = 0;
+ } else {
break;
+ }
}
if (ins_nr) {
- ret = copy_items(trans, log, dst_path, src,
- ins_start_slot,
- ins_nr, inode_only);
- if (ret) {
+ ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ins_start_slot, ins_nr, inode_only);
+ if (ret < 0) {
err = ret;
goto out_unlock;
}
+ ret = 0;
ins_nr = 0;
}
- WARN_ON(ins_nr);
+
+log_extents:
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ if (fast_search) {
+ ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+ &logged_list);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+ } else if (inode_only == LOG_INODE_ALL) {
+ struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em, *n;
+
+ write_lock(&tree->lock);
+ list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+ list_del_init(&em->list);
+ write_unlock(&tree->lock);
+ }
+
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
- btrfs_release_path(root, path);
- btrfs_release_path(log, dst_path);
ret = log_directory_changes(trans, root, inode, path, dst_path);
if (ret) {
err = ret;
@@ -2879,7 +4057,12 @@ next_slot:
}
}
BTRFS_I(inode)->logged_trans = trans->transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
+ if (unlikely(err))
+ btrfs_put_logged_extents(&logged_list);
+ else
+ btrfs_submit_logged_extents(&logged_list, log);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
@@ -2902,6 +4085,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
int ret = 0;
struct btrfs_root *root;
struct dentry *old_parent = NULL;
+ struct inode *orig_inode = inode;
/*
* for regular files, if its inode is already on disk, we don't
@@ -2921,7 +4105,14 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
}
while (1) {
- BTRFS_I(inode)->logged_trans = trans->transid;
+ /*
+ * If we are logging a directory then we start with our inode,
+ * not our parent's inode, so we need to skip setting the
+ * logged_trans so that further down in the log code we don't
+ * think this inode has already been logged.
+ */
+ if (inode != orig_inode)
+ BTRFS_I(inode)->logged_trans = trans->transid;
smp_mb();
if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
@@ -2931,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
* make sure any commits to the log are forced
* to be full commits
*/
- root->fs_info->last_trans_log_full_commit =
- trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 1;
break;
}
@@ -2954,30 +4144,16 @@ out:
return ret;
}
-static int inode_in_log(struct btrfs_trans_handle *trans,
- struct inode *inode)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
-
- mutex_lock(&root->log_mutex);
- if (BTRFS_I(inode)->logged_trans == trans->transid &&
- BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
- ret = 1;
- mutex_unlock(&root->log_mutex);
- return ret;
-}
-
-
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
* only logging is done of any parent directories that are older than
* the last committed transaction
*/
-int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only)
+static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only,
+ struct btrfs_log_ctx *ctx)
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
@@ -2992,6 +4168,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
+ /*
+ * The previous transaction commit did not complete, so we need to do
+ * a full commit ourselves.
+ */
if (root->fs_info->last_trans_log_full_commit >
root->fs_info->last_trans_committed) {
ret = 1;
@@ -3009,14 +4189,14 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- if (inode_in_log(trans, inode)) {
+ if (btrfs_inode_in_log(inode, trans->transid)) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}
- ret = start_log_trans(trans, root);
+ ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_trans;
+ goto end_no_trans;
ret = btrfs_log_inode(trans, root, inode, inode_only);
if (ret)
@@ -3061,10 +4241,12 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
dput(old_parent);
if (ret < 0) {
- BUG_ON(ret != -ENOSPC);
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 1;
}
+
+ if (ret)
+ btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
end_no_trans:
return ret;
@@ -3077,12 +4259,14 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+ ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+ 0, ctx);
dput(parent);
return ret;
@@ -3107,17 +4291,27 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
.stage = 0,
};
- fs_info->log_root_recovering = 1;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
+ fs_info->log_root_recovering = 1;
trans = btrfs_start_transaction(fs_info->tree_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto error;
+ }
wc.trans = trans;
wc.pin = 1;
- walk_log_tree(trans, log_root_tree, &wc);
+ ret = walk_log_tree(trans, log_root_tree, &wc);
+ if (ret) {
+ btrfs_error(fs_info, ret, "Failed to pin buffers while "
+ "recovering log root tree.");
+ goto error;
+ }
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3126,8 +4320,12 @@ again:
while (1) {
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
- if (ret < 0)
- break;
+
+ if (ret < 0) {
+ btrfs_error(fs_info, ret,
+ "Couldn't find tree log root.");
+ goto error;
+ }
if (ret > 0) {
if (path->slots[0] == 0)
break;
@@ -3135,31 +4333,40 @@ again:
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- btrfs_release_path(log_root_tree, path);
+ btrfs_release_path(path);
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_fs_root_no_radix(log_root_tree,
- &found_key);
- BUG_ON(!log);
-
+ log = btrfs_read_fs_root(log_root_tree, &found_key);
+ if (IS_ERR(log)) {
+ ret = PTR_ERR(log);
+ btrfs_error(fs_info, ret,
+ "Couldn't read tree log root.");
+ goto error;
+ }
tmp_key.objectid = found_key.offset;
tmp_key.type = BTRFS_ROOT_ITEM_KEY;
tmp_key.offset = (u64)-1;
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
- BUG_ON(!wc.replay_dest);
+ if (IS_ERR(wc.replay_dest)) {
+ ret = PTR_ERR(wc.replay_dest);
+ free_extent_buffer(log->node);
+ free_extent_buffer(log->commit_root);
+ kfree(log);
+ btrfs_error(fs_info, ret, "Couldn't read target root "
+ "for tree log recovery.");
+ goto error;
+ }
wc.replay_dest->log_root = log;
btrfs_record_root_in_trans(trans, wc.replay_dest);
ret = walk_log_tree(trans, log, &wc);
- BUG_ON(ret);
- if (wc.stage == LOG_WALK_REPLAY_ALL) {
+ if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
path);
- BUG_ON(ret);
}
key.offset = found_key.offset - 1;
@@ -3168,10 +4375,13 @@ again:
free_extent_buffer(log->commit_root);
kfree(log);
+ if (ret)
+ goto error;
+
if (found_key.offset == 0)
break;
}
- btrfs_release_path(log_root_tree, path);
+ btrfs_release_path(path);
/* step one is to pin it all, step two is to replay just inodes */
if (wc.pin) {
@@ -3188,15 +4398,22 @@ again:
btrfs_free_path(path);
+ /* step 4: commit the transaction, which also unpins the blocks */
+ ret = btrfs_commit_transaction(trans, fs_info->tree_root);
+ if (ret)
+ return ret;
+
free_extent_buffer(log_root_tree->node);
log_root_tree->log_root = NULL;
fs_info->log_root_recovering = 0;
-
- /* step 4: commit the transaction, which also unpins the blocks */
- btrfs_commit_transaction(trans, fs_info->tree_root);
-
kfree(log_root_tree);
+
return 0;
+error:
+ if (wc.trans)
+ btrfs_end_transaction(wc.trans, fs_info->tree_root);
+ btrfs_free_path(path);
+ return ret;
}
/*
@@ -3286,6 +4503,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
root->fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+ return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 3dfae84c8cc..7f5b41bd537 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,17 +19,47 @@
#ifndef __TREE_LOG_
#define __TREE_LOG_
+#include "ctree.h"
+#include "transaction.h"
+
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+struct btrfs_log_ctx {
+ int log_ret;
+ int log_transid;
+ struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ INIT_LIST_HEAD(&ctx->list);
+}
+
+static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
+{
+ ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid;
+}
+
+static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
+{
+ return ACCESS_ONCE(fs_info->last_trans_log_full_commit) ==
+ trans->transid;
+}
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry);
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -38,12 +68,8 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct inode *inode, u64 dirid);
-int btrfs_join_running_log_trans(struct btrfs_root *root);
-int btrfs_end_log_trans(struct btrfs_root *root);
+void btrfs_end_log_trans(struct btrfs_root *root);
int btrfs_pin_log_trans(struct btrfs_root *root);
-int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
int for_rename);
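The struct btrfs_log_ctx introduced above is how a task that merely waits on a log commit picks up the result produced by whichever task actually commits it. Below is a minimal sketch of how a caller (for instance an fsync-style path) might thread the context through this API; transaction setup, inode locking, the BTRFS_NO_LOG_SYNC special case and the fallback to a full transaction commit are all omitted, and log_one_dentry() is a hypothetical wrapper, not part of this patch.

static int log_one_dentry(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry)
{
	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);	/* log_ret = 0, empty waiter list */

	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	if (ret)
		return ret;	/* caller would fall back to a full commit */

	/* commit the log, or wait for the task that does and reuse its result */
	return btrfs_sync_log(trans, root, &ctx);
}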
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 00000000000..840a38b2778
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ */
+
+#include <linux/slab.h>
+#include "ulist.h"
+#include "ctree.h"
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports are adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * A sample usage for ulists is the enumeration of directed graphs without
+ * visiting a node twice. The pseudo-code could look like this:
+ *
+ * ulist = ulist_alloc();
+ * ulist_add(ulist, root);
+ * ULIST_ITER_INIT(&uiter);
+ *
+ * while ((elem = ulist_next(ulist, &uiter))) {
+ * for (all child nodes n in elem)
+ * ulist_add(ulist, n);
+ * do something useful with the node;
+ * }
+ * ulist_free(ulist);
+ *
+ * This assumes the graph nodes are addressable by u64. This stems from the
+ * usage for tree enumeration in btrfs, where the logical addresses are
+ * 64 bit.
+ *
+ * It is also useful for tree enumeration, which could be done elegantly
+ * with recursion but is not possible due to kernel stack limitations. The
+ * loop would be similar to the above.
+ */
+
+/**
+ * ulist_init - freshly initialize a ulist
+ * @ulist: the ulist to initialize
+ *
+ * Note: don't use this function to init an already used ulist, use
+ * ulist_reinit instead.
+ */
+void ulist_init(struct ulist *ulist)
+{
+ INIT_LIST_HEAD(&ulist->nodes);
+ ulist->root = RB_ROOT;
+ ulist->nnodes = 0;
+}
+
+/**
+ * ulist_fini - free up additionally allocated memory for the ulist
+ * @ulist: the ulist from which to free the additional memory
+ *
+ * This is useful in cases where the base 'struct ulist' has been statically
+ * allocated.
+ */
+static void ulist_fini(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_node *next;
+
+ list_for_each_entry_safe(node, next, &ulist->nodes, list) {
+ kfree(node);
+ }
+ ulist->root = RB_ROOT;
+ INIT_LIST_HEAD(&ulist->nodes);
+}
+
+/**
+ * ulist_reinit - prepare a ulist for reuse
+ * @ulist: ulist to be reused
+ *
+ * Free up all additional memory allocated for the list elements and reinit
+ * the ulist.
+ */
+void ulist_reinit(struct ulist *ulist)
+{
+ ulist_fini(ulist);
+ ulist_init(ulist);
+}
+
+/**
+ * ulist_alloc - dynamically allocate a ulist
+ * @gfp_mask: allocation flags to use for the base allocation
+ *
+ * The allocated ulist will be returned in an initialized state.
+ */
+struct ulist *ulist_alloc(gfp_t gfp_mask)
+{
+ struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
+
+ if (!ulist)
+ return NULL;
+
+ ulist_init(ulist);
+
+ return ulist;
+}
+
+/**
+ * ulist_free - free dynamically allocated ulist
+ * @ulist: ulist to free
+ *
+ * It is not necessary to call ulist_fini before.
+ */
+void ulist_free(struct ulist *ulist)
+{
+ if (!ulist)
+ return;
+ ulist_fini(ulist);
+ kfree(ulist);
+}
+
+static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
+{
+ struct rb_node *n = ulist->root.rb_node;
+ struct ulist_node *u = NULL;
+
+ while (n) {
+ u = rb_entry(n, struct ulist_node, rb_node);
+ if (u->val < val)
+ n = n->rb_right;
+ else if (u->val > val)
+ n = n->rb_left;
+ else
+ return u;
+ }
+ return NULL;
+}
+
+static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
+{
+ struct rb_node **p = &ulist->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct ulist_node *cur = NULL;
+
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(parent, struct ulist_node, rb_node);
+
+ if (cur->val < ins->val)
+ p = &(*p)->rb_right;
+ else if (cur->val > ins->val)
+ p = &(*p)->rb_left;
+ else
+ return -EEXIST;
+ }
+ rb_link_node(&ins->rb_node, parent, p);
+ rb_insert_color(&ins->rb_node, &ulist->root);
+ return 0;
+}
+
+/**
+ * ulist_add - add an element to the ulist
+ * @ulist: ulist to add the element to
+ * @val: value to add to ulist
+ * @aux: auxiliary value to store along with val
+ * @gfp_mask: flags to use for allocation
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks write
+ * locking is needed
+ *
+ * Add an element to a ulist. The @val will only be added if it doesn't
+ * already exist. If it is added, the auxiliary value @aux is stored along with
+ * it. In case @val already exists in the ulist, @aux is ignored, even if
+ * it differs from the already stored value.
+ *
+ * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
+ * inserted.
+ * In case of allocation failure -ENOMEM is returned and the ulist stays
+ * unaltered.
+ */
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
+{
+ return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+ u64 *old_aux, gfp_t gfp_mask)
+{
+ int ret;
+ struct ulist_node *node;
+
+ node = ulist_rbtree_search(ulist, val);
+ if (node) {
+ if (old_aux)
+ *old_aux = node->aux;
+ return 0;
+ }
+ node = kmalloc(sizeof(*node), gfp_mask);
+ if (!node)
+ return -ENOMEM;
+
+ node->val = val;
+ node->aux = aux;
+#ifdef CONFIG_BTRFS_DEBUG
+ node->seqnum = ulist->nnodes;
+#endif
+
+ ret = ulist_rbtree_insert(ulist, node);
+ ASSERT(!ret);
+ list_add_tail(&node->list, &ulist->nodes);
+ ulist->nnodes++;
+
+ return 1;
+}
+
+/**
+ * ulist_next - iterate ulist
+ * @ulist: ulist to iterate
+ * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks only read
+ * locking is needed
+ *
+ * This function is used to iterate a ulist.
+ * It returns the next element from the ulist or %NULL when the
+ * end is reached. No guarantee is made with respect to the order in which
+ * the elements are returned. They might neither be returned in order of
+ * addition nor in ascending order.
+ * It is allowed to call ulist_add during an enumeration. Newly added items
+ * are guaranteed to show up in the running enumeration.
+ */
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
+{
+ struct ulist_node *node;
+
+ if (list_empty(&ulist->nodes))
+ return NULL;
+ if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes)
+ return NULL;
+ if (uiter->cur_list) {
+ uiter->cur_list = uiter->cur_list->next;
+ } else {
+ uiter->cur_list = ulist->nodes.next;
+#ifdef CONFIG_BTRFS_DEBUG
+ uiter->i = 0;
+#endif
+ }
+ node = list_entry(uiter->cur_list, struct ulist_node, list);
+#ifdef CONFIG_BTRFS_DEBUG
+ ASSERT(node->seqnum == uiter->i);
+ ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
+ uiter->i++;
+#endif
+ return node;
+}
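As a concrete counterpart to the pseudo-code in the header comment, here is a self-contained sketch (not part of this patch) of the iteration side of the API; visit() is a hypothetical callback:

static void walk_ulist_values(struct ulist *ulist,
			      void (*visit)(u64 val, u64 aux))
{
	struct ulist_iterator uiter;
	struct ulist_node *node;

	ULIST_ITER_INIT(&uiter);
	while ((node = ulist_next(ulist, &uiter)))
		visit(node->val, node->aux);
}

A typical call site allocates with ulist_alloc(GFP_NOFS), fills the set with ulist_add() (which returns 1 for a newly inserted value, 0 if it was already present and -ENOMEM on allocation failure), walks it as above and finally releases everything with ulist_free().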
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 00000000000..7f78cbf5cf4
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ *
+ */
+
+#ifndef __ULIST__
+#define __ULIST__
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports are adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ */
+struct ulist_iterator {
+#ifdef CONFIG_BTRFS_DEBUG
+ int i;
+#endif
+ struct list_head *cur_list; /* hint to start search */
+};
+
+/*
+ * element of the list
+ */
+struct ulist_node {
+ u64 val; /* value to store */
+ u64 aux; /* auxiliary value saved along with the val */
+
+#ifdef CONFIG_BTRFS_DEBUG
+ int seqnum; /* sequence number when this node was added */
+#endif
+
+ struct list_head list; /* used to link node */
+ struct rb_node rb_node; /* used to speed up search */
+};
+
+struct ulist {
+ /*
+ * number of elements stored in list
+ */
+ unsigned long nnodes;
+
+ struct list_head nodes;
+ struct rb_root root;
+};
+
+void ulist_init(struct ulist *ulist);
+void ulist_reinit(struct ulist *ulist);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
+void ulist_free(struct ulist *ulist);
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+ u64 *old_aux, gfp_t gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist,
+ struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL)
+
+#endif
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
new file mode 100644
index 00000000000..f6a4c03ee7d
--- /dev/null
+++ b/fs/btrfs/uuid-tree.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (C) STRATO AG 2013. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/uuid.h>
+#include <asm/unaligned.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+
+static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
+{
+ key->type = type;
+ key->objectid = get_unaligned_le64(uuid);
+ key->offset = get_unaligned_le64(uuid + sizeof(u64));
+}
+
+/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */
+static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
+ u8 type, u64 subid)
+{
+ int ret;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *eb;
+ int slot;
+ u32 item_size;
+ unsigned long offset;
+ struct btrfs_key key;
+
+ if (WARN_ON_ONCE(!uuid_root)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ btrfs_uuid_to_key(uuid, type, &key);
+ ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ offset = btrfs_item_ptr_offset(eb, slot);
+ ret = -ENOENT;
+
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+ goto out;
+ }
+ while (item_size) {
+ __le64 data;
+
+ read_extent_buffer(eb, &data, offset, sizeof(data));
+ if (le64_to_cpu(data) == subid) {
+ ret = 0;
+ break;
+ }
+ offset += sizeof(data);
+ item_size -= sizeof(data);
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid_cpu)
+{
+ int ret;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ int slot;
+ unsigned long offset;
+ __le64 subid_le;
+
+ ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu);
+ if (ret != -ENOENT)
+ return ret;
+
+ if (WARN_ON_ONCE(!uuid_root)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_uuid_to_key(uuid, type, &key);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
+ sizeof(subid_le));
+ if (ret >= 0) {
+ /* Add an item for the type for the first time */
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+ } else if (ret == -EEXIST) {
+ /*
+ * An item with that type already exists.
+ * Extend the item and store the new subid at the end.
+ */
+ btrfs_extend_item(uuid_root, path, sizeof(subid_le));
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+ offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
+ } else if (ret < 0) {
+ btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d "
+ "(0x%016llx, 0x%016llx) type %u!",
+ ret, (unsigned long long)key.objectid,
+ (unsigned long long)key.offset, type);
+ goto out;
+ }
+
+ ret = 0;
+ subid_le = cpu_to_le64(subid_cpu);
+ write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
+ struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid)
+{
+ int ret;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ int slot;
+ unsigned long offset;
+ u32 item_size;
+ unsigned long move_dst;
+ unsigned long move_src;
+ unsigned long move_len;
+
+ if (WARN_ON_ONCE(!uuid_root)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_uuid_to_key(uuid, type, &key);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
+ if (ret < 0) {
+ btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!",
+ ret);
+ goto out;
+ }
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+ item_size = btrfs_item_size_nr(eb, slot);
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+ ret = -ENOENT;
+ goto out;
+ }
+ while (item_size) {
+ __le64 read_subid;
+
+ read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid));
+ if (le64_to_cpu(read_subid) == subid)
+ break;
+ offset += sizeof(read_subid);
+ item_size -= sizeof(read_subid);
+ }
+
+ if (!item_size) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ item_size = btrfs_item_size_nr(eb, slot);
+ if (item_size == sizeof(subid)) {
+ ret = btrfs_del_item(trans, uuid_root, path);
+ goto out;
+ }
+
+ move_dst = offset;
+ move_src = offset + sizeof(subid);
+ move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
+ memmove_extent_buffer(eb, move_dst, move_src, move_len);
+ btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /* 1 - for the uuid item */
+ trans = btrfs_start_transaction(uuid_root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid);
+ btrfs_end_transaction(trans, uuid_root);
+
+out:
+ return ret;
+}
+
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
+ int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
+ u64))
+{
+ struct btrfs_root *root = fs_info->uuid_root;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret = 0;
+ struct extent_buffer *leaf;
+ int slot;
+ u32 item_size;
+ unsigned long offset;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+
+again_search_slot:
+ path->keep_locks = 1;
+ ret = btrfs_search_forward(root, &key, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto out;
+ }
+
+ while (1) {
+ cond_resched();
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.type != BTRFS_UUID_KEY_SUBVOL &&
+ key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+ goto skip;
+
+ offset = btrfs_item_ptr_offset(leaf, slot);
+ item_size = btrfs_item_size_nr(leaf, slot);
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(fs_info, "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+ goto skip;
+ }
+ while (item_size) {
+ u8 uuid[BTRFS_UUID_SIZE];
+ __le64 subid_le;
+ u64 subid_cpu;
+
+ put_unaligned_le64(key.objectid, uuid);
+ put_unaligned_le64(key.offset, uuid + sizeof(u64));
+ read_extent_buffer(leaf, &subid_le, offset,
+ sizeof(subid_le));
+ subid_cpu = le64_to_cpu(subid_le);
+ ret = check_func(fs_info, uuid, key.type, subid_cpu);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ btrfs_release_path(path);
+ ret = btrfs_uuid_iter_rem(root, uuid, key.type,
+ subid_cpu);
+ if (ret == 0) {
+ /*
+ * this might look inefficient, but the
+ * justification is that check_func
+ * returning 1 is the exception, and
+ * that in the regular case only one
+ * entry per UUID exists.
+ */
+ goto again_search_slot;
+ }
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ }
+ item_size -= sizeof(subid_le);
+ offset += sizeof(subid_le);
+ }
+
+skip:
+ ret = btrfs_next_item(root, path);
+ if (ret == 0)
+ continue;
+ else if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
+ return 0;
+}
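For reference, the item layout the functions above manipulate: the key is built from the two little-endian halves of the UUID (btrfs_uuid_to_key), and the item body is a flat array of __le64 subvolume ids, which is why every path insists on item_size being a multiple of sizeof(u64) and why btrfs_uuid_tree_add() grows an existing item by exactly sizeof(subid_le). A small illustrative helper (not from this patch) that assumes the item has already been copied out of the extent buffer into an ordinary buffer:

static bool uuid_item_has_subid(const __le64 *item, u32 item_size, u64 subid)
{
	u32 i;

	if (!IS_ALIGNED(item_size, sizeof(u64)))	/* malformed item */
		return false;

	for (i = 0; i < item_size / sizeof(u64); i++)
		if (le64_to_cpu(item[i]) == subid)
			return true;

	return false;
}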
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef..00000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __BTRFS_VERSION_H
-#define __BTRFS_VERSION_H
-#define BTRFS_BUILD_VERSION "Btrfs"
-#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
deleted file mode 100644
index 1ca1952fd91..00000000000
--- a/fs/btrfs/version.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-#
-# determine-version -- report a useful version for releases
-#
-# Copyright 2008, Aron Griffis <agriffis@n01se.net>
-# Copyright 2008, Oracle
-# Released under the GNU GPLv2
-
-v="v0.16"
-
-which git &> /dev/null
-if [ $? == 0 ]; then
- git branch >& /dev/null
- if [ $? == 0 ]; then
- if head=`git rev-parse --verify HEAD 2>/dev/null`; then
- if tag=`git describe --tags 2>/dev/null`; then
- v="$tag"
- fi
-
- # Are there uncommitted changes?
- git update-index --refresh --unmerged > /dev/null
- if git diff-index --name-only HEAD | \
- grep -v "^scripts/package" \
- | read dummy; then
- v="$v"-dirty
- fi
- fi
- fi
-fi
-
-echo "#ifndef __BUILD_VERSION" > .build-version.h
-echo "#define __BUILD_VERSION" >> .build-version.h
-echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
-echo "#endif" >> .build-version.h
-
-diff -q version.h .build-version.h >& /dev/null
-
-if [ $? == 0 ]; then
- rm .build-version.h
- exit 0
-fi
-
-mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd13eb81ee4..6cb82f62cb7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,56 +23,86 @@
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/semaphore.h>
#include <asm/div64.h>
-#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
+#include "raid56.h"
#include "async-thread.h"
-
-struct map_lookup {
- u64 type;
- int io_align;
- int io_width;
- int stripe_len;
- int sector_size;
- int num_stripes;
- int sub_stripes;
- struct btrfs_bio_stripe stripes[];
-};
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "math.h"
+#include "dev-replace.h"
+#include "sysfs.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
-
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_bio_stripe) * (n)))
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
-void btrfs_lock_volumes(void)
+static void lock_chunks(struct btrfs_root *root)
{
- mutex_lock(&uuid_mutex);
+ mutex_lock(&root->fs_info->chunk_mutex);
}
-void btrfs_unlock_volumes(void)
+static void unlock_chunks(struct btrfs_root *root)
{
- mutex_unlock(&uuid_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
}
-static void lock_chunks(struct btrfs_root *root)
+static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
- mutex_lock(&root->fs_info->chunk_mutex);
+ struct btrfs_fs_devices *fs_devs;
+
+ fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
+ if (!fs_devs)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&fs_devs->device_list_mutex);
+
+ INIT_LIST_HEAD(&fs_devs->devices);
+ INIT_LIST_HEAD(&fs_devs->alloc_list);
+ INIT_LIST_HEAD(&fs_devs->list);
+
+ return fs_devs;
}
-static void unlock_chunks(struct btrfs_root *root)
+/**
+ * alloc_fs_devices - allocate struct btrfs_fs_devices
+ * @fsid: a pointer to UUID for this FS. If NULL a new UUID is
+ * generated.
+ *
+ * Return: a pointer to a new &struct btrfs_fs_devices on success;
+ * ERR_PTR() on error. Returned struct is not linked onto any lists and
+ * can be destroyed with kfree() right away.
+ */
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
- mutex_unlock(&root->fs_info->chunk_mutex);
+ struct btrfs_fs_devices *fs_devs;
+
+ fs_devs = __alloc_fs_devices();
+ if (IS_ERR(fs_devs))
+ return fs_devs;
+
+ if (fsid)
+ memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+ else
+ generate_random_uuid(fs_devs->fsid);
+
+ return fs_devs;
}
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -83,13 +113,26 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
}
kfree(fs_devices);
}
-int btrfs_cleanup_fs_uuids(void)
+static void btrfs_kobject_uevent(struct block_device *bdev,
+ enum kobject_action action)
+{
+ int ret;
+
+ ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+ if (ret)
+ pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+ action,
+ kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+ &disk_to_dev(bdev->bd_disk)->kobj);
+}
+
+void btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
@@ -99,7 +142,27 @@ int btrfs_cleanup_fs_uuids(void)
list_del(&fs_devices->list);
free_fs_devices(fs_devices);
}
- return 0;
+}
+
+static struct btrfs_device *__alloc_device(void)
+{
+ struct btrfs_device *dev;
+
+ dev = kzalloc(sizeof(*dev), GFP_NOFS);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dev->dev_list);
+ INIT_LIST_HEAD(&dev->dev_alloc_list);
+
+ spin_lock_init(&dev->io_lock);
+
+ spin_lock_init(&dev->reada_lock);
+ atomic_set(&dev->reada_in_flight, 0);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
+ return dev;
}
static noinline struct btrfs_device *__find_device(struct list_head *head,
@@ -127,6 +190,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
return NULL;
}
+static int
+btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
+ int flush, struct block_device **bdev,
+ struct buffer_head **bh)
+{
+ int ret;
+
+ *bdev = blkdev_get_by_path(device_path, flags, holder);
+
+ if (IS_ERR(*bdev)) {
+ ret = PTR_ERR(*bdev);
+ printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
+ goto error;
+ }
+
+ if (flush)
+ filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ ret = set_blocksize(*bdev, 4096);
+ if (ret) {
+ blkdev_put(*bdev, flags);
+ goto error;
+ }
+ invalidate_bdev(*bdev);
+ *bh = btrfs_read_dev_super(*bdev);
+ if (!*bh) {
+ ret = -EINVAL;
+ blkdev_put(*bdev, flags);
+ goto error;
+ }
+
+ return 0;
+
+error:
+ *bdev = NULL;
+ *bh = NULL;
+ return ret;
+}
+
static void requeue_list(struct btrfs_pending_bios *pending_bios,
struct bio *head, struct bio *tail)
{
@@ -152,7 +253,7 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios,
* the list if the block device is congested. This way, multiple devices
* can make progress from a single worker thread.
*/
-static noinline int run_scheduled_bios(struct btrfs_device *device)
+static noinline void run_scheduled_bios(struct btrfs_device *device)
{
struct bio *pending;
struct backing_dev_info *bdi;
@@ -162,22 +263,26 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
struct bio *cur;
int again = 0;
unsigned long num_run;
- unsigned long num_sync_run;
unsigned long batch_run = 0;
unsigned long limit;
unsigned long last_waited = 0;
int force_reg = 0;
+ int sync_pending = 0;
+ struct blk_plug plug;
+
+ /*
+ * this function runs all the bios we've collected for
+ * a particular device. We don't want to wander off to
+ * another device without first sending all of these down.
+ * So, setup a plug here and finish it off before we return
+ */
+ blk_start_plug(&plug);
bdi = blk_get_backing_dev_info(device->bdev);
fs_info = device->dev_root->fs_info;
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
- /* we want to make sure that every time we switch from the sync
- * list to the normal list, we unplug
- */
- num_sync_run = 0;
-
loop:
spin_lock(&device->io_lock);
@@ -223,15 +328,6 @@ loop_lock:
spin_unlock(&device->io_lock);
- /*
- * if we're doing the regular priority list, make sure we unplug
- * for any high prio bios we've sent down
- */
- if (pending_bios == &device->pending_bios && num_sync_run > 0) {
- num_sync_run = 0;
- blk_run_backing_dev(bdi, NULL);
- }
-
while (pending) {
rmb();
@@ -251,27 +347,34 @@ loop_lock:
cur = pending;
pending = pending->bi_next;
cur->bi_next = NULL;
- atomic_dec(&fs_info->nr_async_bios);
- if (atomic_read(&fs_info->nr_async_bios) < limit &&
+ if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
BUG_ON(atomic_read(&cur->bi_cnt) == 0);
- if (cur->bi_rw & REQ_SYNC)
- num_sync_run++;
+ /*
+ * if we're doing the sync list, record that our
+ * plug has some sync requests on it
+ *
+ * If we're doing the regular list and there are
+ * sync requests sitting around, unplug before
+ * we add more
+ */
+ if (pending_bios == &device->pending_sync_bios) {
+ sync_pending = 1;
+ } else if (sync_pending) {
+ blk_finish_plug(&plug);
+ blk_start_plug(&plug);
+ sync_pending = 0;
+ }
- submit_bio(cur->bi_rw, cur);
+ btrfsic_submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
- if (need_resched()) {
- if (num_sync_run) {
- blk_run_backing_dev(bdi, NULL);
- num_sync_run = 0;
- }
+ if (need_resched())
cond_resched();
- }
/*
* we made progress, there is more work to do and the bdi
@@ -304,13 +407,8 @@ loop_lock:
* against it before looping
*/
last_waited = ioc->last_waited;
- if (need_resched()) {
- if (num_sync_run) {
- blk_run_backing_dev(bdi, NULL);
- num_sync_run = 0;
- }
+ if (need_resched())
cond_resched();
- }
continue;
}
spin_lock(&device->io_lock);
@@ -318,27 +416,18 @@ loop_lock:
device->running_pending = 1;
spin_unlock(&device->io_lock);
- btrfs_requeue_work(&device->work);
+ btrfs_queue_work(fs_info->submit_workers,
+ &device->work);
goto done;
}
+ /* unplug every 64 requests just for good measure */
+ if (batch_run % 64 == 0) {
+ blk_finish_plug(&plug);
+ blk_start_plug(&plug);
+ sync_pending = 0;
+ }
}
- if (num_sync_run) {
- num_sync_run = 0;
- blk_run_backing_dev(bdi, NULL);
- }
- /*
- * IO has already been through a long path to get here. Checksumming,
- * async helper threads, perhaps compression. We've done a pretty
- * good job of collecting a batch of IO and should just unplug
- * the device right away.
- *
- * This will help anyone who is waiting on the IO, they might have
- * already unplugged, but managed to do so before the bio they
- * cared about found its way down here.
- */
- blk_run_backing_dev(bdi, NULL);
-
cond_resched();
if (again)
goto loop;
@@ -349,7 +438,7 @@ loop_lock:
spin_unlock(&device->io_lock);
done:
- return 0;
+ blk_finish_plug(&plug);
}
static void pending_bios_fn(struct btrfs_work *work)
@@ -360,27 +449,34 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+/*
+ * Add new device to list of registered devices
+ *
+ * Returns:
+ * 1 - first time device is seen
+ * 0 - device already known
+ * < 0 - error
+ */
static noinline int device_list_add(const char *path,
struct btrfs_super_block *disk_super,
u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
+ struct rcu_string *name;
+ int ret = 0;
u64 found_transid = btrfs_super_generation(disk_super);
- char *name;
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
- fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
- if (!fs_devices)
- return -ENOMEM;
- INIT_LIST_HEAD(&fs_devices->devices);
- INIT_LIST_HEAD(&fs_devices->alloc_list);
+ fs_devices = alloc_fs_devices(disk_super->fsid);
+ if (IS_ERR(fs_devices))
+ return PTR_ERR(fs_devices);
+
list_add(&fs_devices->list, &fs_uuids);
- memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
fs_devices->latest_devid = devid;
fs_devices->latest_trans = found_transid;
- mutex_init(&fs_devices->device_list_mutex);
+
device = NULL;
} else {
device = __find_device(&fs_devices->devices, devid,
@@ -390,35 +486,33 @@ static noinline int device_list_add(const char *path,
if (fs_devices->opened)
return -EBUSY;
- device = kzalloc(sizeof(*device), GFP_NOFS);
- if (!device) {
+ device = btrfs_alloc_device(NULL, &devid,
+ disk_super->dev_item.uuid);
+ if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
- return -ENOMEM;
+ return PTR_ERR(device);
}
- device->devid = devid;
- device->work.func = pending_bios_fn;
- memcpy(device->uuid, disk_super->dev_item.uuid,
- BTRFS_UUID_SIZE);
- spin_lock_init(&device->io_lock);
- device->name = kstrdup(path, GFP_NOFS);
- if (!device->name) {
+
+ name = rcu_string_strdup(path, GFP_NOFS);
+ if (!name) {
kfree(device);
return -ENOMEM;
}
- INIT_LIST_HEAD(&device->dev_alloc_list);
+ rcu_assign_pointer(device->name, name);
mutex_lock(&fs_devices->device_list_mutex);
- list_add(&device->dev_list, &fs_devices->devices);
+ list_add_rcu(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
+ ret = 1;
device->fs_devices = fs_devices;
- fs_devices->num_devices++;
- } else if (!device->name || strcmp(device->name, path)) {
- name = kstrdup(path, GFP_NOFS);
+ } else if (!device->name || strcmp(device->name->str, path)) {
+ name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
return -ENOMEM;
- kfree(device->name);
- device->name = name;
+ rcu_string_free(device->name);
+ rcu_assign_pointer(device->name, name);
if (device->missing) {
fs_devices->missing_devices--;
device->missing = 0;
@@ -430,7 +524,8 @@ static noinline int device_list_add(const char *path,
fs_devices->latest_trans = found_transid;
}
*fs_devices_ret = fs_devices;
- return 0;
+
+ return ret;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -439,60 +534,85 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
struct btrfs_device *device;
struct btrfs_device *orig_dev;
- fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
- if (!fs_devices)
- return ERR_PTR(-ENOMEM);
+ fs_devices = alloc_fs_devices(orig->fsid);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
- INIT_LIST_HEAD(&fs_devices->devices);
- INIT_LIST_HEAD(&fs_devices->alloc_list);
- INIT_LIST_HEAD(&fs_devices->list);
- mutex_init(&fs_devices->device_list_mutex);
fs_devices->latest_devid = orig->latest_devid;
fs_devices->latest_trans = orig->latest_trans;
- memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
+ fs_devices->total_devices = orig->total_devices;
- mutex_lock(&orig->device_list_mutex);
+ /* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
- device = kzalloc(sizeof(*device), GFP_NOFS);
- if (!device)
- goto error;
+ struct rcu_string *name;
- device->name = kstrdup(orig_dev->name, GFP_NOFS);
- if (!device->name) {
- kfree(device);
+ device = btrfs_alloc_device(NULL, &orig_dev->devid,
+ orig_dev->uuid);
+ if (IS_ERR(device))
goto error;
- }
- device->devid = orig_dev->devid;
- device->work.func = pending_bios_fn;
- memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
- spin_lock_init(&device->io_lock);
- INIT_LIST_HEAD(&device->dev_list);
- INIT_LIST_HEAD(&device->dev_alloc_list);
+ /*
+ * This is ok to do without rcu read locked because we hold the
+ * uuid mutex so nothing we touch in here is going to disappear.
+ */
+ if (orig_dev->name) {
+ name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+ }
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
- mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
- mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(-ENOMEM);
}
-int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
+ struct block_device *latest_bdev = NULL;
+ u64 latest_devid = 0;
+ u64 latest_transid = 0;
+
mutex_lock(&uuid_mutex);
again:
- mutex_lock(&fs_devices->device_list_mutex);
+ /* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (device->in_fs_metadata)
+ if (device->in_fs_metadata) {
+ if (!device->is_tgtdev_for_dev_replace &&
+ (!latest_transid ||
+ device->generation > latest_transid)) {
+ latest_devid = device->devid;
+ latest_transid = device->generation;
+ latest_bdev = device->bdev;
+ }
continue;
+ }
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+ /*
+ * In the first step, keep the device which has
+ * the correct fsid and the devid that is used
+ * for the dev_replace procedure.
+ * In the second step, the dev_replace state is
+ * read from the device tree and it is known
+ * whether the procedure is really active or
+ * not, which means whether this device is
+ * used or whether it should be removed.
+ */
+ if (step == 0 || device->is_tgtdev_for_dev_replace) {
+ continue;
+ }
+ }
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
@@ -501,22 +621,48 @@ again:
if (device->writeable) {
list_del_init(&device->dev_alloc_list);
device->writeable = 0;
- fs_devices->rw_devices--;
+ if (!device->is_tgtdev_for_dev_replace)
+ fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
}
- mutex_unlock(&fs_devices->device_list_mutex);
if (fs_devices->seed) {
fs_devices = fs_devices->seed;
goto again;
}
+ fs_devices->latest_bdev = latest_bdev;
+ fs_devices->latest_devid = latest_devid;
+ fs_devices->latest_trans = latest_transid;
+
mutex_unlock(&uuid_mutex);
- return 0;
+}
+
+static void __free_device(struct work_struct *work)
+{
+ struct btrfs_device *device;
+
+ device = container_of(work, struct btrfs_device, rcu_work);
+
+ if (device->bdev)
+ blkdev_put(device->bdev, device->mode);
+
+ rcu_string_free(device->name);
+ kfree(device);
+}
+
+static void free_device(struct rcu_head *head)
+{
+ struct btrfs_device *device;
+
+ device = container_of(head, struct btrfs_device, rcu);
+
+ INIT_WORK(&device->rcu_work, __free_device);
+ schedule_work(&device->rcu_work);
}
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
@@ -526,20 +672,43 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
if (--fs_devices->opened > 0)
return 0;
+ mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (device->bdev) {
- blkdev_put(device->bdev, device->mode);
+ struct btrfs_device *new_device;
+ struct rcu_string *name;
+
+ if (device->bdev)
fs_devices->open_devices--;
- }
- if (device->writeable) {
+
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
- device->bdev = NULL;
- device->writeable = 0;
- device->in_fs_metadata = 0;
+ if (device->can_discard)
+ fs_devices->num_can_discard--;
+ if (device->missing)
+ fs_devices->missing_devices--;
+
+ new_device = btrfs_alloc_device(NULL, &device->devid,
+ device->uuid);
+ BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ if (device->name) {
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(!name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
+ }
+
+ list_replace_rcu(&device->dev_list, &new_device->dev_list);
+ new_device->fs_devices = device->fs_devices;
+
+ call_rcu(&device->rcu, free_device);
}
+ mutex_unlock(&fs_devices->device_list_mutex);
+
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
@@ -567,12 +736,19 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
}
+ /*
+ * Wait for rcu kworkers under __btrfs_close_devices
+ * to finish all blkdev_puts so device is really
+ * free when umount is done.
+ */
+ rcu_barrier();
return ret;
}
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
+ struct request_queue *q;
struct block_device *bdev;
struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
@@ -593,18 +769,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
if (!device->name)
continue;
- bdev = blkdev_get_by_path(device->name, flags, holder);
- if (IS_ERR(bdev)) {
- printk(KERN_INFO "open %s failed\n", device->name);
- goto error;
- }
- set_blocksize(bdev, 4096);
-
- bh = btrfs_read_dev_super(bdev);
- if (!bh) {
- ret = -EINVAL;
- goto error_close;
- }
+ /* Just open everything we can; ignore failures here */
+ if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ &bdev, &bh))
+ continue;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -629,6 +797,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
seeding = 0;
}
+ q = bdev_get_queue(bdev);
+ if (blk_queue_discard(q)) {
+ device->can_discard = 1;
+ fs_devices->num_can_discard++;
+ }
+
device->bdev = bdev;
device->in_fs_metadata = 0;
device->mode = flags;
@@ -637,22 +811,22 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->rotating = 1;
fs_devices->open_devices++;
- if (device->writeable) {
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
fs_devices->rw_devices++;
list_add(&device->dev_alloc_list,
&fs_devices->alloc_list);
}
+ brelse(bh);
continue;
error_brelse:
brelse(bh);
-error_close:
blkdev_put(bdev, flags);
-error:
continue;
}
if (fs_devices->open_devices == 0) {
- ret = -EIO;
+ ret = -EINVAL;
goto out;
}
fs_devices->seeding = seeding;
@@ -681,19 +855,35 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
return ret;
}
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_super_block *disk_super;
struct block_device *bdev;
- struct buffer_head *bh;
- int ret;
+ struct page *page;
+ void *p;
+ int ret = -EINVAL;
u64 devid;
u64 transid;
+ u64 total_devices;
+ u64 bytenr;
+ pgoff_t index;
+ /*
+ * we would like to check all the supers, but that would make
+ * a btrfs mount succeed after a mkfs from a different FS.
+ * So, we need to add a special mount option to scan for
+ * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ */
+ bytenr = btrfs_sb_offset(0);
+ flags |= FMODE_EXCL;
mutex_lock(&uuid_mutex);
- flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev)) {
@@ -701,31 +891,60 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error;
}
- ret = set_blocksize(bdev, 4096);
- if (ret)
- goto error_close;
- bh = btrfs_read_dev_super(bdev);
- if (!bh) {
- ret = -EINVAL;
- goto error_close;
- }
- disk_super = (struct btrfs_super_block *)bh->b_data;
+ /* make sure our super fits in the device */
+ if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+ goto error_bdev_put;
+
+ /* make sure our super fits in the page */
+ if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+ goto error_bdev_put;
+
+ /* make sure our super doesn't straddle pages on disk */
+ index = bytenr >> PAGE_CACHE_SHIFT;
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+ goto error_bdev_put;
+
+ /* pull in the page with our super */
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ index, GFP_NOFS);
+
+ if (IS_ERR_OR_NULL(page))
+ goto error_bdev_put;
+
+ p = kmap(page);
+
+ /* align our pointer to the offset of the super block */
+ disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC)
+ goto error_unmap;
+
devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
- if (disk_super->label[0])
- printk(KERN_INFO "device label %s ", disk_super->label);
- else {
- /* FIXME, make a readl uuid parser */
- printk(KERN_INFO "device fsid %llx-%llx ",
- *(unsigned long long *)disk_super->fsid,
- *(unsigned long long *)(disk_super->fsid + 8));
- }
- printk(KERN_CONT "devid %llu transid %llu %s\n",
- (unsigned long long)devid, (unsigned long long)transid, path);
+ total_devices = btrfs_super_num_devices(disk_super);
+
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+ if (ret > 0) {
+ if (disk_super->label[0]) {
+ if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
+ printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
+ } else {
+ printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
+ }
- brelse(bh);
-error_close:
+ printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
+ ret = 0;
+ }
+ if (!ret && fs_devices_ret)
+ (*fs_devices_ret)->total_devices = total_devices;
+
+error_unmap:
+ kunmap(page);
+ page_cache_release(page);
+
+error_bdev_put:
blkdev_put(bdev, flags);
error:
mutex_unlock(&uuid_mutex);
@@ -747,7 +966,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
*length = 0;
- if (start >= device->total_bytes)
+ if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
return 0;
path = btrfs_alloc_path();
@@ -816,9 +1035,37 @@ out:
return ret;
}
+static int contains_pending_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 *start, u64 len)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+ struct map_lookup *map;
+ int i;
+
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev != device)
+ continue;
+ if (map->stripes[i].physical >= *start + len ||
+ map->stripes[i].physical + em->orig_block_len <=
+ *start)
+ continue;
+ *start = map->stripes[i].physical +
+ em->orig_block_len;
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+
/*
* find_free_dev_extent - find free space in the specified device
- * @trans: transaction handler
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @start: store the start of the free space.
@@ -860,31 +1107,30 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1MB
*/
- search_start = 1024 * 1024;
-
- if (root->fs_info->alloc_start + num_bytes <= search_end)
- search_start = max(root->fs_info->alloc_start, search_start);
+ search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
max_hole_start = search_start;
max_hole_size = 0;
+ hole_size = 0;
- if (search_start >= search_end) {
+ if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
- goto error;
+ goto out;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto error;
- }
path->reada = 2;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
@@ -919,6 +1165,15 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
if (key.offset > search_start) {
hole_size = key.offset - search_start;
+ /*
+ * Have to check before we set max_hole_start, otherwise
+ * we could end up sending back this offset anyway.
+ */
+ if (contains_pending_extent(trans, device,
+ &search_start,
+ hole_size))
+ hole_size = 0;
+
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
@@ -949,12 +1204,24 @@ next:
cond_resched();
}
- hole_size = search_end- search_start;
+ /*
+ * At this point, search_start should be the end of
+ * allocated dev extents, and when shrinking the device,
+ * search_end may be smaller than search_start.
+ */
+ if (search_end > search_start)
+ hole_size = search_end - search_start;
+
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
+ if (contains_pending_extent(trans, device, &search_start, hole_size)) {
+ btrfs_release_path(path);
+ goto again;
+ }
+
/* See above. */
if (hole_size < num_bytes)
ret = -ENOSPC;
@@ -963,7 +1230,6 @@ next:
out:
btrfs_free_path(path);
-error:
*start = max_hole_start;
if (len)
*len = max_hole_size;
@@ -989,39 +1255,52 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
-
+again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
BTRFS_DEV_EXTENT_KEY);
- BUG_ON(ret);
+ if (ret)
+ goto out;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
BUG_ON(found_key.offset > start || found_key.offset +
btrfs_dev_extent_length(leaf, extent) < start);
- ret = 0;
+ key = found_key;
+ btrfs_release_path(path);
+ goto again;
} else if (ret == 0) {
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
+ } else {
+ btrfs_error(root->fs_info, ret, "Slot search failed");
+ goto out;
}
- BUG_ON(ret);
- if (device->bytes_used > 0)
- device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+ if (device->bytes_used > 0) {
+ u64 len = btrfs_dev_extent_length(leaf, extent);
+ device->bytes_used -= len;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += len;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
ret = btrfs_del_item(trans, root, path);
- BUG_ON(ret);
-
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Failed to remove dev extent item");
+ }
+out:
btrfs_free_path(path);
return ret;
}
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset, u64 start, u64 num_bytes)
+static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 start, u64 num_bytes)
{
int ret;
struct btrfs_path *path;
@@ -1031,6 +1310,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
WARN_ON(!device->in_fs_metadata);
+ WARN_ON(device->is_tgtdev_for_dev_replace);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1040,7 +1320,8 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*extent));
- BUG_ON(ret);
+ if (ret)
+ goto out;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1050,67 +1331,42 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
- (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
- BTRFS_UUID_SIZE);
+ btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
btrfs_mark_buffer_dirty(leaf);
+out:
btrfs_free_path(path);
return ret;
}
-static noinline int find_next_chunk(struct btrfs_root *root,
- u64 objectid, u64 *offset)
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
- int ret;
- struct btrfs_key key;
- struct btrfs_chunk *chunk;
- struct btrfs_key found_key;
-
- path = btrfs_alloc_path();
- BUG_ON(!path);
-
- key.objectid = objectid;
- key.offset = (u64)-1;
- key.type = BTRFS_CHUNK_ITEM_KEY;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
-
- BUG_ON(ret == 0);
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct rb_node *n;
+ u64 ret = 0;
- ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
- if (ret) {
- *offset = 0;
- } else {
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
- if (found_key.objectid != objectid)
- *offset = 0;
- else {
- chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_chunk);
- *offset = found_key.offset +
- btrfs_chunk_length(path->nodes[0], chunk);
- }
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ n = rb_last(&em_tree->map);
+ if (n) {
+ em = rb_entry(n, struct extent_map, rb_node);
+ ret = em->start + em->len;
}
- ret = 0;
-error:
- btrfs_free_path(path);
+ read_unlock(&em_tree->lock);
+
return ret;
}
-static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
+static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
+ u64 *devid_ret)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
- root = root->fs_info->chunk_root;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1119,20 +1375,21 @@ static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
- ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
+ ret = btrfs_previous_item(fs_info->chunk_root, path,
+ BTRFS_DEV_ITEMS_OBJECTID,
BTRFS_DEV_ITEM_KEY);
if (ret) {
- *objectid = 1;
+ *devid_ret = 1;
} else {
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- *objectid = found_key.offset + 1;
+ *devid_ret = found_key.offset + 1;
}
ret = 0;
error:
@@ -1144,9 +1401,9 @@ error:
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
-int btrfs_add_device(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_device *device)
+static int btrfs_add_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
@@ -1186,9 +1443,9 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
btrfs_set_device_bandwidth(leaf, dev_item, 0);
btrfs_set_device_start_offset(leaf, dev_item, 0);
- ptr = (unsigned long)btrfs_device_uuid(dev_item);
+ ptr = btrfs_device_uuid(dev_item);
write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
- ptr = (unsigned long)btrfs_device_fsid(dev_item);
+ ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
btrfs_mark_buffer_dirty(leaf);
@@ -1198,6 +1455,22 @@ out:
return ret;
}
+/*
+ * Function to update ctime/mtime for a given device path.
+ * Mainly used for ctime/mtime based probe like libblkid.
+ */
+static void update_dev_time(char *path_name)
+{
+ struct file *filp;
+
+ filp = filp_open(path_name, O_RDWR, 0);
+ if (!filp)
+ return;
+ file_update_time(filp);
+ filp_close(filp, NULL);
+ return;
+}
+
static int btrfs_rm_dev_item(struct btrfs_root *root,
struct btrfs_device *device)
{
@@ -1248,32 +1521,51 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
struct block_device *bdev;
struct buffer_head *bh = NULL;
struct btrfs_super_block *disk_super;
+ struct btrfs_fs_devices *cur_devices;
u64 all_avail;
u64 devid;
u64 num_devices;
u8 *dev_uuid;
+ unsigned seq;
int ret = 0;
+ bool clear_super = false;
mutex_lock(&uuid_mutex);
- mutex_lock(&root->fs_info->volume_mutex);
- all_avail = root->fs_info->avail_data_alloc_bits |
- root->fs_info->avail_system_alloc_bits |
- root->fs_info->avail_metadata_alloc_bits;
+ do {
+ seq = read_seqbegin(&root->fs_info->profiles_lock);
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
- root->fs_info->fs_devices->num_devices <= 4) {
- printk(KERN_ERR "btrfs: unable to go below four devices "
- "on raid10\n");
- ret = -EINVAL;
+ all_avail = root->fs_info->avail_data_alloc_bits |
+ root->fs_info->avail_system_alloc_bits |
+ root->fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&root->fs_info->profiles_lock, seq));
+
+ num_devices = root->fs_info->fs_devices->num_devices;
+ btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+ WARN_ON(num_devices < 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
+ ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
goto out;
}
- if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
- root->fs_info->fs_devices->num_devices <= 2) {
- printk(KERN_ERR "btrfs: unable to go below two "
- "devices on raid1\n");
- ret = -EINVAL;
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
+ ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+ goto out;
+ }
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+ root->fs_info->fs_devices->rw_devices <= 2) {
+ ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
+ goto out;
+ }
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+ root->fs_info->fs_devices->rw_devices <= 3) {
+ ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
goto out;
}
@@ -1283,40 +1575,36 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
device = NULL;
devices = &root->fs_info->fs_devices->devices;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ /*
+ * It is safe to read the devices since the volume_mutex
+ * is held.
+ */
list_for_each_entry(tmp, devices, dev_list) {
- if (tmp->in_fs_metadata && !tmp->bdev) {
+ if (tmp->in_fs_metadata &&
+ !tmp->is_tgtdev_for_dev_replace &&
+ !tmp->bdev) {
device = tmp;
break;
}
}
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
bdev = NULL;
bh = NULL;
disk_super = NULL;
if (!device) {
- printk(KERN_ERR "btrfs: no missing devices found to "
- "remove\n");
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
goto out;
}
} else {
- bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
- root->fs_info->bdev_holder);
- if (IS_ERR(bdev)) {
- ret = PTR_ERR(bdev);
+ ret = btrfs_get_bdev_and_sb(device_path,
+ FMODE_WRITE | FMODE_EXCL,
+ root->fs_info->bdev_holder, 0,
+ &bdev, &bh);
+ if (ret)
goto out;
- }
-
- set_blocksize(bdev, 4096);
- bh = btrfs_read_dev_super(bdev);
- if (!bh) {
- ret = -EINVAL;
- goto error_close;
- }
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
- device = btrfs_find_device(root, devid, dev_uuid,
+ device = btrfs_find_device(root->fs_info, devid, dev_uuid,
disk_super->fsid);
if (!device) {
ret = -ENOENT;
@@ -1324,38 +1612,63 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
}
+ if (device->is_tgtdev_for_dev_replace) {
+ ret = BTRFS_ERROR_DEV_TGT_REPLACE;
+ goto error_brelse;
+ }
+
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
- printk(KERN_ERR "btrfs: unable to remove the only writeable "
- "device\n");
- ret = -EINVAL;
+ ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto error_brelse;
}
if (device->writeable) {
+ lock_chunks(root);
list_del_init(&device->dev_alloc_list);
+ unlock_chunks(root);
root->fs_info->fs_devices->rw_devices--;
+ clear_super = true;
}
+ mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
+ mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
+ /*
+ * TODO: the superblock still includes this device in its num_devices
+ * counter although write_all_supers() is not locked out. This
+ * could give a filesystem state which requires a degraded mount.
+ */
ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
if (ret)
goto error_undo;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space = device->total_bytes -
+ device->bytes_used;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
device->in_fs_metadata = 0;
+ btrfs_scrub_cancel_dev(root->fs_info, device);
/*
* the device list mutex makes sure that we don't change
* the device list while someone else is writing out all
- * the device supers.
+ * the device supers. Whoever is writing all supers should
+ * lock the device list mutex before getting the number of
+ * devices in the super block (super_copy). Conversely,
+ * whoever updates the number of devices in the super block
+ * (super_copy) should hold the device list mutex.
*/
+
+ cur_devices = device->fs_devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- list_del_init(&device->dev_list);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ list_del_rcu(&device->dev_list);
device->fs_devices->num_devices--;
+ device->fs_devices->total_devices--;
if (device->missing)
root->fs_info->fs_devices->missing_devices--;
@@ -1368,73 +1681,229 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
if (device->bdev) {
- blkdev_put(device->bdev, device->mode);
- device->bdev = NULL;
device->fs_devices->open_devices--;
+ /* remove sysfs entry */
+ btrfs_kobj_rm_device(root->fs_info, device);
}
- num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
- btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+ call_rcu(&device->rcu, free_device);
+
+ num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+ btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- if (device->fs_devices->open_devices == 0) {
+ if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
fs_devices = root->fs_info->fs_devices;
while (fs_devices) {
- if (fs_devices->seed == device->fs_devices)
+ if (fs_devices->seed == cur_devices) {
+ fs_devices->seed = cur_devices->seed;
break;
+ }
fs_devices = fs_devices->seed;
}
- fs_devices->seed = device->fs_devices->seed;
- device->fs_devices->seed = NULL;
- __btrfs_close_devices(device->fs_devices);
- free_fs_devices(device->fs_devices);
+ cur_devices->seed = NULL;
+ lock_chunks(root);
+ __btrfs_close_devices(cur_devices);
+ unlock_chunks(root);
+ free_fs_devices(cur_devices);
}
+ root->fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+
/*
* at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super
*/
- if (device->writeable) {
+ if (clear_super && disk_super) {
+ u64 bytenr;
+ int i;
+
/* make sure this device isn't detected as part of
* the FS anymore
*/
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
+
+ /* clear the mirror copies of the superblock on the disk
+ * being removed; the 0th copy has been taken care of above
+ * and the loop below handles the rest
+ */
+ for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ i_size_read(bdev->bd_inode))
+ break;
+
+ brelse(bh);
+ bh = __bread(bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!bh)
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ continue;
+ }
+ memset(&disk_super->magic, 0,
+ sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ }
}
- kfree(device->name);
- kfree(device);
ret = 0;
+ if (bdev) {
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime of the device path for libblkid */
+ update_dev_time(device_path);
+ }
+
error_brelse:
brelse(bh);
-error_close:
if (bdev)
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
- mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
return ret;
error_undo:
if (device->writeable) {
+ lock_chunks(root);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
+ unlock_chunks(root);
root->fs_info->fs_devices->rw_devices++;
}
goto error_brelse;
}
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev)
+{
+ WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+
+ list_del_rcu(&srcdev->dev_list);
+ list_del_rcu(&srcdev->dev_alloc_list);
+ fs_info->fs_devices->num_devices--;
+ if (srcdev->missing) {
+ fs_info->fs_devices->missing_devices--;
+ fs_info->fs_devices->rw_devices++;
+ }
+ if (srcdev->can_discard)
+ fs_info->fs_devices->num_can_discard--;
+ if (srcdev->bdev) {
+ fs_info->fs_devices->open_devices--;
+
+ /* zero out the old super */
+ btrfs_scratch_superblock(srcdev);
+ }
+
+ call_rcu(&srcdev->rcu, free_device);
+}
+
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev)
+{
+ struct btrfs_device *next_device;
+
+ WARN_ON(!tgtdev);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ if (tgtdev->bdev) {
+ btrfs_scratch_superblock(tgtdev);
+ fs_info->fs_devices->open_devices--;
+ }
+ fs_info->fs_devices->num_devices--;
+ if (tgtdev->can_discard)
+ fs_info->fs_devices->num_can_discard++;
+
+ next_device = list_entry(fs_info->fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ if (tgtdev->bdev == fs_info->sb->s_bdev)
+ fs_info->sb->s_bdev = next_device->bdev;
+ if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
+ fs_info->fs_devices->latest_bdev = next_device->bdev;
+ list_del_rcu(&tgtdev->dev_list);
+
+ call_rcu(&tgtdev->rcu, free_device);
+
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+}
+
+static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+ struct btrfs_device **device)
+{
+ int ret = 0;
+ struct btrfs_super_block *disk_super;
+ u64 devid;
+ u8 *dev_uuid;
+ struct block_device *bdev;
+ struct buffer_head *bh;
+
+ *device = NULL;
+ ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
+ root->fs_info->bdev_holder, 0, &bdev, &bh);
+ if (ret)
+ return ret;
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_uuid = disk_super->dev_item.uuid;
+ *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+ disk_super->fsid);
+ brelse(bh);
+ if (!*device)
+ ret = -ENOENT;
+ blkdev_put(bdev, FMODE_READ);
+ return ret;
+}
+
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+ char *device_path,
+ struct btrfs_device **device)
+{
+ *device = NULL;
+ if (strcmp(device_path, "missing") == 0) {
+ struct list_head *devices;
+ struct btrfs_device *tmp;
+
+ devices = &root->fs_info->fs_devices->devices;
+ /*
+ * It is safe to read the devices since the volume_mutex
+ * is held by the caller.
+ */
+ list_for_each_entry(tmp, devices, dev_list) {
+ if (tmp->in_fs_metadata && !tmp->bdev) {
+ *device = tmp;
+ break;
+ }
+ }
+
+ if (!*device) {
+ btrfs_err(root->fs_info, "no missing device found");
+ return -ENOENT;
+ }
+
+ return 0;
+ } else {
+ return btrfs_find_device_by_path(root, device_path, device);
+ }
+}
+
/*
* does all the dirty work required for changing file system's UUID.
*/
-static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int btrfs_prepare_sprout(struct btrfs_root *root)
{
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
@@ -1442,9 +1911,9 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
if (!fs_devices->seeding)
return -EINVAL;
- seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
- if (!seed_devices)
- return -ENOMEM;
+ seed_devices = __alloc_fs_devices();
+ if (IS_ERR(seed_devices))
+ return PTR_ERR(seed_devices);
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
@@ -1459,7 +1928,11 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&seed_devices->devices);
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
- list_splice_init(&fs_devices->devices, &seed_devices->devices);
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
+ synchronize_rcu);
+
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
list_for_each_entry(device, &seed_devices->devices, dev_list) {
device->fs_devices = seed_devices;
@@ -1473,6 +1946,8 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
generate_random_uuid(fs_devices->fsid);
memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
btrfs_set_super_flags(disk_super, super_flags);
@@ -1520,7 +1995,7 @@ next_slot:
goto error;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
continue;
}
@@ -1532,14 +2007,13 @@ next_slot:
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
devid = btrfs_device_id(leaf, dev_item);
- read_extent_buffer(leaf, dev_uuid,
- (unsigned long)btrfs_device_uuid(dev_item),
+ read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
- read_extent_buffer(leaf, fs_uuid,
- (unsigned long)btrfs_device_fsid(dev_item),
+ read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
- device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
- BUG_ON(!device);
+ device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+ fs_uuid);
+ BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
@@ -1558,19 +2032,21 @@ error:
int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
{
+ struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
struct list_head *devices;
struct super_block *sb = root->fs_info->sb;
+ struct rcu_string *name;
u64 total_bytes;
int seeding_dev = 0;
int ret = 0;
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
- return -EINVAL;
+ return -EROFS;
- bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
@@ -1582,44 +2058,38 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
- mutex_lock(&root->fs_info->volume_mutex);
devices = &root->fs_info->fs_devices->devices;
- /*
- * we have the volume lock, so we don't need the extra
- * device list mutex while reading the list here.
- */
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
+ mutex_unlock(
+ &root->fs_info->fs_devices->device_list_mutex);
goto error;
}
}
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- device = kzalloc(sizeof(*device), GFP_NOFS);
- if (!device) {
+ device = btrfs_alloc_device(root->fs_info, NULL, NULL);
+ if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
- ret = -ENOMEM;
+ ret = PTR_ERR(device);
goto error;
}
- device->name = kstrdup(device_path, GFP_NOFS);
- if (!device->name) {
+ name = rcu_string_strdup(device_path, GFP_NOFS);
+ if (!name) {
kfree(device);
ret = -ENOMEM;
goto error;
}
-
- ret = find_next_devid(root, &device->devid);
- if (ret) {
- kfree(device->name);
- kfree(device);
- goto error;
- }
+ rcu_assign_pointer(device->name, name);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
ret = PTR_ERR(trans);
goto error;
@@ -1627,10 +2097,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
lock_chunks(root);
+ q = bdev_get_queue(bdev);
+ if (blk_queue_discard(q))
+ device->can_discard = 1;
device->writeable = 1;
- device->work.func = pending_bios_fn;
- generate_random_uuid(device->uuid);
- spin_lock_init(&device->io_lock);
device->generation = trans->transid;
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
@@ -1640,49 +2110,77 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
+ device->is_tgtdev_for_dev_replace = 0;
device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
set_blocksize(device->bdev, 4096);
if (seeding_dev) {
sb->s_flags &= ~MS_RDONLY;
- ret = btrfs_prepare_sprout(trans, root);
- BUG_ON(ret);
+ ret = btrfs_prepare_sprout(root);
+ BUG_ON(ret); /* -ENOMEM */
}
device->fs_devices = root->fs_info->fs_devices;
- /*
- * we don't want write_supers to jump in here with our device
- * half setup
- */
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+ list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
root->fs_info->fs_devices->num_devices++;
root->fs_info->fs_devices->open_devices++;
root->fs_info->fs_devices->rw_devices++;
+ root->fs_info->fs_devices->total_devices++;
+ if (device->can_discard)
+ root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
- total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
- btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+ total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ btrfs_set_super_total_bytes(root->fs_info->super_copy,
total_bytes + device->total_bytes);
- total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
- btrfs_set_super_num_devices(&root->fs_info->super_copy,
+ total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
+ btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
+
+ /* add sysfs device entry */
+ btrfs_kobj_add_device(root->fs_info, device);
+
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
ret = init_first_rw_device(trans, root, device);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
ret = btrfs_finish_sprout(trans, root);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
+
+ /* Sprouting changes the fsid of the mounted root,
+ * so rename the fsid entry in sysfs
+ */
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
+ root->fs_info->fsid);
+ if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
+ goto error_trans;
} else {
ret = btrfs_add_device(trans, root, device);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
}
/*
@@ -1692,25 +2190,139 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root);
- btrfs_commit_transaction(trans, root);
+ root->fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+ ret = btrfs_commit_transaction(trans, root);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
+ if (ret) /* transaction commit */
+ return ret;
+
ret = btrfs_relocate_sys_chunks(root);
- BUG_ON(ret);
+ if (ret < 0)
+ btrfs_error(root->fs_info, ret,
+ "Failed to relocate sys chunks after "
+ "device initialization. This can be fixed "
+ "using the \"btrfs balance\" command.");
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) == -ENOENT)
+ return 0;
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
}
-out:
- mutex_unlock(&root->fs_info->volume_mutex);
+
+ /* Update ctime/mtime for libblkid */
+ update_dev_time(device_path);
return ret;
+
+error_trans:
+ unlock_chunks(root);
+ btrfs_end_transaction(trans, root);
+ rcu_string_free(device->name);
+ btrfs_kobj_rm_device(root->fs_info, device);
+ kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
- goto out;
+ return ret;
+}
+
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device **device_out)
+{
+ struct request_queue *q;
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head *devices;
+ struct rcu_string *name;
+ u64 devid = BTRFS_DEV_REPLACE_DEVID;
+ int ret = 0;
+
+ *device_out = NULL;
+ if (fs_info->fs_devices->seeding)
+ return -EINVAL;
+
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+ fs_info->bdev_holder);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
+ if (device->bdev == bdev) {
+ ret = -EEXIST;
+ goto error;
+ }
+ }
+
+ device = btrfs_alloc_device(NULL, &devid, NULL);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ name = rcu_string_strdup(device_path, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+
+ q = bdev_get_queue(bdev);
+ if (blk_queue_discard(q))
+ device->can_discard = 1;
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ device->writeable = 1;
+ device->generation = 0;
+ device->io_width = root->sectorsize;
+ device->io_align = root->sectorsize;
+ device->sector_size = root->sectorsize;
+ device->total_bytes = i_size_read(bdev->bd_inode);
+ device->disk_total_bytes = device->total_bytes;
+ device->dev_root = fs_info->dev_root;
+ device->bdev = bdev;
+ device->in_fs_metadata = 1;
+ device->is_tgtdev_for_dev_replace = 1;
+ device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
+ set_blocksize(device->bdev, 4096);
+ device->fs_devices = fs_info->fs_devices;
+ list_add(&device->dev_list, &fs_info->fs_devices->devices);
+ fs_info->fs_devices->num_devices++;
+ fs_info->fs_devices->open_devices++;
+ if (device->can_discard)
+ fs_info->fs_devices->num_can_discard++;
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ *device_out = device;
+ return ret;
+
+error:
+ blkdev_put(bdev, FMODE_EXCL);
+ return ret;
+}
+
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev)
+{
+ WARN_ON(fs_info->fs_devices->rw_devices == 0);
+ tgtdev->io_width = fs_info->dev_root->sectorsize;
+ tgtdev->io_align = fs_info->dev_root->sectorsize;
+ tgtdev->sector_size = fs_info->dev_root->sectorsize;
+ tgtdev->dev_root = fs_info->dev_root;
+ tgtdev->in_fs_metadata = 1;
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -1763,13 +2375,14 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
- &device->dev_root->fs_info->super_copy;
+ device->dev_root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 diff = new_size - device->total_bytes;
if (!device->writeable)
return -EACCES;
- if (new_size <= device->total_bytes)
+ if (new_size <= device->total_bytes ||
+ device->is_tgtdev_for_dev_replace)
return -EINVAL;
btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -1811,19 +2424,28 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
key.type = BTRFS_CHUNK_ITEM_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- BUG_ON(ret);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) { /* Logic error or corruption */
+ btrfs_error(root->fs_info, -ENOENT,
+ "Failed lookup while freeing chunk.");
+ ret = -ENOENT;
+ goto out;
+ }
ret = btrfs_del_item(trans, root, path);
- BUG_ON(ret);
-
+ if (ret < 0)
+ btrfs_error(root->fs_info, ret,
+ "Failed to delete chunk item.");
+out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
chunk_offset)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
@@ -1892,7 +2514,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
return ret;
trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
lock_chunks(root);
@@ -1904,7 +2530,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
- BUG_ON(em->start > chunk_offset ||
+ BUG_ON(!em || em->start > chunk_offset ||
em->start + em->len < chunk_offset);
map = (struct map_lookup *)em->bdev;
@@ -1923,6 +2549,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
BUG_ON(ret);
+ trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
+
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
BUG_ON(ret);
@@ -1935,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- kfree(map);
- em->bdev = NULL;
-
/* once for the tree */
free_extent_map(em);
/* once for us */
@@ -1975,7 +2600,7 @@ again:
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0);
+ BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
@@ -1990,7 +2615,7 @@ again:
chunk = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
- btrfs_release_path(chunk_root, path);
+ btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
@@ -2011,8 +2636,7 @@ again:
failed = 0;
retried = true;
goto again;
- } else if (failed && retried) {
- WARN_ON(1);
+ } else if (WARN_ON(failed && retried)) {
ret = -ENOSPC;
}
error:
@@ -2020,44 +2644,399 @@ error:
return ret;
}
-static u64 div_factor(u64 num, int factor)
+static int insert_balance_item(struct btrfs_root *root,
+ struct btrfs_balance_control *bctl)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*item));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+ btrfs_set_balance_data(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+ btrfs_set_balance_meta(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+ btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+ btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
{
- if (factor == 10)
- return num;
- num *= factor;
- do_div(num, 10);
- return num;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
}
-int btrfs_balance(struct btrfs_root *dev_root)
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
{
- int ret;
- struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+ /*
+ * Turn on soft mode for chunk types that were being converted.
+ */
+ if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+ /*
+ * Turn on the usage filter if it is not already in use. The idea is
+ * that chunks that we have already balanced should be
+ * reasonably full. Don't do it for chunks that are being
+ * converted - that will keep us from relocating unconverted
+ * (albeit full) chunks.
+ */
+ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->data.usage = 90;
+ }
+ if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->sys.usage = 90;
+ }
+ if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->meta.usage = 90;
+ }
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper. Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+ BUG_ON(fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ BUG_ON(!fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = NULL;
+ spin_unlock(&fs_info->balance_lock);
+
+ kfree(bctl);
+}
+
+/*
+ * Balance filters. Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_type,
+ struct btrfs_balance_args *bargs)
+{
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (bargs->profiles & chunk_type)
+ return 0;
+
+ return 1;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 chunk_used, user_thresh;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = btrfs_block_group_used(&cache->item);
+
+ if (bargs->usage == 0)
+ user_thresh = 1;
+ else if (bargs->usage > 100)
+ user_thresh = cache->key.offset;
+ else
+ user_thresh = div_factor_fine(cache->key.offset,
+ bargs->usage);
+
+ if (chunk_used < user_thresh)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
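chunk_usage_filter() keeps a chunk for balancing only while its used bytes stay below a user-given percentage of the chunk size; div_factor_fine() is assumed here to compute num * factor / 100, which is not shown in this patch. A self-contained sketch of the threshold arithmetic, with hypothetical names:

#include <stdint.h>
#include <stdio.h>

/* Assumed behaviour of div_factor_fine(): num * factor / 100. */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
        return num * factor / 100;
}

/* Return 1 if the chunk is filtered out (skipped), 0 if it gets balanced. */
static int usage_filter(uint64_t chunk_size, uint64_t chunk_used, int usage)
{
        uint64_t thresh;

        if (usage == 0)
                thresh = 1;                  /* only completely empty chunks pass */
        else if (usage > 100)
                thresh = chunk_size;         /* anything not full passes */
        else
                thresh = div_factor_fine(chunk_size, usage);

        return chunk_used < thresh ? 0 : 1;
}

int main(void)
{
        /* 1 GiB chunk with 200 MiB used, usage=30: 200 MiB < ~307 MiB -> prints 0 */
        printf("%d\n", usage_filter(1ULL << 30, 200ULL << 20, 30));
        return 0;
}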
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ int i;
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ u64 stripe_offset;
+ u64 stripe_length;
+ int factor;
+ int i;
+
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+ return 0;
+
+ if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+ factor = num_stripes / 2;
+ } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+ factor = num_stripes - 1;
+ } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+ factor = num_stripes - 2;
+ } else {
+ factor = num_stripes;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+ continue;
+
+ stripe_offset = btrfs_stripe_offset(leaf, stripe);
+ stripe_length = btrfs_chunk_length(leaf, chunk);
+ do_div(stripe_length, factor);
+
+ if (stripe_offset < bargs->pend &&
+ stripe_offset + stripe_length > bargs->pstart)
+ return 0;
+ }
+
+ return 1;
+}
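The drange filter needs the on-device length of each stripe, which is the chunk length divided by the number of stripes that carry distinct data: half of them for DUP/RAID1/RAID10, num_stripes - 1 for RAID5, num_stripes - 2 for RAID6, and all of them otherwise. A standalone sketch of that divisor choice, using made-up flag values as assumptions:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the block group profile bits (assumed values). */
#define BG_DUP    (1u << 0)
#define BG_RAID1  (1u << 1)
#define BG_RAID10 (1u << 2)
#define BG_RAID5  (1u << 3)
#define BG_RAID6  (1u << 4)

/* Divisor turning the chunk length into a per-device stripe length. */
static int stripe_factor(unsigned int type, int num_stripes)
{
        if (type & (BG_DUP | BG_RAID1 | BG_RAID10))
                return num_stripes / 2;
        if (type & BG_RAID5)
                return num_stripes - 1;
        if (type & BG_RAID6)
                return num_stripes - 2;
        return num_stripes;
}

int main(void)
{
        /* 2 GiB RAID10 chunk over 4 stripes: each device holds 1 GiB of it */
        uint64_t chunk_len = 2ULL << 30;
        int factor = stripe_factor(BG_RAID10, 4);

        printf("per-device stripe length: %llu\n",
               (unsigned long long)(chunk_len / factor));
        return 0;
}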
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ if (chunk_offset < bargs->vend &&
+ chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+ /* at least part of the chunk is inside this vrange */
+ return 0;
+
+ return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_type,
+ struct btrfs_balance_args *bargs)
+{
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return 0;
+
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (bargs->target == chunk_type)
+ return 1;
+
+ return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ struct btrfs_balance_args *bargs = NULL;
+ u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+ /* type filter */
+ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+ (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+ return 0;
+ }
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ bargs = &bctl->data;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ bargs = &bctl->sys;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ bargs = &bctl->meta;
+
+ /* profiles filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+ chunk_profiles_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ /* usage filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+ chunk_devid_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
+ /* drange filter, makes sense only with devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+ chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* vrange filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+ chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* soft profile changing mode */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+ chunk_soft_convert_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ /*
+ * limited by count, must be the last filter
+ */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
+ if (bargs->limit == 0)
+ return 0;
+ else
+ bargs->limit--;
+ }
+
+ return 1;
+}
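should_balance_chunk() applies the filters in a fixed order and deliberately keeps the limit counter last: the limit consumes budget, so it must only be charged for chunks that every other filter has already accepted. A small sketch of that ordering rule, with a hypothetical stand-in for the other filters:

#include <stdint.h>
#include <stdio.h>

struct limit_args {
        uint64_t limit;         /* remaining balance budget */
};

/* Stand-in for the profiles/usage/devid/drange/vrange/soft filters. */
static int passes_other_filters(uint64_t chunk_offset)
{
        return (chunk_offset % 2) == 0;
}

/* Return 1 if the chunk should be balanced; the limit check must come last. */
static int should_balance(struct limit_args *args, uint64_t chunk_offset)
{
        if (!passes_other_filters(chunk_offset))
                return 0;

        if (args->limit == 0)
                return 0;
        args->limit--;          /* budget is spent only on accepted chunks */
        return 1;
}

int main(void)
{
        struct limit_args args = { .limit = 2 };
        uint64_t off;

        for (off = 0; off < 8; off++)
                printf("chunk %llu: %d\n", (unsigned long long)off,
                       should_balance(&args, off));
        return 0;
}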
+
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct list_head *devices;
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
+ struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
- struct btrfs_trans_handle *trans;
struct btrfs_key found_key;
-
- if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&dev_root->fs_info->volume_mutex);
- dev_root = dev_root->fs_info->dev_root;
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ int slot;
+ int ret;
+ int enospc_errors = 0;
+ bool counting = true;
+ u64 limit_data = bctl->data.limit;
+ u64 limit_meta = bctl->meta.limit;
+ u64 limit_sys = bctl->sys.limit;
/* step one make some room on all the devices */
+ devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
old_size = device->total_bytes;
size_to_free = div_factor(old_size, 1);
size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
if (!device->writeable ||
- device->total_bytes - device->bytes_used > size_to_free)
+ device->total_bytes - device->bytes_used > size_to_free ||
+ device->is_tgtdev_for_dev_replace)
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2076,13 +3055,32 @@ int btrfs_balance(struct btrfs_root *dev_root)
/* step two, relocate all the chunks */
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ /* zero out stat counters */
+ spin_lock(&fs_info->balance_lock);
+ memset(&bctl->stat, 0, sizeof(bctl->stat));
+ spin_unlock(&fs_info->balance_lock);
+again:
+ if (!counting) {
+ bctl->data.limit = limit_data;
+ bctl->meta.limit = limit_meta;
+ bctl->sys.limit = limit_sys;
+ }
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
+ if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -ECANCELED;
+ goto error;
+ }
+
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
@@ -2092,37 +3090,736 @@ int btrfs_balance(struct btrfs_root *dev_root)
* failed
*/
if (ret == 0)
- break;
+ BUG(); /* FIXME break ? */
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
- if (ret)
+ if (ret) {
+ ret = 0;
break;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
if (found_key.objectid != key.objectid)
break;
- /* chunk zero is special */
- if (found_key.offset == 0)
- break;
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+ if (!counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.considered++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ ret = should_balance_chunk(chunk_root, leaf, chunk,
+ found_key.offset);
+ btrfs_release_path(path);
+ if (!ret)
+ goto loop;
+
+ if (counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.expected++;
+ spin_unlock(&fs_info->balance_lock);
+ goto loop;
+ }
- btrfs_release_path(chunk_root, path);
ret = btrfs_relocate_chunk(chunk_root,
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
- BUG_ON(ret && ret != -ENOSPC);
+ if (ret && ret != -ENOSPC)
+ goto error;
+ if (ret == -ENOSPC) {
+ enospc_errors++;
+ } else {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.completed++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+loop:
+ if (found_key.offset == 0)
+ break;
key.offset = found_key.offset - 1;
}
- ret = 0;
+
+ if (counting) {
+ btrfs_release_path(path);
+ counting = false;
+ goto again;
+ }
error:
btrfs_free_path(path);
- mutex_unlock(&dev_root->fs_info->volume_mutex);
+ if (enospc_errors) {
+ btrfs_info(fs_info, "%d enospc errors during balance",
+ enospc_errors);
+ if (!ret)
+ ret = -ENOSPC;
+ }
+
+ return ret;
+}
+
+/**
+ * alloc_profile_is_valid - see if a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static int alloc_profile_is_valid(u64 flags, int extended)
+{
+ u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
+ BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ /* 1) check that all other bits are zeroed */
+ if (flags & ~mask)
+ return 0;
+
+ /* 2) see if profile is reduced */
+ if (flags == 0)
+ return !extended; /* "0" is valid for usual profiles */
+
+ /* true if exactly one bit set */
+ return (flags & (flags - 1)) == 0;
+}
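The final test in alloc_profile_is_valid() is the usual single-bit check: clearing the lowest set bit of a power of two leaves zero, while any value with two or more bits set leaves a remainder. A tiny sketch, independent of the actual btrfs flag values (the flags == 0 case is already handled by the earlier return):

#include <stdint.h>
#include <stdio.h>

/* True when exactly one bit of x is set, i.e. x is a non-zero power of two. */
static int exactly_one_bit(uint64_t x)
{
        return x != 0 && (x & (x - 1)) == 0;
}

int main(void)
{
        printf("%d %d %d\n",
               exactly_one_bit(0x8),        /* 1: single profile bit  */
               exactly_one_bit(0x8 | 0x2),  /* 0: profile not reduced */
               exactly_one_bit(0));         /* 0                      */
        return 0;
}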
+
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+ /* cancel requested || normal exit path */
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ (atomic_read(&fs_info->balance_pause_req) == 0 &&
+ atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ unset_balance_control(fs_info);
+ ret = del_balance_item(fs_info->tree_root);
+ if (ret)
+ btrfs_std_error(fs_info, ret);
+
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+}
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+ u64 allowed;
+ int mixed = 0;
+ int ret;
+ u64 num_devices;
+ unsigned seq;
+
+ if (btrfs_fs_closing(fs_info) ||
+ atomic_read(&fs_info->balance_pause_req) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+ if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
+ /*
+ * In case of mixed groups both data and meta should be picked,
+ * and identical options should be given for both of them.
+ */
+ allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
+ if (mixed && (bctl->flags & allowed)) {
+ if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+ !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+ memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+ btrfs_err(fs_info, "with mixed groups data and "
+ "metadata balance options must be the same");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ num_devices = fs_info->fs_devices->num_devices;
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+ BUG_ON(num_devices < 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ if (num_devices == 1)
+ allowed |= BTRFS_BLOCK_GROUP_DUP;
+ else if (num_devices > 1)
+ allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+ if (num_devices > 2)
+ allowed |= BTRFS_BLOCK_GROUP_RAID5;
+ if (num_devices > 3)
+ allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID6);
+ if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->data.target, 1) ||
+ (bctl->data.target & ~allowed))) {
+ btrfs_err(fs_info, "unable to start balance with target "
+ "data profile %llu",
+ bctl->data.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->meta.target, 1) ||
+ (bctl->meta.target & ~allowed))) {
+ btrfs_err(fs_info,
+ "unable to start balance with target metadata profile %llu",
+ bctl->meta.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->sys.target, 1) ||
+ (bctl->sys.target & ~allowed))) {
+ btrfs_err(fs_info,
+ "unable to start balance with target system profile %llu",
+ bctl->sys.target);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* allow dup'ed data chunks only in mixed mode */
+ if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
+ btrfs_err(fs_info, "dup for data is not allowed");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* allow reducing metadata or system integrity only if force is set */
+ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6;
+ do {
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_system_alloc_bits & allowed) &&
+ !(bctl->sys.target & allowed)) ||
+ ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_metadata_alloc_bits & allowed) &&
+ !(bctl->meta.target & allowed))) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ btrfs_info(fs_info, "force reducing metadata integrity");
+ } else {
+ btrfs_err(fs_info, "balance will reduce metadata "
+ "integrity, use force if you want this");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ int num_tolerated_disk_barrier_failures;
+ u64 target = bctl->sys.target;
+
+ num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ if (num_tolerated_disk_barrier_failures > 0 &&
+ (target &
+ (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
+ num_tolerated_disk_barrier_failures = 0;
+ else if (num_tolerated_disk_barrier_failures > 1 &&
+ (target &
+ (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
+ num_tolerated_disk_barrier_failures = 1;
+
+ fs_info->num_tolerated_disk_barrier_failures =
+ num_tolerated_disk_barrier_failures;
+ }
+
+ ret = insert_balance_item(fs_info->tree_root, bctl);
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+ BUG_ON(ret == -EEXIST);
+ set_balance_control(bctl);
+ } else {
+ BUG_ON(ret != -EEXIST);
+ spin_lock(&fs_info->balance_lock);
+ update_balance_args(bctl);
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ atomic_inc(&fs_info->balance_running);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ ret = __btrfs_balance(fs_info);
+
+ mutex_lock(&fs_info->balance_mutex);
+ atomic_dec(&fs_info->balance_running);
+
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ }
+
+ if (bargs) {
+ memset(bargs, 0, sizeof(*bargs));
+ update_ioctl_balance_args(fs_info, 0, bargs);
+ }
+
+ if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+ balance_need_close(fs_info)) {
+ __cancel_balance(fs_info);
+ }
+
+ wake_up(&fs_info->balance_wait_q);
+
+ return ret;
+out:
+ if (bctl->flags & BTRFS_BALANCE_RESUME)
+ __cancel_balance(fs_info);
+ else {
+ kfree(bctl);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ }
+ return ret;
+}
+
+static int balance_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ int ret = 0;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl) {
+ btrfs_info(fs_info, "continuing balance");
+ ret = btrfs_balance(fs_info->balance_ctl, NULL);
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+
+ return ret;
+}
+
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *tsk;
+
+ spin_lock(&fs_info->balance_lock);
+ if (!fs_info->balance_ctl) {
+ spin_unlock(&fs_info->balance_lock);
+ return 0;
+ }
+ spin_unlock(&fs_info->balance_lock);
+
+ if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ btrfs_info(fs_info, "force skipping balance");
+ return 0;
+ }
+
+ tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
+ return PTR_ERR_OR_ZERO(tsk);
+}
+
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) { /* ret = -ENOENT; */
+ ret = 0;
+ goto out;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ bctl->fs_info = fs_info;
+ bctl->flags = btrfs_balance_flags(leaf, item);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+
+ btrfs_balance_data(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+ btrfs_balance_meta(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+ btrfs_balance_sys(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+ WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ set_balance_control(bctl);
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ if (atomic_read(&fs_info->balance_running)) {
+ atomic_inc(&fs_info->balance_pause_req);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+
+ mutex_lock(&fs_info->balance_mutex);
+ /* we are good with balance_ctl ripped off from under us */
+ BUG_ON(atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_pause_req);
+ } else {
+ ret = -ENOTCONN;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->balance_cancel_req);
+ /*
+ * if a balance is running, just wait and return; the balance item
+ * is deleted in btrfs_balance() in that case
+ */
+ if (atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+ mutex_lock(&fs_info->balance_mutex);
+ } else {
+ /* __cancel_balance needs volume_mutex */
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl)
+ __cancel_balance(fs_info);
+
+ mutex_unlock(&fs_info->volume_mutex);
+ }
+
+ BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_cancel_req);
+ mutex_unlock(&fs_info->balance_mutex);
+ return 0;
+}
+
+static int btrfs_uuid_scan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_key key;
+ struct btrfs_key max_key;
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+ struct extent_buffer *eb;
+ int slot;
+ struct btrfs_root_item root_item;
+ u32 item_size;
+ struct btrfs_trans_handle *trans = NULL;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+
+ max_key.objectid = (u64)-1;
+ max_key.type = BTRFS_ROOT_ITEM_KEY;
+ max_key.offset = (u64)-1;
+
+ path->keep_locks = 1;
+
+ while (1) {
+ ret = btrfs_search_forward(root, &key, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+ if (key.type != BTRFS_ROOT_ITEM_KEY ||
+ (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
+ key.objectid != BTRFS_FS_TREE_OBJECTID) ||
+ key.objectid > BTRFS_LAST_FREE_OBJECTID)
+ goto skip;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ if (item_size < sizeof(root_item))
+ goto skip;
+
+ read_extent_buffer(eb, &root_item,
+ btrfs_item_ptr_offset(eb, slot),
+ (int)sizeof(root_item));
+ if (btrfs_root_refs(&root_item) == 0)
+ goto skip;
+
+ if (!btrfs_is_empty_uuid(root_item.uuid) ||
+ !btrfs_is_empty_uuid(root_item.received_uuid)) {
+ if (trans)
+ goto update_tree;
+
+ btrfs_release_path(path);
+ /*
+ * 1 - subvol uuid item
+ * 1 - received_subvol uuid item
+ */
+ trans = btrfs_start_transaction(fs_info->uuid_root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ continue;
+ } else {
+ goto skip;
+ }
+update_tree:
+ if (!btrfs_is_empty_uuid(root_item.uuid)) {
+ ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+ root_item.uuid,
+ BTRFS_UUID_KEY_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+ if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+ root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+skip:
+ if (trans) {
+ ret = btrfs_end_transaction(trans, fs_info->uuid_root);
+ trans = NULL;
+ if (ret)
+ break;
+ }
+
+ btrfs_release_path(path);
+ if (key.offset < (u64)-1) {
+ key.offset++;
+ } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ } else if (key.objectid < (u64)-1) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.objectid++;
+ } else {
+ break;
+ }
+ cond_resched();
+ }
+
+out:
+ btrfs_free_path(path);
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans, fs_info->uuid_root);
+ if (ret)
+ btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
+ else
+ fs_info->update_uuid_tree_gen = 1;
+ up(&fs_info->uuid_tree_rescan_sem);
+ return 0;
+}
+
+/*
+ * Callback for btrfs_uuid_tree_iterate().
+ * returns:
+ * 0 check succeeded, the entry is not outdated.
+ * < 0 if an error occurred.
+ * > 0 if the check failed, which means the caller shall remove the entry.
+ */
+static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
+ u8 *uuid, u8 type, u64 subid)
+{
+ struct btrfs_key key;
+ int ret = 0;
+ struct btrfs_root *subvol_root;
+
+ if (type != BTRFS_UUID_KEY_SUBVOL &&
+ type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+ goto out;
+
+ key.objectid = subid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(subvol_root)) {
+ ret = PTR_ERR(subvol_root);
+ if (ret == -ENOENT)
+ ret = 1;
+ goto out;
+ }
+
+ switch (type) {
+ case BTRFS_UUID_KEY_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.received_uuid,
+ BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ }
+
+out:
return ret;
}
+static int btrfs_uuid_rescan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+ int ret;
+
+ /*
+ * 1st step is to iterate through the existing UUID tree and
+ * to delete all entries that contain outdated data.
+ * 2nd step is to add all missing entries to the UUID tree.
+ */
+ ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
+ up(&fs_info->uuid_tree_rescan_sem);
+ return ret;
+ }
+ return btrfs_uuid_scan_kthread(data);
+}
+
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *uuid_root;
+ struct task_struct *task;
+ int ret;
+
+ /*
+ * 1 - root node
+ * 1 - root item
+ */
+ trans = btrfs_start_transaction(tree_root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ uuid_root = btrfs_create_tree(trans, fs_info,
+ BTRFS_UUID_TREE_OBJECTID);
+ if (IS_ERR(uuid_root)) {
+ btrfs_abort_transaction(trans, tree_root,
+ PTR_ERR(uuid_root));
+ return PTR_ERR(uuid_root);
+ }
+
+ fs_info->uuid_root = uuid_root;
+
+ ret = btrfs_commit_transaction(trans, tree_root);
+ if (ret)
+ return ret;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+ /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
+ btrfs_warn(fs_info, "failed to start uuid_scan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
+ }
+
+ return 0;
+}
+
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+ /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
+ btrfs_warn(fs_info, "failed to start uuid_rescan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
+ }
+
+ return 0;
+}
+
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -2144,12 +3841,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
- if (new_size >= device->total_bytes)
+ if (device->is_tgtdev_for_dev_replace)
return -EINVAL;
path = btrfs_alloc_path();
@@ -2161,8 +3858,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
lock_chunks(root);
device->total_bytes = new_size;
- if (device->writeable)
+ if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space -= diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
unlock_chunks(root);
again:
@@ -2170,7 +3871,7 @@ again:
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
- while (1) {
+ do {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto done;
@@ -2180,7 +3881,7 @@ again:
goto done;
if (ret) {
ret = 0;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
break;
}
@@ -2189,7 +3890,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
break;
}
@@ -2197,14 +3898,14 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
break;
}
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
chunk_offset);
@@ -2212,8 +3913,7 @@ again:
goto done;
if (ret == -ENOSPC)
failed++;
- key.offset -= 1;
- }
+ } while (key.offset-- > 0);
if (failed && !retried) {
failed = 0;
@@ -2226,6 +3926,9 @@ again:
device->total_bytes = old_size;
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
unlock_chunks(root);
goto done;
}
@@ -2256,18 +3959,18 @@ done:
return ret;
}
-static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_root *root,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
array_size = btrfs_super_sys_array_size(super_copy);
- if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ if (array_size + item_size + sizeof(disk_key)
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
return -EFBIG;
ptr = super_copy->sys_chunk_array + array_size;
@@ -2280,275 +3983,312 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
return 0;
}
-static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
- int num_stripes, int sub_stripes)
+/*
+ * sort the devices in descending order by max_avail, then by total_avail
+ */
+static int btrfs_cmp_device_info(const void *a, const void *b)
{
- if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
- return calc_size;
- else if (type & BTRFS_BLOCK_GROUP_RAID10)
- return calc_size * (num_stripes / sub_stripes);
- else
- return calc_size * num_stripes;
-}
+ const struct btrfs_device_info *di_a = a;
+ const struct btrfs_device_info *di_b = b;
-/* Used to sort the devices by max_avail(descending sort) */
-int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
-{
- if (((struct btrfs_device_info *)dev_info1)->max_avail >
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ if (di_a->max_avail > di_b->max_avail)
return -1;
- else if (((struct btrfs_device_info *)dev_info1)->max_avail <
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ if (di_a->max_avail < di_b->max_avail)
return 1;
- else
- return 0;
+ if (di_a->total_avail > di_b->total_avail)
+ return -1;
+ if (di_a->total_avail < di_b->total_avail)
+ return 1;
+ return 0;
}
-static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
- int *num_stripes, int *min_stripes,
- int *sub_stripes)
+static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+ .dev_stripes = 1,
+ .devs_max = 0, /* 0 == as many as possible */
+ .devs_min = 4,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID1] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 2,
+ .devs_min = 2,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_DUP] = {
+ .sub_stripes = 1,
+ .dev_stripes = 2,
+ .devs_max = 1,
+ .devs_min = 1,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID0] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_SINGLE] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 1,
+ .devs_min = 1,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_RAID5] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID6] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .devs_increment = 1,
+ .ncopies = 3,
+ },
+};
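+/*
+ * For example, for BTRFS_BLOCK_GROUP_RAID10 the allocator below reads
+ * devs_min = 4 and devs_increment = 2 from this table: it needs at least
+ * four writable devices, always uses an even number of them, places
+ * dev_stripes = 1 stripe on each device and keeps ncopies = 2 copies of
+ * every block.
+ */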
+
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
- *num_stripes = 1;
- *min_stripes = 1;
- *sub_stripes = 0;
+ /* TODO allow them to set a preferred stripe size */
+ return 64 * 1024;
+}
- if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- *num_stripes = fs_devices->rw_devices;
- *min_stripes = 2;
- }
- if (type & (BTRFS_BLOCK_GROUP_DUP)) {
- *num_stripes = 2;
- *min_stripes = 2;
- }
- if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
- if (fs_devices->rw_devices < 2)
- return -ENOSPC;
- *num_stripes = 2;
- *min_stripes = 2;
- }
- if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- *num_stripes = fs_devices->rw_devices;
- if (*num_stripes < 4)
- return -ENOSPC;
- *num_stripes &= ~(u32)1;
- *sub_stripes = 2;
- *min_stripes = 4;
- }
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+ return;
- return 0;
+ btrfs_set_fs_incompat(info, RAID56);
}
-static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
- u64 proposed_size, u64 type,
- int num_stripes, int small_stripe)
+#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \
+ - sizeof(struct btrfs_item) \
+ - sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
+#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
+ - 2 * sizeof(struct btrfs_disk_key) \
+ - 2 * sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
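+/*
+ * The two limits above bound how many stripes fit in a single chunk item:
+ * the available space (a tree leaf, or the superblock's sys_chunk_array)
+ * minus the item/key and chunk headers, divided by the size of one on-disk
+ * stripe, plus the stripe embedded in struct btrfs_chunk itself.
+ */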
+
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 start,
+ u64 type)
{
- int min_stripe_size = 1 * 1024 * 1024;
- u64 calc_size = proposed_size;
- u64 max_chunk_size = calc_size;
- int ncopies = 1;
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct list_head *cur;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_device_info *devices_info = NULL;
+ u64 total_avail;
+ int num_stripes; /* total number of stripes to allocate */
+ int data_stripes; /* number of stripes that count for
+ block group size */
+ int sub_stripes; /* sub_stripes info for map */
+ int dev_stripes; /* stripes per dev */
+ int devs_max; /* max devs to use */
+ int devs_min; /* min devs needed */
+ int devs_increment; /* ndevs has to be a multiple of this */
+ int ncopies; /* how many copies of the data we have */
+ int ret;
+ u64 max_stripe_size;
+ u64 max_chunk_size;
+ u64 stripe_size;
+ u64 num_bytes;
+ u64 raid_stripe_len = BTRFS_STRIPE_LEN;
+ int ndevs;
+ int i;
+ int j;
+ int index;
- if (type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID10))
- ncopies = 2;
+ BUG_ON(!alloc_profile_is_valid(type, 0));
+
+ if (list_empty(&fs_devices->alloc_list))
+ return -ENOSPC;
+
+ index = __get_raid_index(type);
+
+ sub_stripes = btrfs_raid_array[index].sub_stripes;
+ dev_stripes = btrfs_raid_array[index].dev_stripes;
+ devs_max = btrfs_raid_array[index].devs_max;
+ devs_min = btrfs_raid_array[index].devs_min;
+ devs_increment = btrfs_raid_array[index].devs_increment;
+ ncopies = btrfs_raid_array[index].ncopies;
if (type & BTRFS_BLOCK_GROUP_DATA) {
- max_chunk_size = 10 * calc_size;
- min_stripe_size = 64 * 1024 * 1024;
+ max_stripe_size = 1024 * 1024 * 1024;
+ max_chunk_size = 10 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- max_chunk_size = 256 * 1024 * 1024;
- min_stripe_size = 32 * 1024 * 1024;
+ /* for larger filesystems, use larger metadata chunks */
+ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+ max_stripe_size = 1024 * 1024 * 1024;
+ else
+ max_stripe_size = 256 * 1024 * 1024;
+ max_chunk_size = max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- calc_size = 8 * 1024 * 1024;
- max_chunk_size = calc_size * 2;
- min_stripe_size = 1 * 1024 * 1024;
+ max_stripe_size = 32 * 1024 * 1024;
+ max_chunk_size = 2 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
+ } else {
+ btrfs_err(info, "invalid chunk type 0x%llx requested",
+ type);
+ BUG_ON(1);
}
/* we don't want a chunk larger than 10% of writeable space */
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
- if (calc_size * num_stripes > max_chunk_size * ncopies) {
- calc_size = max_chunk_size * ncopies;
- do_div(calc_size, num_stripes);
- do_div(calc_size, BTRFS_STRIPE_LEN);
- calc_size *= BTRFS_STRIPE_LEN;
- }
+ devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
- /* we don't want tiny stripes */
- if (!small_stripe)
- calc_size = max_t(u64, min_stripe_size, calc_size);
+ cur = fs_devices->alloc_list.next;
/*
- * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
- * we end up with something bigger than a stripe
+ * in the first pass through the devices list, we gather information
+ * about the available holes on each device.
*/
- calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
-
- do_div(calc_size, BTRFS_STRIPE_LEN);
- calc_size *= BTRFS_STRIPE_LEN;
+ ndevs = 0;
+ while (cur != &fs_devices->alloc_list) {
+ struct btrfs_device *device;
+ u64 max_avail;
+ u64 dev_offset;
- return calc_size;
-}
-
-static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
- int num_stripes)
-{
- struct map_lookup *new;
- size_t len = map_lookup_size(num_stripes);
+ device = list_entry(cur, struct btrfs_device, dev_alloc_list);
- BUG_ON(map->num_stripes < num_stripes);
+ cur = cur->next;
- if (map->num_stripes == num_stripes)
- return map;
+ if (!device->writeable) {
+ WARN(1, KERN_ERR
+ "BTRFS: read-only device in alloc_list\n");
+ continue;
+ }
- new = kmalloc(len, GFP_NOFS);
- if (!new) {
- /* just change map->num_stripes */
- map->num_stripes = num_stripes;
- return map;
- }
+ if (!device->in_fs_metadata ||
+ device->is_tgtdev_for_dev_replace)
+ continue;
- memcpy(new, map, len);
- new->num_stripes = num_stripes;
- kfree(map);
- return new;
-}
+ if (device->total_bytes > device->bytes_used)
+ total_avail = device->total_bytes - device->bytes_used;
+ else
+ total_avail = 0;
-/*
- * helper to allocate device space from btrfs_device_info, in which we stored
- * max free space information of every device. It is used when we can not
- * allocate chunks by default size.
- *
- * By this helper, we can allocate a new chunk as larger as possible.
- */
-static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_devices *fs_devices,
- struct btrfs_device_info *devices,
- int nr_device, u64 type,
- struct map_lookup **map_lookup,
- int min_stripes, u64 *stripe_size)
-{
- int i, index, sort_again = 0;
- int min_devices = min_stripes;
- u64 max_avail, min_free;
- struct map_lookup *map = *map_lookup;
- int ret;
+ /* If there is no space on this device, skip it. */
+ if (total_avail == 0)
+ continue;
- if (nr_device < min_stripes)
- return -ENOSPC;
+ ret = find_free_dev_extent(trans, device,
+ max_stripe_size * dev_stripes,
+ &dev_offset, &max_avail);
+ if (ret && ret != -ENOSPC)
+ goto error;
- btrfs_descending_sort_devices(devices, nr_device);
+ if (ret == 0)
+ max_avail = max_stripe_size * dev_stripes;
- max_avail = devices[0].max_avail;
- if (!max_avail)
- return -ENOSPC;
+ if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
+ continue;
- for (i = 0; i < nr_device; i++) {
- /*
- * if dev_offset = 0, it means the free space of this device
- * is less than what we need, and we didn't search max avail
- * extent on this device, so do it now.
- */
- if (!devices[i].dev_offset) {
- ret = find_free_dev_extent(trans, devices[i].dev,
- max_avail,
- &devices[i].dev_offset,
- &devices[i].max_avail);
- if (ret != 0 && ret != -ENOSPC)
- return ret;
- sort_again = 1;
+ if (ndevs == fs_devices->rw_devices) {
+ WARN(1, "%s: found more than %llu devices\n",
+ __func__, fs_devices->rw_devices);
+ break;
}
+ devices_info[ndevs].dev_offset = dev_offset;
+ devices_info[ndevs].max_avail = max_avail;
+ devices_info[ndevs].total_avail = total_avail;
+ devices_info[ndevs].dev = device;
+ ++ndevs;
}
- /* we update the max avail free extent of each devices, sort again */
- if (sort_again)
- btrfs_descending_sort_devices(devices, nr_device);
-
- if (type & BTRFS_BLOCK_GROUP_DUP)
- min_devices = 1;
-
- if (!devices[min_devices - 1].max_avail)
- return -ENOSPC;
+ /*
+ * now sort the devices by hole size / available space
+ */
+ sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_info, NULL);
- max_avail = devices[min_devices - 1].max_avail;
- if (type & BTRFS_BLOCK_GROUP_DUP)
- do_div(max_avail, 2);
+ /* round ndevs down to a multiple of devs_increment */
+ ndevs -= ndevs % devs_increment;
- max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
- min_stripes, 1);
- if (type & BTRFS_BLOCK_GROUP_DUP)
- min_free = max_avail * 2;
- else
- min_free = max_avail;
+ if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
+ ret = -ENOSPC;
+ goto error;
+ }
- if (min_free > devices[min_devices - 1].max_avail)
- return -ENOSPC;
+ if (devs_max && ndevs > devs_max)
+ ndevs = devs_max;
+ /*
+ * the primary goal is to maximize the number of stripes, so use as many
+ * devices as possible, even if the stripes are not maximum sized.
+ */
+ stripe_size = devices_info[ndevs-1].max_avail;
+ num_stripes = ndevs * dev_stripes;
- map = __shrink_map_lookup_stripes(map, min_stripes);
- *stripe_size = max_avail;
+ /*
+ * this will have to be fixed for RAID1 and RAID10 over
+ * more drives
+ */
+ data_stripes = num_stripes / ncopies;
- index = 0;
- for (i = 0; i < min_stripes; i++) {
- map->stripes[i].dev = devices[index].dev;
- map->stripes[i].physical = devices[index].dev_offset;
- if (type & BTRFS_BLOCK_GROUP_DUP) {
- i++;
- map->stripes[i].dev = devices[index].dev;
- map->stripes[i].physical = devices[index].dev_offset +
- max_avail;
- }
- index++;
+ if (type & BTRFS_BLOCK_GROUP_RAID5) {
+ raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+ btrfs_super_stripesize(info->super_copy));
+ data_stripes = num_stripes - 1;
+ }
+ if (type & BTRFS_BLOCK_GROUP_RAID6) {
+ raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+ btrfs_super_stripesize(info->super_copy));
+ data_stripes = num_stripes - 2;
}
- *map_lookup = map;
- return 0;
-}
+ /*
+ * Use the number of data stripes to figure out how big this chunk
+ * is really going to be in terms of logical address space,
+ * and compare that answer with the max chunk size
+ */
+ if (stripe_size * data_stripes > max_chunk_size) {
+ u64 mask = (1ULL << 24) - 1;
+ stripe_size = max_chunk_size;
+ do_div(stripe_size, data_stripes);
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes, u64 *stripe_size,
- u64 start, u64 type)
-{
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct list_head *cur;
- struct map_lookup *map;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct btrfs_device_info *devices_info;
- struct list_head private_devs;
- u64 calc_size = 1024 * 1024 * 1024;
- u64 min_free;
- u64 avail;
- u64 dev_offset;
- int num_stripes;
- int min_stripes;
- int sub_stripes;
- int min_devices; /* the min number of devices we need */
- int i;
- int ret;
- int index;
+ /* bump the answer up to a 16MB boundary */
+ stripe_size = (stripe_size + mask) & ~mask;
- if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
- (type & BTRFS_BLOCK_GROUP_DUP)) {
- WARN_ON(1);
- type &= ~BTRFS_BLOCK_GROUP_DUP;
+ /* but don't go higher than the limits we found
+ * while searching for free extents
+ */
+ if (stripe_size > devices_info[ndevs-1].max_avail)
+ stripe_size = devices_info[ndevs-1].max_avail;
}
- if (list_empty(&fs_devices->alloc_list))
- return -ENOSPC;
- ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
- &min_stripes, &sub_stripes);
- if (ret)
- return ret;
+ do_div(stripe_size, dev_stripes);
- devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
- GFP_NOFS);
- if (!devices_info)
- return -ENOMEM;
+ /* align to BTRFS_STRIPE_LEN */
+ do_div(stripe_size, raid_stripe_len);
+ stripe_size *= raid_stripe_len;
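+ /*
+ * At this point stripe_size started from the smallest hole on the
+ * chosen devices, was capped near max_chunk_size / data_stripes
+ * (rounded up to a 16MB boundary, but never above that smallest
+ * hole), was divided by dev_stripes, and was finally rounded down
+ * to a multiple of raid_stripe_len.
+ */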
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
@@ -2557,183 +4297,156 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
}
map->num_stripes = num_stripes;
- cur = fs_devices->alloc_list.next;
- index = 0;
- i = 0;
-
- calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
- num_stripes, 0);
-
- if (type & BTRFS_BLOCK_GROUP_DUP) {
- min_free = calc_size * 2;
- min_devices = 1;
- } else {
- min_free = calc_size;
- min_devices = min_stripes;
- }
-
- INIT_LIST_HEAD(&private_devs);
- while (index < num_stripes) {
- device = list_entry(cur, struct btrfs_device, dev_alloc_list);
- BUG_ON(!device->writeable);
- if (device->total_bytes > device->bytes_used)
- avail = device->total_bytes - device->bytes_used;
- else
- avail = 0;
- cur = cur->next;
-
- if (device->in_fs_metadata && avail >= min_free) {
- ret = find_free_dev_extent(trans, device, min_free,
- &devices_info[i].dev_offset,
- &devices_info[i].max_avail);
- if (ret == 0) {
- list_move_tail(&device->dev_alloc_list,
- &private_devs);
- map->stripes[index].dev = device;
- map->stripes[index].physical =
- devices_info[i].dev_offset;
- index++;
- if (type & BTRFS_BLOCK_GROUP_DUP) {
- map->stripes[index].dev = device;
- map->stripes[index].physical =
- devices_info[i].dev_offset +
- calc_size;
- index++;
- }
- } else if (ret != -ENOSPC)
- goto error;
-
- devices_info[i].dev = device;
- i++;
- } else if (device->in_fs_metadata &&
- avail >= BTRFS_STRIPE_LEN) {
- devices_info[i].dev = device;
- devices_info[i].max_avail = avail;
- i++;
- }
-
- if (cur == &fs_devices->alloc_list)
- break;
- }
-
- list_splice(&private_devs, &fs_devices->alloc_list);
- if (index < num_stripes) {
- if (index >= min_stripes) {
- num_stripes = index;
- if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- num_stripes /= sub_stripes;
- num_stripes *= sub_stripes;
- }
-
- map = __shrink_map_lookup_stripes(map, num_stripes);
- } else if (i >= min_devices) {
- ret = __btrfs_alloc_tiny_space(trans, fs_devices,
- devices_info, i, type,
- &map, min_stripes,
- &calc_size);
- if (ret)
- goto error;
- } else {
- ret = -ENOSPC;
- goto error;
+ for (i = 0; i < ndevs; ++i) {
+ for (j = 0; j < dev_stripes; ++j) {
+ int s = i * dev_stripes + j;
+ map->stripes[s].dev = devices_info[i].dev;
+ map->stripes[s].physical = devices_info[i].dev_offset +
+ j * stripe_size;
}
}
map->sector_size = extent_root->sectorsize;
- map->stripe_len = BTRFS_STRIPE_LEN;
- map->io_align = BTRFS_STRIPE_LEN;
- map->io_width = BTRFS_STRIPE_LEN;
+ map->stripe_len = raid_stripe_len;
+ map->io_align = raid_stripe_len;
+ map->io_width = raid_stripe_len;
map->type = type;
map->sub_stripes = sub_stripes;
- *map_ret = map;
- *stripe_size = calc_size;
- *num_bytes = chunk_bytes_by_type(type, calc_size,
- map->num_stripes, sub_stripes);
+ num_bytes = stripe_size * data_stripes;
- em = alloc_extent_map(GFP_NOFS);
+ trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
+
+ em = alloc_extent_map();
if (!em) {
+ kfree(map);
ret = -ENOMEM;
goto error;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = start;
- em->len = *num_bytes;
+ em->len = num_bytes;
em->block_start = 0;
em->block_len = em->len;
+ em->orig_block_len = stripe_size;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
+ ret = add_extent_mapping(em_tree, em, 0);
+ if (!ret) {
+ list_add_tail(&em->list, &trans->transaction->pending_chunks);
+ atomic_inc(&em->refs);
+ }
write_unlock(&em_tree->lock);
- BUG_ON(ret);
- free_extent_map(em);
+ if (ret) {
+ free_extent_map(em);
+ goto error;
+ }
ret = btrfs_make_block_group(trans, extent_root, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, *num_bytes);
- BUG_ON(ret);
-
- index = 0;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
- dev_offset = map->stripes[index].physical;
+ start, num_bytes);
+ if (ret)
+ goto error_del_extent;
- ret = btrfs_alloc_dev_extent(trans, device,
- info->chunk_root->root_key.objectid,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, dev_offset, calc_size);
- BUG_ON(ret);
- index++;
- }
+ free_extent_map(em);
+ check_raid56_incompat_flag(extent_root->fs_info, type);
kfree(devices_info);
return 0;
+error_del_extent:
+ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ /* One for our allocation */
+ free_extent_map(em);
+ /* One for the tree reference */
+ free_extent_map(em);
error:
- kfree(map);
kfree(devices_info);
return ret;
}
-static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
- struct map_lookup *map, u64 chunk_offset,
- u64 chunk_size, u64 stripe_size)
+ u64 chunk_offset, u64 chunk_size)
{
- u64 dev_offset;
struct btrfs_key key;
struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- size_t item_size = btrfs_chunk_item_size(map->num_stripes);
- int index = 0;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ size_t item_size;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i = 0;
int ret;
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(extent_root->fs_info, "unable to find logical "
+ "%Lu len %Lu", chunk_offset, chunk_size);
+ return -EINVAL;
+ }
+
+ if (em->start != chunk_offset || em->len != chunk_size) {
+ btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+ " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
+ chunk_size, em->start, em->len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
+
+ map = (struct map_lookup *)em->bdev;
+ item_size = btrfs_chunk_item_size(map->num_stripes);
+ stripe_size = em->orig_block_len;
+
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk)
- return -ENOMEM;
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
- index = 0;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
- BUG_ON(ret);
- index++;
+ if (ret)
+ goto out;
+ ret = btrfs_alloc_dev_extent(trans, device,
+ chunk_root->root_key.objectid,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ goto out;
}
- index = 0;
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+ extent_root->fs_info->free_chunk_space -= (stripe_size *
+ map->num_stripes);
+ spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
stripe = &chunk->stripe;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
- dev_offset = map->stripes[index].physical;
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
- index++;
}
btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -2751,15 +4464,19 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.offset = chunk_offset;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
- BUG_ON(ret);
-
- if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+ if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ /*
+ * TODO: Clean up the chunk item inserted into the chunk
+ * root in case of failure.
+ */
+ ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
item_size);
- BUG_ON(ret);
}
+
+out:
kfree(chunk);
- return 0;
+ free_extent_map(em);
+ return ret;
}
/*
@@ -2773,26 +4490,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 type)
{
u64 chunk_offset;
- u64 chunk_size;
- u64 stripe_size;
- struct map_lookup *map;
- struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
- int ret;
- ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- &chunk_offset);
- if (ret)
- return ret;
-
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, type);
- if (ret)
- return ret;
-
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- BUG_ON(ret);
- return 0;
+ chunk_offset = find_next_chunk(extent_root->fs_info);
+ return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
}
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -2801,60 +4501,32 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
{
u64 chunk_offset;
u64 sys_chunk_offset;
- u64 chunk_size;
- u64 sys_chunk_size;
- u64 stripe_size;
- u64 sys_stripe_size;
u64 alloc_profile;
- struct map_lookup *map;
- struct map_lookup *sys_map;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
- ret = find_next_chunk(fs_info->chunk_root,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
- BUG_ON(ret);
-
- alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
- (fs_info->metadata_alloc_profile &
- fs_info->avail_metadata_alloc_bits);
- alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, alloc_profile);
- BUG_ON(ret);
-
- sys_chunk_offset = chunk_offset + chunk_size;
-
- alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
- (fs_info->system_alloc_profile &
- fs_info->avail_system_alloc_bits);
- alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+ chunk_offset = find_next_chunk(fs_info);
+ alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
+ ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+ alloc_profile);
+ if (ret)
+ return ret;
- ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
- &sys_chunk_size, &sys_stripe_size,
- sys_chunk_offset, alloc_profile);
- BUG_ON(ret);
+ sys_chunk_offset = find_next_chunk(root->fs_info);
+ alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
+ ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+ alloc_profile);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- BUG_ON(ret);
-
- /*
- * Modifying chunk tree needs allocating new blocks from both
- * system block group and metadata block group. So we only can
- * do operations require modifying the chunk tree after both
- * block groups were created.
- */
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- BUG_ON(ret);
-
- ret = __finish_chunk_alloc(trans, extent_root, sys_map,
- sys_chunk_offset, sys_chunk_size,
- sys_stripe_size);
- BUG_ON(ret);
- return 0;
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+out:
+ return ret;
}
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
@@ -2889,7 +4561,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
- extent_map_tree_init(&tree->map_tree, GFP_NOFS);
+ extent_map_tree_init(&tree->map_tree);
}
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
@@ -2904,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
write_unlock(&tree->map_tree.lock);
if (!em)
break;
- kfree(em->bdev);
/* once for us */
free_extent_map(em);
/* once for the tree */
@@ -2912,8 +4583,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
}
}
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -2922,152 +4594,388 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
read_unlock(&em_tree->lock);
- BUG_ON(!em);
- BUG_ON(em->start > logical || em->start + em->len < logical);
+ /*
+ * We could return errors for these cases, but that could get ugly and
+ * we'd probably just give up and exit anyway, so return 1 so the
+ * callers don't try to use other copies.
+ */
+ if (!em) {
+ btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
+ logical+len);
+ return 1;
+ }
+
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
+ "%Lu-%Lu", logical, logical+len, em->start,
+ em->start + em->len);
+ free_extent_map(em);
+ return 1;
+ }
+
map = (struct map_lookup *)em->bdev;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ ret = 2;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ ret = 3;
else
ret = 1;
free_extent_map(em);
+
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+ ret++;
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+ return ret;
+}
+
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+ struct btrfs_mapping_tree *map_tree,
+ u64 logical)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ unsigned long len = root->sectorsize;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ len = map->stripe_len * nr_data_stripes(map);
+ }
+ free_extent_map(em);
+ return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+ u64 logical, u64 len, int mirror_num)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ int ret = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6))
+ ret = 1;
+ free_extent_map(em);
return ret;
}
-static int find_live_mirror(struct map_lookup *map, int first, int num,
- int optimal)
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+ struct map_lookup *map, int first, int num,
+ int optimal, int dev_replace_is_ongoing)
{
int i;
- if (map->stripes[optimal].dev->bdev)
- return optimal;
- for (i = first; i < first + num; i++) {
- if (map->stripes[i].dev->bdev)
- return i;
+ int tolerance;
+ struct btrfs_device *srcdev;
+
+ if (dev_replace_is_ongoing &&
+ fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+ srcdev = fs_info->dev_replace.srcdev;
+ else
+ srcdev = NULL;
+
+ /*
+ * try to avoid the drive that is the source drive for a
+ * dev-replace procedure, only choose it if no other non-missing
+ * mirror is available
+ */
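+ /* pass 0 (tolerance == 0) skips srcdev entirely; pass 1 falls back to it */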
+ for (tolerance = 0; tolerance < 2; tolerance++) {
+ if (map->stripes[optimal].dev->bdev &&
+ (tolerance || map->stripes[optimal].dev != srcdev))
+ return optimal;
+ for (i = first; i < first + num; i++) {
+ if (map->stripes[i].dev->bdev &&
+ (tolerance || map->stripes[i].dev != srcdev))
+ return i;
+ }
}
+
/* we couldn't find one that doesn't fail. Just return something
* and the io error handling code will clean up eventually
*/
return optimal;
}
-static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+static inline int parity_smaller(u64 a, u64 b)
+{
+ return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+ struct btrfs_bio_stripe s;
+ int i;
+ u64 l;
+ int again = 1;
+
+ while (again) {
+ again = 0;
+ for (i = 0; i < bbio->num_stripes - 1; i++) {
+ if (parity_smaller(raid_map[i], raid_map[i+1])) {
+ s = bbio->stripes[i];
+ l = raid_map[i];
+ bbio->stripes[i] = bbio->stripes[i+1];
+ raid_map[i] = raid_map[i+1];
+ bbio->stripes[i+1] = s;
+ raid_map[i+1] = l;
+ again = 1;
+ }
+ }
+ }
+}
+
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret,
- int mirror_num, struct page *unplug_page)
+ struct btrfs_bio **bbio_ret,
+ int mirror_num, u64 **raid_map_ret)
{
struct extent_map *em;
struct map_lookup *map;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map_tree *em_tree = &map_tree->map_tree;
u64 offset;
u64 stripe_offset;
+ u64 stripe_end_offset;
u64 stripe_nr;
- int stripes_allocated = 8;
- int stripes_required = 1;
+ u64 stripe_nr_orig;
+ u64 stripe_nr_end;
+ u64 stripe_len;
+ u64 *raid_map = NULL;
int stripe_index;
int i;
+ int ret = 0;
int num_stripes;
int max_errors = 0;
- struct btrfs_multi_bio *multi = NULL;
-
- if (multi_ret && !(rw & REQ_WRITE))
- stripes_allocated = 1;
-again:
- if (multi_ret) {
- multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
- GFP_NOFS);
- if (!multi)
- return -ENOMEM;
-
- atomic_set(&multi->error, 0);
- }
+ struct btrfs_bio *bbio = NULL;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int dev_replace_is_ongoing = 0;
+ int num_alloc_stripes;
+ int patch_the_first_stripe_for_dev_replace = 0;
+ u64 physical_to_patch_in_first_stripe = 0;
+ u64 raid56_full_stripe_start = (u64)-1;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
read_unlock(&em_tree->lock);
- if (!em && unplug_page) {
- kfree(multi);
- return 0;
+ if (!em) {
+ btrfs_crit(fs_info, "unable to find logical %llu len %llu",
+ logical, *length);
+ return -EINVAL;
}
- if (!em) {
- printk(KERN_CRIT "unable to find logical %llu len %llu\n",
- (unsigned long long)logical,
- (unsigned long long)*length);
- BUG();
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
+ "found %Lu-%Lu", logical, em->start,
+ em->start + em->len);
+ free_extent_map(em);
+ return -EINVAL;
}
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
offset = logical - em->start;
- if (mirror_num > map->num_stripes)
- mirror_num = 0;
-
- /* if our multi bio struct is too small, back off and try again */
- if (rw & REQ_WRITE) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_DUP)) {
- stripes_required = map->num_stripes;
- max_errors = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- stripes_required = map->sub_stripes;
- max_errors = 1;
- }
- }
- if (multi_ret && (rw & REQ_WRITE) &&
- stripes_allocated < stripes_required) {
- stripes_allocated = map->num_stripes;
- free_extent_map(em);
- kfree(multi);
- goto again;
- }
+ stripe_len = map->stripe_len;
stripe_nr = offset;
/*
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
- do_div(stripe_nr, map->stripe_len);
+ do_div(stripe_nr, stripe_len);
- stripe_offset = stripe_nr * map->stripe_len;
+ stripe_offset = stripe_nr * stripe_len;
BUG_ON(offset < stripe_offset);
/* stripe_offset is the offset of this block in its stripe */
stripe_offset = offset - stripe_offset;
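+ /*
+ * Example: with a 64KB stripe_len, an offset of 200KB into the chunk
+ * gives stripe_nr = 3 (three whole stripes strided over) and
+ * stripe_offset = 8KB into the fourth stripe.
+ */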
- if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP)) {
- /* we limit the length of each bio to what fits in a stripe */
- *length = min_t(u64, em->len - offset,
- map->stripe_len - stripe_offset);
+ /* if we're here for raid56, we need to know the stripe aligned start */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+ raid56_full_stripe_start = offset;
+
+ /* allow a write of a full stripe, but make sure we don't
+ * allow straddling of stripes
+ */
+ do_div(raid56_full_stripe_start, full_stripe_len);
+ raid56_full_stripe_start *= full_stripe_len;
+ }
+
+ if (rw & REQ_DISCARD) {
+ /* we don't discard raid56 yet */
+ if (map->type &
+ (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ *length = min_t(u64, em->len - offset, *length);
+ } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ u64 max_len;
+ /* For writes to RAID[56], allow a full stripeset across all disks.
+ For other RAID types and for RAID[56] reads, just allow a single
+ stripe (on a single disk). */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+ (rw & REQ_WRITE)) {
+ max_len = stripe_len * nr_data_stripes(map) -
+ (offset - raid56_full_stripe_start);
+ } else {
+ /* we limit the length of each bio to what fits in a stripe */
+ max_len = stripe_len - stripe_offset;
+ }
+ *length = min_t(u64, em->len - offset, max_len);
} else {
*length = em->len - offset;
}
- if (!multi_ret && !unplug_page)
+ /* This is for when we're called from btrfs_merge_bio_hook() and all
+ it cares about is the length */
+ if (!bbio_ret)
goto out;
+ btrfs_dev_replace_lock(dev_replace);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (!dev_replace_is_ongoing)
+ btrfs_dev_replace_unlock(dev_replace);
+
+ if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+ !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
+ dev_replace->tgtdev != NULL) {
+ /*
+ * in dev-replace case, for repair case (that's the only
+ * case where the mirror is selected explicitly when
+ * calling btrfs_map_block), blocks left of the left cursor
+ * can also be read from the target drive.
+ * For REQ_GET_READ_MIRRORS, the target drive is added as
+ * the last one to the array of stripes. For READ, it also
+ * needs to be supported using the same mirror number.
+ * If the requested block is not left of the left cursor,
+ * EIO is returned. This can happen because btrfs_num_copies()
+ * returns one more in the dev-replace case.
+ */
+ u64 tmp_length = *length;
+ struct btrfs_bio *tmp_bbio = NULL;
+ int tmp_num_stripes;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+ logical, &tmp_length, &tmp_bbio, 0, NULL);
+ if (ret) {
+ WARN_ON(tmp_bbio != NULL);
+ goto out;
+ }
+
+ tmp_num_stripes = tmp_bbio->num_stripes;
+ if (mirror_num > tmp_num_stripes) {
+ /*
+ * REQ_GET_READ_MIRRORS does not contain this
+ * mirror, which means that the requested area
+ * is not left of the left cursor
+ */
+ ret = -EIO;
+ kfree(tmp_bbio);
+ goto out;
+ }
+
+ /*
+ * process the rest of the function using the mirror_num
+ * of the source drive. Therefore look it up first.
+ * At the end, patch the device pointer to point at the
+ * target drive.
+ */
+ for (i = 0; i < tmp_num_stripes; i++) {
+ if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it
+ * simple, only add the mirror with the
+ * lowest physical address
+ */
+ if (found &&
+ physical_of_found <=
+ tmp_bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found =
+ tmp_bbio->stripes[i].physical;
+ }
+ }
+
+ if (found) {
+ mirror_num = index_srcdev + 1;
+ patch_the_first_stripe_for_dev_replace = 1;
+ physical_to_patch_in_first_stripe = physical_of_found;
+ } else {
+ WARN_ON(1);
+ ret = -EIO;
+ kfree(tmp_bbio);
+ goto out;
+ }
+
+ kfree(tmp_bbio);
+ } else if (mirror_num > map->num_stripes) {
+ mirror_num = 0;
+ }
+
num_stripes = 1;
stripe_index = 0;
- if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (unplug_page || (rw & REQ_WRITE))
+ stripe_nr_orig = stripe_nr;
+ stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
+ do_div(stripe_nr_end, map->stripe_len);
+ stripe_end_offset = stripe_nr_end * map->stripe_len -
+ (offset + *length);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ if (rw & REQ_DISCARD)
+ num_stripes = min_t(u64, map->num_stripes,
+ stripe_nr_end - stripe_nr_orig);
+ stripe_index = do_div(stripe_nr, map->num_stripes);
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
else {
- stripe_index = find_live_mirror(map, 0,
+ stripe_index = find_live_mirror(fs_info, map, 0,
map->num_stripes,
- current->pid % map->num_stripes);
+ current->pid % map->num_stripes,
+ dev_replace_is_ongoing);
+ mirror_num = stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (rw & REQ_WRITE)
+ if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
num_stripes = map->num_stripes;
- else if (mirror_num)
+ } else if (mirror_num) {
stripe_index = mirror_num - 1;
+ } else {
+ mirror_num = 1;
+ }
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
@@ -3075,14 +4983,81 @@ again:
stripe_index = do_div(stripe_nr, factor);
stripe_index *= map->sub_stripes;
- if (unplug_page || (rw & REQ_WRITE))
+ if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
num_stripes = map->sub_stripes;
+ else if (rw & REQ_DISCARD)
+ num_stripes = min_t(u64, map->sub_stripes *
+ (stripe_nr_end - stripe_nr_orig),
+ map->num_stripes);
else if (mirror_num)
stripe_index += mirror_num - 1;
else {
- stripe_index = find_live_mirror(map, stripe_index,
+ int old_stripe_index = stripe_index;
+ stripe_index = find_live_mirror(fs_info, map,
+ stripe_index,
map->sub_stripes, stripe_index +
- current->pid % map->sub_stripes);
+ current->pid % map->sub_stripes,
+ dev_replace_is_ongoing);
+ mirror_num = stripe_index - old_stripe_index + 1;
+ }
+
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ u64 tmp;
+
+ if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+ && raid_map_ret) {
+ int i, rot;
+
+ /* push stripe_nr back to the start of the full stripe */
+ stripe_nr = raid56_full_stripe_start;
+ do_div(stripe_nr, stripe_len);
+
+ stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+ /* RAID[56] write or recovery. Return all stripes */
+ num_stripes = map->num_stripes;
+ max_errors = nr_parity_stripes(map);
+
+ raid_map = kmalloc_array(num_stripes, sizeof(u64),
+ GFP_NOFS);
+ if (!raid_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Work out the disk rotation on this stripe-set */
+ tmp = stripe_nr;
+ rot = do_div(tmp, num_stripes);
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * nr_data_stripes(map);
+ for (i = 0; i < nr_data_stripes(map); i++)
+ raid_map[(i+rot) % num_stripes] =
+ em->start + (tmp + i) * map->stripe_len;
+
+ raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ raid_map[(i+rot+1) % num_stripes] =
+ RAID6_Q_STRIPE;
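+ /*
+ * Example: RAID5 over 3 devices (nr_data_stripes = 2). Full
+ * stripe N maps its data stripes to em->start + (2N)*stripe_len
+ * and em->start + (2N+1)*stripe_len, and rot = N % 3 rotates the
+ * parity slot by one device for every full stripe.
+ */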
+
+ *length = map->stripe_len;
+ stripe_index = 0;
+ stripe_offset = 0;
+ } else {
+ /*
+ * Mirror #0 or #1 means the original data block.
+ * Mirror #2 is RAID5 parity block.
+ * Mirror #3 is RAID6 Q block.
+ */
+ stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+ if (mirror_num > 1)
+ stripe_index = nr_data_stripes(map) +
+ mirror_num - 2;
+
+ /* We distribute the parity blocks across stripes */
+ tmp = stripe_nr + stripe_index;
+ stripe_index = do_div(tmp, map->num_stripes);
}
} else {
/*
@@ -3091,43 +5066,229 @@ again:
* stripe_index is the number of our device in the stripe array
*/
stripe_index = do_div(stripe_nr, map->num_stripes);
+ mirror_num = stripe_index + 1;
}
BUG_ON(stripe_index >= map->num_stripes);
- for (i = 0; i < num_stripes; i++) {
- if (unplug_page) {
- struct btrfs_device *device;
- struct backing_dev_info *bdi;
-
- device = map->stripes[stripe_index].dev;
- if (device->bdev) {
- bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi->unplug_io_fn)
- bdi->unplug_io_fn(bdi, unplug_page);
- }
- } else {
- multi->stripes[i].physical =
+ num_alloc_stripes = num_stripes;
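+ /* leave room for any stripes added below for the replace target */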
+ if (dev_replace_is_ongoing) {
+ if (rw & (REQ_WRITE | REQ_DISCARD))
+ num_alloc_stripes <<= 1;
+ if (rw & REQ_GET_READ_MIRRORS)
+ num_alloc_stripes++;
+ }
+ bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
+ if (!bbio) {
+ kfree(raid_map);
+ ret = -ENOMEM;
+ goto out;
+ }
+ atomic_set(&bbio->error, 0);
+
+ if (rw & REQ_DISCARD) {
+ int factor = 0;
+ int sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+ u32 last_stripe = 0;
+
+ if (map->type &
+ (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_nr_end -
+ stripe_nr_orig,
+ factor,
+ &remaining_stripes);
+ div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+ last_stripe *= sub_stripes;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- multi->stripes[i].dev = map->stripes[stripe_index].dev;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ bbio->stripes[i].length = stripes_per_dev *
+ map->stripe_len;
+
+ if (i / sub_stripes < remaining_stripes)
+ bbio->stripes[i].length +=
+ map->stripe_len;
+
+ /*
+ * Special for the first stripe and
+ * the last stripe:
+ *
+ * |-------|...|-------|
+ * |----------|
+ * off end_off
+ */
+ if (i < sub_stripes)
+ bbio->stripes[i].length -=
+ stripe_offset;
+
+ if (stripe_index >= last_stripe &&
+ stripe_index <= (last_stripe +
+ sub_stripes - 1))
+ bbio->stripes[i].length -=
+ stripe_end_offset;
+
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
+ } else
+ bbio->stripes[i].length = *length;
+
+ stripe_index++;
+ if (stripe_index == map->num_stripes) {
+ /* This could only happen for RAID0/10 */
+ stripe_index = 0;
+ stripe_nr++;
+ }
+ }
+ } else {
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset +
+ stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev =
+ map->stripes[stripe_index].dev;
+ stripe_index++;
}
- stripe_index++;
}
- if (multi_ret) {
- *multi_ret = multi;
- multi->num_stripes = num_stripes;
- multi->max_errors = max_errors;
+
+ if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ max_errors = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+ max_errors = 2;
+ }
+ }
+
+ if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+ dev_replace->tgtdev != NULL) {
+ int index_where_to_add;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+
+ /*
+ * duplicate the write operations while the dev replace
+ * procedure is running. Since the copying of the old disk
+ * to the new disk takes place at run time while the
+ * filesystem is mounted writable, the regular write
+ * operations to the old disk have to be duplicated to go
+ * to the new disk as well.
+ * Note that device->missing is handled by the caller, and
+ * that the write to the old disk is already set up in the
+ * stripes array.
+ */
+ index_where_to_add = num_stripes;
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /* write to new disk, too */
+ struct btrfs_bio_stripe *new =
+ bbio->stripes + index_where_to_add;
+ struct btrfs_bio_stripe *old =
+ bbio->stripes + i;
+
+ new->physical = old->physical;
+ new->length = old->length;
+ new->dev = dev_replace->tgtdev;
+ index_where_to_add++;
+ max_errors++;
+ }
+ }
+ num_stripes = index_where_to_add;
+ } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+ dev_replace->tgtdev != NULL) {
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ /*
+ * During the dev-replace procedure, the target drive can
+ * also be used to read data in case it is needed to repair
+ * a corrupt block elsewhere. This is possible if the
+ * requested area is left of the left cursor. In this area,
+ * the target drive is a full copy of the source drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it
+ * simple, only add the mirror with the
+ * lowest physical address
+ */
+ if (found &&
+ physical_of_found <=
+ bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+ }
+ if (found) {
+ u64 length = map->stripe_len;
+
+ if (physical_of_found + length <=
+ dev_replace->cursor_left) {
+ struct btrfs_bio_stripe *tgtdev_stripe =
+ bbio->stripes + num_stripes;
+
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->length =
+ bbio->stripes[index_srcdev].length;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+
+ num_stripes++;
+ }
+ }
+ }
+
+ *bbio_ret = bbio;
+ bbio->num_stripes = num_stripes;
+ bbio->max_errors = max_errors;
+ bbio->mirror_num = mirror_num;
+
+ /*
+ * this is the case that REQ_READ && dev_replace_is_ongoing &&
+ * mirror_num == num_stripes + 1 && dev_replace target drive is
+ * available as a mirror
+ */
+ if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+ WARN_ON(num_stripes > 1);
+ bbio->stripes[0].dev = dev_replace->tgtdev;
+ bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bbio->mirror_num = map->num_stripes + 1;
+ }
+ if (raid_map) {
+ sort_parity_stripes(bbio, raid_map);
+ *raid_map_ret = raid_map;
}
out:
+ if (dev_replace_is_ongoing)
+ btrfs_dev_replace_unlock(dev_replace);
free_extent_map(em);
- return 0;
+ return ret;
}
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret, int mirror_num)
+ struct btrfs_bio **bbio_ret, int mirror_num)
{
- return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+ return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
mirror_num, NULL);
}
@@ -3142,23 +5303,42 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 bytenr;
u64 length;
u64 stripe_nr;
+ u64 rmap_len;
int i, j, nr = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_start, 1);
read_unlock(&em_tree->lock);
- BUG_ON(!em || em->start != chunk_start);
+ if (!em) {
+ printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
+ chunk_start);
+ return -EIO;
+ }
+
+ if (em->start != chunk_start) {
+ printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
+ em->start, chunk_start);
+ free_extent_map(em);
+ return -EIO;
+ }
map = (struct map_lookup *)em->bdev;
length = em->len;
+ rmap_len = map->stripe_len;
+
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
do_div(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
do_div(length, map->num_stripes);
+ else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ do_div(length, nr_data_stripes(map));
+ rmap_len = map->stripe_len * nr_data_stripes(map);
+ }
buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
- BUG_ON(!buf);
+ BUG_ON(!buf); /* -ENOMEM */
for (i = 0; i < map->num_stripes; i++) {
if (devid && map->stripes[i].dev->devid != devid)
@@ -3175,8 +5355,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
do_div(stripe_nr, map->sub_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
- }
- bytenr = chunk_start + stripe_nr * map->stripe_len;
+ } /* else if RAID[56], multiply by nr_data_stripes().
+ * Alternatively, just use rmap_len below instead of
+ * map->stripe_len */
+
+ bytenr = chunk_start + stripe_nr * rmap_len;
WARN_ON(nr >= map->num_stripes);
for (j = 0; j < nr; j++) {
if (buf[j] == bytenr)
@@ -3190,44 +5373,70 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
*logical = buf;
*naddrs = nr;
- *stripe_len = map->stripe_len;
+ *stripe_len = rmap_len;
free_extent_map(em);
return 0;
}
-int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
- u64 logical, struct page *page)
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
{
- u64 length = PAGE_CACHE_SIZE;
- return __btrfs_map_block(map_tree, READ, logical, &length,
- NULL, 0, page);
+ if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
+ bio_endio_nodec(bio, err);
+ else
+ bio_endio(bio, err);
+ kfree(bbio);
}
-static void end_bio_multi_stripe(struct bio *bio, int err)
+static void btrfs_end_bio(struct bio *bio, int err)
{
- struct btrfs_multi_bio *multi = bio->bi_private;
+ struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
- if (err)
- atomic_inc(&multi->error);
+ if (err) {
+ atomic_inc(&bbio->error);
+ if (err == -EIO || err == -EREMOTEIO) {
+ unsigned int stripe_index =
+ btrfs_io_bio(bio)->stripe_index;
+
+ BUG_ON(stripe_index >= bbio->num_stripes);
+ dev = bbio->stripes[stripe_index].dev;
+ if (dev->bdev) {
+ if (bio->bi_rw & WRITE)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ else
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_READ_ERRS);
+ if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
+ btrfs_dev_stat_print_on_error(dev);
+ }
+ }
+ }
- if (bio == multi->orig_bio)
+ if (bio == bbio->orig_bio)
is_orig_bio = 1;
- if (atomic_dec_and_test(&multi->stripes_pending)) {
+ btrfs_bio_counter_dec(bbio->fs_info);
+
+ if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
- bio = multi->orig_bio;
+ bio = bbio->orig_bio;
}
- bio->bi_private = multi->private;
- bio->bi_end_io = multi->end_io;
+
+ bio->bi_private = bbio->private;
+ bio->bi_end_io = bbio->end_io;
+ btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
/* only send an error to the higher layers if it is
- * beyond the tolerance of the multi-bio
+ * beyond the tolerance of the btrfs bio
*/
- if (atomic_read(&multi->error) > multi->max_errors) {
+ if (atomic_read(&bbio->error) > bbio->max_errors) {
err = -EIO;
- } else if (err) {
+ } else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
@@ -3235,21 +5444,13 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
set_bit(BIO_UPTODATE, &bio->bi_flags);
err = 0;
}
- kfree(multi);
- bio_endio(bio, err);
+ btrfs_end_bbio(bbio, bio, err);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
-struct async_sched {
- struct bio *bio;
- int rw;
- struct btrfs_fs_info *info;
- struct btrfs_work work;
-};
-
/*
* see run_scheduled_bios for a description of why bios are collected for
* async submit.
@@ -3257,19 +5458,24 @@ struct async_sched {
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled.
*/
-static noinline int schedule_bio(struct btrfs_root *root,
- struct btrfs_device *device,
- int rw, struct bio *bio)
+static noinline void btrfs_schedule_bio(struct btrfs_root *root,
+ struct btrfs_device *device,
+ int rw, struct bio *bio)
{
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
+ if (device->missing || !device->bdev) {
+ bio_endio(bio, -EIO);
+ return;
+ }
+
/* don't bother with additional async steps for reads, right now */
if (!(rw & REQ_WRITE)) {
bio_get(bio);
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
bio_put(bio);
- return 0;
+ return;
}
/*
@@ -3301,84 +5507,221 @@ static noinline int schedule_bio(struct btrfs_root *root,
spin_unlock(&device->io_lock);
if (should_queue)
- btrfs_queue_worker(&root->fs_info->submit_workers,
- &device->work);
+ btrfs_queue_work(root->fs_info->submit_workers,
+ &device->work);
+}
+
+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
+ sector_t sector)
+{
+ struct bio_vec *prev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ unsigned int max_sectors = queue_max_sectors(q);
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bdev,
+ .bi_sector = sector,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (WARN_ON(bio->bi_vcnt == 0))
+ return 1;
+
+ prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ if (bio_sectors(bio) > max_sectors)
+ return 0;
+
+ if (!q->merge_bvec_fn)
+ return 1;
+
+ bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
+ if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
+ return 0;
+ return 1;
+}
+
+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+ struct bio *bio, u64 physical, int dev_nr,
+ int rw, int async)
+{
+ struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+
+ bio->bi_private = bbio;
+ btrfs_io_bio(bio)->stripe_index = dev_nr;
+ bio->bi_end_io = btrfs_end_bio;
+ bio->bi_iter.bi_sector = physical >> 9;
+#ifdef DEBUG
+ {
+ struct rcu_string *name;
+
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+ "(%s id %llu), size=%u\n", rw,
+ (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+ name->str, dev->devid, bio->bi_size);
+ rcu_read_unlock();
+ }
+#endif
+ bio->bi_bdev = dev->bdev;
+
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+
+ if (async)
+ btrfs_schedule_bio(root, dev, rw, bio);
+ else
+ btrfsic_submit_bio(rw, bio);
+}
+
+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+ struct bio *first_bio, struct btrfs_device *dev,
+ int dev_nr, int rw, int async)
+{
+ struct bio_vec *bvec = first_bio->bi_io_vec;
+ struct bio *bio;
+ int nr_vecs = bio_get_nr_vecs(dev->bdev);
+ u64 physical = bbio->stripes[dev_nr].physical;
+
+again:
+ bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
+ if (!bio)
+ return -ENOMEM;
+
+ while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
+ if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+ bvec->bv_offset) < bvec->bv_len) {
+ u64 len = bio->bi_iter.bi_size;
+
+ atomic_inc(&bbio->stripes_pending);
+ submit_stripe_bio(root, bbio, bio, physical, dev_nr,
+ rw, async);
+ physical += len;
+ goto again;
+ }
+ bvec++;
+ }
+
+ submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
return 0;
}
+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+{
+ atomic_inc(&bbio->error);
+ if (atomic_dec_and_test(&bbio->stripes_pending)) {
+	/* Should be the original bio. */
+ WARN_ON(bio != bbio->orig_bio);
+
+ bio->bi_private = bbio->private;
+ bio->bi_end_io = bbio->end_io;
+ btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ bio->bi_iter.bi_sector = logical >> 9;
+
+ btrfs_end_bbio(bbio, bio, -EIO);
+ }
+}
+
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
int mirror_num, int async_submit)
{
- struct btrfs_mapping_tree *map_tree;
struct btrfs_device *dev;
struct bio *first_bio = bio;
- u64 logical = (u64)bio->bi_sector << 9;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
- struct btrfs_multi_bio *multi = NULL;
+ u64 *raid_map = NULL;
int ret;
int dev_nr = 0;
int total_devs = 1;
+ struct btrfs_bio *bbio = NULL;
- length = bio->bi_size;
- map_tree = &root->fs_info->mapping_tree;
+ length = bio->bi_iter.bi_size;
map_length = length;
- ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
- mirror_num);
- BUG_ON(ret);
+ btrfs_bio_counter_inc_blocked(root->fs_info);
+ ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+ mirror_num, &raid_map);
+ if (ret) {
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
+ }
+
+ total_devs = bbio->num_stripes;
+ bbio->orig_bio = first_bio;
+ bbio->private = first_bio->bi_private;
+ bbio->end_io = first_bio->bi_end_io;
+ bbio->fs_info = root->fs_info;
+ atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+ if (raid_map) {
+ /* In this case, map_length has been set to the length of
+ a single stripe; not the whole write */
+ if (rw & WRITE) {
+ ret = raid56_parity_write(root, bio, bbio,
+ raid_map, map_length);
+ } else {
+ ret = raid56_parity_recover(root, bio, bbio,
+ raid_map, map_length,
+ mirror_num);
+ }
+ /*
+	 * FIXME: replace doesn't support raid56 yet, please fix
+ * it in the future.
+ */
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
+ }
- total_devs = multi->num_stripes;
if (map_length < length) {
- printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
- "len %llu\n", (unsigned long long)logical,
- (unsigned long long)length,
- (unsigned long long)map_length);
+ btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
+ logical, length, map_length);
BUG();
}
- multi->end_io = first_bio->bi_end_io;
- multi->private = first_bio->bi_private;
- multi->orig_bio = first_bio;
- atomic_set(&multi->stripes_pending, multi->num_stripes);
while (dev_nr < total_devs) {
- if (total_devs > 1) {
- if (dev_nr < total_devs - 1) {
- bio = bio_clone(first_bio, GFP_NOFS);
- BUG_ON(!bio);
- } else {
- bio = first_bio;
- }
- bio->bi_private = multi;
- bio->bi_end_io = end_bio_multi_stripe;
- }
- bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
- dev = multi->stripes[dev_nr].dev;
- if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
- bio->bi_bdev = dev->bdev;
- if (async_submit)
- schedule_bio(root, dev, rw, bio);
- else
- submit_bio(rw, bio);
+ dev = bbio->stripes[dev_nr].dev;
+ if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+ bbio_error(bbio, first_bio, logical);
+ dev_nr++;
+ continue;
+ }
+
+ /*
+		 * Check and see if we're OK with this bio based on its size
+		 * and offset for the given device.
+ */
+ if (!bio_size_ok(dev->bdev, first_bio,
+ bbio->stripes[dev_nr].physical >> 9)) {
+ ret = breakup_stripe_bio(root, bbio, first_bio, dev,
+ dev_nr, rw, async_submit);
+ BUG_ON(ret);
+ dev_nr++;
+ continue;
+ }
+
+ if (dev_nr < total_devs - 1) {
+ bio = btrfs_bio_clone(first_bio, GFP_NOFS);
+ BUG_ON(!bio); /* -ENOMEM */
} else {
- bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
- bio->bi_sector = logical >> 9;
- bio_endio(bio, -EIO);
+ bio = first_bio;
+ bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
}
+
+ submit_stripe_bio(root, bbio, bio,
+ bbio->stripes[dev_nr].physical, dev_nr, rw,
+ async_submit);
dev_nr++;
}
- if (total_devs == 1)
- kfree(multi);
+ btrfs_bio_counter_dec(root->fs_info);
return 0;
}
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
- cur_devices = root->fs_info->fs_devices;
+ cur_devices = fs_info->fs_devices;
while (cur_devices) {
if (!fsid ||
!memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -3398,24 +5741,70 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
- device = kzalloc(sizeof(*device), GFP_NOFS);
- if (!device)
+ device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ if (IS_ERR(device))
return NULL;
- list_add(&device->dev_list,
- &fs_devices->devices);
- device->dev_root = root->fs_info->dev_root;
- device->devid = devid;
- device->work.func = pending_bios_fn;
+
+ list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
- device->missing = 1;
fs_devices->num_devices++;
+
+ device->missing = 1;
fs_devices->missing_devices++;
- spin_lock_init(&device->io_lock);
- INIT_LIST_HEAD(&device->dev_alloc_list);
- memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
+
return device;
}
+/**
+ * btrfs_alloc_device - allocate struct btrfs_device
+ * @fs_info: used only for generating a new devid, can be NULL if
+ * devid is provided (i.e. @devid != NULL).
+ * @devid: a pointer to devid for this device. If NULL a new devid
+ * is generated.
+ * @uuid: a pointer to UUID for this device. If NULL a new UUID
+ * is generated.
+ *
+ * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
+ * on error. Returned struct is not linked onto any lists and can be
+ * destroyed with kfree() right away.
+ */
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+ const u64 *devid,
+ const u8 *uuid)
+{
+ struct btrfs_device *dev;
+ u64 tmp;
+
+ if (WARN_ON(!devid && !fs_info))
+ return ERR_PTR(-EINVAL);
+
+ dev = __alloc_device();
+ if (IS_ERR(dev))
+ return dev;
+
+ if (devid)
+ tmp = *devid;
+ else {
+ int ret;
+
+ ret = find_next_devid(fs_info, &tmp);
+ if (ret) {
+ kfree(dev);
+ return ERR_PTR(ret);
+ }
+ }
+ dev->devid = tmp;
+
+ if (uuid)
+ memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
+ else
+ generate_random_uuid(dev->uuid);
+
+ btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
+
+ return dev;
+}
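A minimal caller sketch for the helper above, following the kernel-doc contract; the error handling and the choice of a NULL devid/uuid are illustrative:

	struct btrfs_device *dev;

	/* NULL devid/uuid: let find_next_devid() pick a devid and generate a uuid */
	dev = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	/* not linked onto any list yet, so a plain kfree() undoes the allocation */
	kfree(dev);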
+
static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
@@ -3446,7 +5835,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
free_extent_map(em);
}
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em)
return -ENOMEM;
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3456,9 +5845,11 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
return -ENOMEM;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = logical;
em->len = length;
+ em->orig_start = 0;
em->block_start = 0;
em->block_len = em->len;
@@ -3476,10 +5867,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
- NULL);
+ map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+ uuid, NULL);
if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -3487,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev =
add_missing_dev(root, devid, uuid);
if (!map->stripes[i].dev) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -3496,15 +5885,15 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
}
write_lock(&map_tree->map_tree.lock);
- ret = add_extent_mapping(&map_tree->map_tree, em);
+ ret = add_extent_mapping(&map_tree->map_tree, em, 0);
write_unlock(&map_tree->map_tree.lock);
- BUG_ON(ret);
+ BUG_ON(ret); /* Tree corruption */
free_extent_map(em);
return 0;
}
-static int fill_device_from_item(struct extent_buffer *leaf,
+static void fill_device_from_item(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item,
struct btrfs_device *device)
{
@@ -3518,11 +5907,11 @@ static int fill_device_from_item(struct extent_buffer *leaf,
device->io_align = btrfs_device_io_align(leaf, dev_item);
device->io_width = btrfs_device_io_width(leaf, dev_item);
device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+ WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
+ device->is_tgtdev_for_dev_replace = 0;
- ptr = (unsigned long)btrfs_device_uuid(dev_item);
+ ptr = btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-
- return 0;
}
static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
@@ -3530,7 +5919,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
struct btrfs_fs_devices *fs_devices;
int ret;
- mutex_lock(&uuid_mutex);
+ BUG_ON(!mutex_is_locked(&uuid_mutex));
fs_devices = root->fs_info->fs_devices->seed;
while (fs_devices) {
@@ -3555,8 +5944,10 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
ret = __btrfs_open_devices(fs_devices, FMODE_READ,
root->fs_info->bdev_holder);
- if (ret)
+ if (ret) {
+ free_fs_devices(fs_devices);
goto out;
+ }
if (!fs_devices->seeding) {
__btrfs_close_devices(fs_devices);
@@ -3568,7 +5959,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
fs_devices->seed = root->fs_info->fs_devices->seed;
root->fs_info->fs_devices->seed = fs_devices;
out:
- mutex_unlock(&uuid_mutex);
return ret;
}
@@ -3583,11 +5973,9 @@ static int read_one_dev(struct btrfs_root *root,
u8 dev_uuid[BTRFS_UUID_SIZE];
devid = btrfs_device_id(leaf, dev_item);
- read_extent_buffer(leaf, dev_uuid,
- (unsigned long)btrfs_device_uuid(dev_item),
+ read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
- read_extent_buffer(leaf, fs_uuid,
- (unsigned long)btrfs_device_fsid(dev_item),
+ read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
@@ -3596,14 +5984,13 @@ static int read_one_dev(struct btrfs_root *root,
return ret;
}
- device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
if (!device || !device->bdev) {
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
if (!device) {
- printk(KERN_WARNING "warning devid %llu missing\n",
- (unsigned long long)devid);
+ btrfs_warn(root->fs_info, "devid %llu missing", devid);
device = add_missing_dev(root, devid, dev_uuid);
if (!device)
return -ENOMEM;
@@ -3627,26 +6014,21 @@ static int read_one_dev(struct btrfs_root *root,
}
fill_device_from_item(leaf, dev_item, device);
- device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
- if (device->writeable)
+ if (device->writeable && !device->is_tgtdev_for_dev_replace) {
device->fs_devices->total_rw_bytes += device->total_bytes;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes -
+ device->bytes_used;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
ret = 0;
return ret;
}
-int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
-{
- struct btrfs_dev_item *dev_item;
-
- dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
- dev_item);
- return read_one_dev(root, buf, dev_item);
-}
-
int btrfs_read_sys_array(struct btrfs_root *root)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
@@ -3664,7 +6046,21 @@ int btrfs_read_sys_array(struct btrfs_root *root)
if (!sb)
return -ENOMEM;
btrfs_set_buffer_uptodate(sb);
- btrfs_set_buffer_lockdep_class(sb, 0);
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
+ /*
+	 * The sb extent buffer is artificial and just used to read the system array.
+	 * The btrfs_set_buffer_uptodate() call does not properly mark all its
+ * pages up-to-date when the page is larger: extent does not cover the
+ * whole page and consequently check_page_uptodate does not find all
+ * the page's extents up-to-date (the hole beyond sb),
+ * write_extent_buffer then triggers a WARN_ON.
+ *
+ * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
+ * but sb spans only this function. Add an explicit SetPageUptodate call
+	 * to silence the warning, e.g. on PowerPC 64.
+ */
+ if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
+ SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
@@ -3715,14 +6111,18 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
if (!path)
return -ENOMEM;
- /* first we search for all of the device items, and then we
- * read in all of the chunk items. This way we can create chunk
- * mappings that reference all of the devices that are afound
+ mutex_lock(&uuid_mutex);
+ lock_chunks(root);
+
+ /*
+ * Read all device items, and then all the chunk items. All
+ * device items are found before any chunk item (their object id
+ * is smaller than the lowest possible object id for a chunk
+ * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
*/
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = 0;
-again:
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto error;
@@ -3738,17 +6138,13 @@ again:
break;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
- if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
- if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
- break;
- if (found_key.type == BTRFS_DEV_ITEM_KEY) {
- struct btrfs_dev_item *dev_item;
- dev_item = btrfs_item_ptr(leaf, slot,
+ if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+ struct btrfs_dev_item *dev_item;
+ dev_item = btrfs_item_ptr(leaf, slot,
struct btrfs_dev_item);
- ret = read_one_dev(root, leaf, dev_item);
- if (ret)
- goto error;
- }
+ ret = read_one_dev(root, leaf, dev_item);
+ if (ret)
+ goto error;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
@@ -3758,13 +6154,276 @@ again:
}
path->slots[0]++;
}
- if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
- key.objectid = 0;
- btrfs_release_path(root, path);
- goto again;
- }
ret = 0;
error:
+ unlock_chunks(root);
+ mutex_unlock(&uuid_mutex);
+
btrfs_free_path(path);
return ret;
}
+
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+
+ while (fs_devices) {
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->dev_root = fs_info->dev_root;
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ fs_devices = fs_devices->seed;
+ }
+}
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct extent_buffer *eb;
+ int slot;
+ int ret = 0;
+ struct btrfs_device *device;
+ struct btrfs_path *path = NULL;
+ int i;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ int item_size;
+ struct btrfs_dev_stats_item *ptr;
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_STATS_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+ if (ret) {
+ __btrfs_reset_dev_stats(device);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ continue;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot,
+ struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_reset(device, i);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+ btrfs_free_path(path);
+ return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *dev_root,
+ struct btrfs_device *device)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_stats_item *ptr;
+ int ret;
+ int i;
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_STATS_KEY;
+ key.offset = device->devid;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ printk_in_rcu(KERN_WARNING "BTRFS: "
+ "error %d while searching for dev_stats item for device %s!\n",
+ ret, rcu_str_deref(device->name));
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /* need to delete old one and insert a new one */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ printk_in_rcu(KERN_WARNING "BTRFS: "
+ "delete too small dev_stats item for device %s failed %d!\n",
+ rcu_str_deref(device->name), ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ printk_in_rcu(KERN_WARNING "BTRFS: "
+ "insert dev_stats item for device %s failed %d!\n",
+ rcu_str_deref(device->name), ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_set_dev_stats_value(eb, ptr, i,
+ btrfs_dev_stat_read(device, i));
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ int ret = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->dev_stats_valid || !device->dev_stats_dirty)
+ continue;
+
+ ret = update_dev_stat_item(trans, dev_root, device);
+ if (!ret)
+ device->dev_stats_dirty = 0;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+ btrfs_dev_stat_inc(dev, index);
+ btrfs_dev_stat_print_on_error(dev);
+}
+
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+ if (!dev->dev_stats_valid)
+ return;
+ printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ rcu_str_deref(dev->name),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ if (btrfs_dev_stat_read(dev, i) != 0)
+ break;
+ if (i == BTRFS_DEV_STAT_VALUES_MAX)
+ return; /* all values == 0, suppress message */
+
+ printk_in_rcu(KERN_INFO "BTRFS: "
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ rcu_str_deref(dev->name),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+ struct btrfs_ioctl_get_dev_stats *stats)
+{
+ struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ int i;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (!dev) {
+ btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
+ return -ENODEV;
+ } else if (!dev->dev_stats_valid) {
+ btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
+ return -ENODEV;
+ } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (stats->nr_items > i)
+ stats->values[i] =
+ btrfs_dev_stat_read_and_reset(dev, i);
+ else
+ btrfs_dev_stat_reset(dev, i);
+ }
+ } else {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ if (stats->nr_items > i)
+ stats->values[i] = btrfs_dev_stat_read(dev, i);
+ }
+ if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+ stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+ return 0;
+}
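A sketch of an ioctl-style caller of btrfs_get_dev_stats(); only the fields the function dereferences (devid, nr_items, flags, values[]) are used, and the surrounding setup is assumed:

	struct btrfs_ioctl_get_dev_stats stats = {
		.devid    = devid,                      /* assumed: device to query */
		.nr_items = BTRFS_DEV_STAT_VALUES_MAX,  /* request all counters */
		.flags    = 0,                          /* or BTRFS_DEV_STATS_RESET to clear after reading */
	};
	int ret = btrfs_get_dev_stats(root, &stats);
	/* on success, stats.values[0 .. stats.nr_items - 1] hold the counters */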
+
+int btrfs_scratch_superblock(struct btrfs_device *device)
+{
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+
+ bh = btrfs_read_dev_super(device->bdev);
+ if (!bh)
+ return -EINVAL;
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ brelse(bh);
+
+ return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7fb59d45fe8..2aaa00c4781 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,6 +21,7 @@
#include <linux/bio.h>
#include <linux/sort.h>
+#include <linux/btrfs.h>
#include "async-thread.h"
#define BTRFS_STRIPE_LEN (64 * 1024)
@@ -42,21 +43,22 @@ struct btrfs_device {
/* WRITE_SYNC bios */
struct btrfs_pending_bios pending_sync_bios;
- int running_pending;
u64 generation;
-
+ int running_pending;
int writeable;
int in_fs_metadata;
int missing;
+ int can_discard;
+ int is_tgtdev_for_dev_replace;
spinlock_t io_lock;
+ /* the mode sent to blkdev_get */
+ fmode_t mode;
struct block_device *bdev;
- /* the mode sent to blkdev_get */
- fmode_t mode;
- char *name;
+ struct rcu_string *name;
/* the internal btrfs device id */
u64 devid;
@@ -75,17 +77,42 @@ struct btrfs_device {
/* optimal io width for this device */
u32 io_width;
+ /* type and info about this device */
+ u64 type;
/* minimal io size for this device */
u32 sector_size;
- /* type and info about this device */
- u64 type;
/* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE];
+ /* for sending down flush barriers */
+ int nobarriers;
+ struct bio *flush_bio;
+ struct completion flush_wait;
+
+ /* per-device scrub information */
+ struct scrub_ctx *scrub_device;
+
struct btrfs_work work;
+ struct rcu_head rcu;
+ struct work_struct rcu_work;
+
+ /* readahead state */
+ spinlock_t reada_lock;
+ atomic_t reada_in_flight;
+ u64 reada_next;
+ struct reada_zone *reada_curr_zone;
+ struct radix_tree_root reada_zones;
+ struct radix_tree_root reada_extents;
+
+
+ /* disk I/O failure stats. For detailed description refer to
+ * enum btrfs_dev_stat_values in ioctl.h */
+ int dev_stats_valid;
+ int dev_stats_dirty; /* counters need to be written to disk */
+ atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
};
struct btrfs_fs_devices {
@@ -99,11 +126,15 @@ struct btrfs_fs_devices {
u64 rw_devices;
u64 missing_devices;
u64 total_rw_bytes;
+ u64 num_can_discard;
+ u64 total_devices;
struct block_device *latest_bdev;
/* all of the devices in the FS, protected by a mutex
* so we can safely walk it to write out the supers without
- * worrying about add/remove by the multi-device code
+ * worrying about add/remove by the multi-device code.
+	 * Scrubbing the superblocks can kick off superblock writes while
+	 * holding this mutex lock.
*/
struct mutex device_list_mutex;
struct list_head devices;
@@ -123,19 +154,55 @@ struct btrfs_fs_devices {
int rotating;
};
+#define BTRFS_BIO_INLINE_CSUM_SIZE 64
+
+/*
+ * we need the mirror number and stripe index to be passed around
+ * the call chain while we are processing end_io (especially errors).
+ * Really, what we need is a btrfs_bio structure that has this info
+ * and is properly sized with its stripe array, but we're not there
+ * quite yet. We have our own btrfs bioset, and all of the bios
+ * we allocate are actually btrfs_io_bios. We'll cram as much of
+ * struct btrfs_bio as we can into this over time.
+ */
+typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
+struct btrfs_io_bio {
+ unsigned long mirror_num;
+ unsigned long stripe_index;
+ u8 *csum;
+ u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+ u8 *csum_allocated;
+ btrfs_io_bio_end_io_t *end_io;
+ struct bio bio;
+};
+
+static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+{
+ return container_of(bio, struct btrfs_io_bio, bio);
+}
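For illustration, an end_io-style handler could recover the per-bio bookkeeping through the container helper (hypothetical function, not part of the patch):

	static void example_end_io(struct bio *bio, int err)
	{
		struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);

		/* mirror_num and stripe_index were stashed at submit time */
		pr_debug("mirror %lu stripe %lu err %d\n",
			 io_bio->mirror_num, io_bio->stripe_index, err);
	}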
+
struct btrfs_bio_stripe {
struct btrfs_device *dev;
u64 physical;
+ u64 length; /* only used for discard mappings */
};
-struct btrfs_multi_bio {
+struct btrfs_bio;
+typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1
+
+struct btrfs_bio {
atomic_t stripes_pending;
+ struct btrfs_fs_info *fs_info;
bio_end_io_t *end_io;
struct bio *orig_bio;
+ unsigned long flags;
void *private;
atomic_t error;
int max_errors;
int num_stripes;
+ int mirror_num;
struct btrfs_bio_stripe stripes[];
};
@@ -143,36 +210,87 @@ struct btrfs_device_info {
struct btrfs_device *dev;
u64 dev_offset;
u64 max_avail;
+ u64 total_avail;
};
-/* Used to sort the devices by max_avail(descending sort) */
-int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+struct btrfs_raid_attr {
+ int sub_stripes; /* sub_stripes info for map */
+ int dev_stripes; /* stripes per dev */
+ int devs_max; /* max devs to use */
+ int devs_min; /* min devs needed */
+ int devs_increment; /* ndevs has to be a multiple of this */
+	int ncopies;		/* how many copies of the data exist */
+};
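An illustrative entry, roughly what a RAID1-style mirroring profile would look like (values assumed for the example):

	static const struct btrfs_raid_attr example_raid1_attr = {
		.sub_stripes    = 1,
		.dev_stripes    = 1,	/* one stripe per device */
		.devs_max       = 2,
		.devs_min       = 2,
		.devs_increment = 2,	/* devices are consumed in pairs */
		.ncopies        = 2,	/* every block exists twice */
	};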
+
+struct map_lookup {
+ u64 type;
+ int io_align;
+ int io_width;
+ int stripe_len;
+ int sector_size;
+ int num_stripes;
+ int sub_stripes;
+ struct btrfs_bio_stripe stripes[];
+};
+
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+ (sizeof(struct btrfs_bio_stripe) * (n)))
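The macro sizes the flexible stripes[] array at the end of the struct; a typical allocation is roughly:

	struct map_lookup *map;

	map = kzalloc(map_lookup_size(num_stripes), GFP_NOFS);	/* num_stripes assumed known */
	if (!map)
		return -ENOMEM;
	map->num_stripes = num_stripes;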
/*
- * sort the devices by max_avail, in which max free extent size of each device
- * is stored.(Descending Sort)
+ * Restriper's general type filter
*/
-static inline void btrfs_descending_sort_devices(
- struct btrfs_device_info *devices,
- size_t nr_devices)
-{
- sort(devices, nr_devices, sizeof(struct btrfs_device_info),
- btrfs_cmp_device_free_bytes, NULL);
-}
+#define BTRFS_BALANCE_DATA (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
+#define BTRFS_BALANCE_METADATA (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
+ BTRFS_BALANCE_SYSTEM | \
+ BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE (1ULL << 3)
+#define BTRFS_BALANCE_RESUME (1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
+#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
+
+/*
+ * Profile changing flags. When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+ struct btrfs_fs_info *fs_info;
+
+ struct btrfs_balance_args data;
+ struct btrfs_balance_args meta;
+ struct btrfs_balance_args sys;
+
+ u64 flags;
+
+ struct btrfs_balance_progress stat;
+};
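A hedged sketch of wiring up a balance request with the flags above; the real ioctl path also fills the per-type btrfs_balance_args, which is omitted here, and the NULL bargs is assumed acceptable:

	struct btrfs_balance_control *bctl;
	int ret;

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl)
		return -ENOMEM;

	bctl->fs_info = fs_info;
	bctl->flags   = BTRFS_BALANCE_DATA | BTRFS_BALANCE_FORCE;

	ret = btrfs_balance(bctl, NULL);	/* NULL: no progress copied back to the caller */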
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
-#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset, u64 start, u64 num_bytes);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret, int mirror_num);
+ struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
@@ -184,32 +302,97 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
int mirror_num, int async_submit);
-int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
-int btrfs_add_device(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_device *device);
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices, int step);
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+ char *device_path,
+ struct btrfs_device **device);
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+ const u64 *devid,
+ const u8 *uuid);
int btrfs_rm_device(struct btrfs_root *root, char *device_path);
-int btrfs_cleanup_fs_uuids(void);
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
-int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
- u64 logical, struct page *page);
+void btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
-void btrfs_unlock_volumes(void);
-void btrfs_lock_volumes(void);
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device **device_out);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs);
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+ struct btrfs_ioctl_get_dev_stats *stats);
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev);
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev);
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev);
+int btrfs_scratch_superblock(struct btrfs_device *device);
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+ u64 logical, u64 len, int mirror_num);
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+ struct btrfs_mapping_tree *map_tree,
+ u64 logical);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ u64 chunk_offset, u64 chunk_size);
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+ int index)
+{
+ atomic_inc(dev->dev_stat_values + index);
+ dev->dev_stats_dirty = 1;
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+ int index)
+{
+ return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+ int index)
+{
+ int ret;
+
+ ret = atomic_xchg(dev->dev_stat_values + index, 0);
+ dev->dev_stats_dirty = 1;
+ return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+ int index, unsigned long val)
+{
+ atomic_set(dev->dev_stat_values + index, val);
+ dev->dev_stats_dirty = 1;
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+ int index)
+{
+ btrfs_dev_stat_set(dev, index, 0);
+}
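Taken together, the stat helpers are used roughly like this (the error site is hypothetical):

	/* on a failed read from this device */
	btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);

	/* later, report and clear the counter in one step */
	int read_errs = btrfs_dev_stat_read_and_reset(dev, BTRFS_DEV_STAT_READ_ERRS);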
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index d779cefcfd7..ad8328d797e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -22,11 +22,13 @@
#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "xattr.h"
#include "disk-io.h"
+#include "props.h"
ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -44,7 +46,7 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
return -ENOMEM;
/* lookup the xattr by name */
- di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name,
strlen(name), 0);
if (!di) {
ret = -ENODATA;
@@ -102,48 +104,92 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- /* first lets see if we already have this xattr */
- di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
- strlen(name), -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
-
- /* ok we already have this xattr, lets remove it */
- if (di) {
- /* if we want create only exit */
- if (flags & XATTR_CREATE) {
- ret = -EEXIST;
+ if (flags & XATTR_REPLACE) {
+ di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
+ name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (!di) {
+ ret = -ENODATA;
goto out;
}
-
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- BUG_ON(ret);
- btrfs_release_path(root, path);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
- /* if we don't have a value then we are removing the xattr */
+ /*
+ * remove the attribute
+ */
if (!value)
goto out;
} else {
- btrfs_release_path(root, path);
-
- if (flags & XATTR_REPLACE) {
- /* we couldn't find the attr to replace */
- ret = -ENODATA;
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+ name, name_len, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
}
+ if (!di && !value)
+ goto out;
+ btrfs_release_path(path);
}
- /* ok we have to create a completely new xattr */
- ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino,
+again:
+ ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
name, name_len, value, size);
- BUG_ON(ret);
+ /*
+ * If we're setting an xattr to a new value but the new value is say
+ * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
+ * back from split_leaf. This is because it thinks we'll be extending
+ * the existing item size, but we're asking for enough space to add the
+ * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
+ * the rest of the function figure it out.
+ */
+ if (ret == -EOVERFLOW)
+ ret = -EEXIST;
+
+ if (ret == -EEXIST) {
+ if (flags & XATTR_CREATE)
+ goto out;
+ /*
+ * We can't use the path we already have since we won't have the
+ * proper locking for a delete, so release the path and
+ * re-lookup to delete the thing.
+ */
+ btrfs_release_path(path);
+ di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+ name, name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (!di) {
+ /* Shouldn't happen but just in case... */
+ btrfs_release_path(path);
+ goto again;
+ }
+
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto out;
+
+ /*
+ * We have a value to set, so go back and try to insert it now.
+ */
+ if (value) {
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
out:
btrfs_free_path(path);
return ret;
}
+/*
+ * @value: "" makes the attribute empty, NULL removes it
+ */
int __btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags)
@@ -158,17 +204,17 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, inode);
-
ret = do_setxattr(trans, inode, name, value, size, flags);
if (ret)
goto out;
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
out:
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
return ret;
}
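Per the @value convention documented above, setting versus removing an attribute would look like this (names are illustrative; passing a NULL trans lets the helper start its own transaction, as btrfs_removexattr() does below):

	/* set user.demo to "1" */
	ret = __btrfs_setxattr(NULL, inode, "user.demo", "1", 1, 0);

	/* value == NULL removes the attribute entirely */
	ret = __btrfs_setxattr(NULL, inode, "user.demo", NULL, 0, XATTR_REPLACE);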
@@ -180,18 +226,17 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
- int ret = 0, slot, advance;
+ int ret = 0, slot;
size_t total_size = 0, size_left = size;
unsigned long name_ptr;
size_t name_len;
- u32 nritems;
/*
* ok we want all objects associated with this id.
* NOTE: we set key.offset = 0; because we want to start with the
* first xattr that we find and walk forward
*/
- key.objectid = inode->i_ino;
+ key.objectid = btrfs_ino(inode);
btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
key.offset = 0;
@@ -204,34 +249,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto err;
- advance = 0;
+
while (1) {
leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
slot = path->slots[0];
/* this is where we start walking through the path */
- if (advance || slot >= nritems) {
+ if (slot >= btrfs_header_nritems(leaf)) {
/*
* if we've reached the last slot in this leaf we need
* to go to the next leaf and reset everything
*/
- if (slot >= nritems-1) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- } else {
- /*
- * just walking through the slots on this leaf
- */
- slot++;
- path->slots[0]++;
- }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto err;
+ else if (ret > 0)
+ break;
+ continue;
}
- advance = 1;
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -242,13 +277,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
break;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ if (verify_dir_item(root, leaf, di))
+ goto next;
name_len = btrfs_dir_name_len(leaf, di);
total_size += name_len + 1;
/* we are just looking for how big our buffer needs to be */
if (!size)
- continue;
+ goto next;
if (!buffer || (name_len + 1) > size_left) {
ret = -ERANGE;
@@ -261,6 +298,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
size_left -= name_len + 1;
buffer += name_len + 1;
+next:
+ path->slots[0]++;
}
ret = total_size;
@@ -276,8 +315,8 @@ err:
*/
const struct xattr_handler *btrfs_xattr_handlers[] = {
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- &btrfs_xattr_acl_access_handler,
- &btrfs_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
NULL,
};
@@ -294,7 +333,8 @@ static bool btrfs_is_valid_xattr(const char *name)
XATTR_SECURITY_PREFIX_LEN) ||
!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
- !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
+ !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN);
}
ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
@@ -336,6 +376,10 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!btrfs_is_valid_xattr(name))
return -EOPNOTSUPP;
+ if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+ return btrfs_set_prop(dentry->d_inode, name,
+ value, size, flags);
+
if (size == 0)
value = ""; /* empty EA, do not remove */
@@ -365,40 +409,44 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
if (!btrfs_is_valid_xattr(name))
return -EOPNOTSUPP;
+ if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+ return btrfs_set_prop(dentry->d_inode, name,
+ NULL, 0, XATTR_REPLACE);
+
return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
XATTR_REPLACE);
}
-int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+static int btrfs_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array, void *fs_info)
{
- int err;
- size_t len;
- void *value;
- char *suffix;
+ const struct xattr *xattr;
+ struct btrfs_trans_handle *trans = fs_info;
char *name;
+ int err = 0;
- err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
- &len);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
- }
-
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
- GFP_NOFS);
- if (!name) {
- err = -ENOMEM;
- } else {
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
- err = __btrfs_setxattr(trans, inode, name, value, len, 0);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ err = __btrfs_setxattr(trans, inode, name,
+ xattr->value, xattr->value_len, 0);
kfree(name);
+ if (err < 0)
+ break;
}
-
- kfree(suffix);
- kfree(value);
return err;
}
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &btrfs_initxattrs, trans);
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index b3cc8039134..5049608d138 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,8 +21,6 @@
#include <linux/xattr.h>
-extern const struct xattr_handler btrfs_xattr_acl_access_handler;
-extern const struct xattr_handler btrfs_xattr_acl_default_handler;
extern const struct xattr_handler *btrfs_xattr_handlers[];
extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index faccd47c6c4..b67d8fc8127 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,8 +97,8 @@ static int zlib_compress_pages(struct list_head *ws,
*total_in = 0;
if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
- printk(KERN_WARNING "deflateInit failed\n");
- ret = -1;
+ printk(KERN_WARNING "BTRFS: deflateInit failed\n");
+ ret = -EIO;
goto out;
}
@@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws,
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
- ret = -1;
+ ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
@@ -125,10 +125,10 @@ static int zlib_compress_pages(struct list_head *ws,
while (workspace->def_strm.total_in < len) {
ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
if (ret != Z_OK) {
- printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+ printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
zlib_deflateEnd(&workspace->def_strm);
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
if (workspace->def_strm.total_in > 8192 &&
workspace->def_strm.total_in <
workspace->def_strm.total_out) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
/* we need another page for writing out. Test this
@@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws,
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
- ret = -1;
+ ret = -E2BIG;
goto out;
}
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
- ret = -1;
+ ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
@@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws,
zlib_deflateEnd(&workspace->def_strm);
if (ret != Z_STREAM_END) {
- ret = -1;
+ ret = -EIO;
goto out;
}
if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -252,8 +252,8 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
}
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
- printk(KERN_WARNING "inflateInit failed\n");
- return -1;
+ printk(KERN_WARNING "BTRFS: inflateInit failed\n");
+ return -EIO;
}
while (workspace->inf_strm.total_in < srclen) {
ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
@@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
}
}
if (ret != Z_STREAM_END)
- ret = -1;
+ ret = -EIO;
else
ret = 0;
done:
@@ -336,8 +336,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
}
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
- printk(KERN_WARNING "inflateInit failed\n");
- return -1;
+ printk(KERN_WARNING "BTRFS: inflateInit failed\n");
+ return -EIO;
}
while (bytes_left > 0) {
@@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
total_out = workspace->inf_strm.total_out;
if (total_out == buf_start) {
- ret = -1;
+ ret = -EIO;
break;
}
@@ -370,9 +370,9 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
PAGE_CACHE_SIZE - buf_offset);
bytes = min(bytes, bytes_left);
- kaddr = kmap_atomic(dest_page, KM_USER0);
+ kaddr = kmap_atomic(dest_page);
memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
pg_offset += bytes;
bytes_left -= bytes;
@@ -382,7 +382,7 @@ next:
}
if (ret != Z_STREAM_END && bytes_left != 0)
- ret = -1;
+ ret = -EIO;
else
ret = 0;
diff --git a/fs/buffer.c b/fs/buffer.c
index da666f3148f..eba6e4f621c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -29,7 +29,7 @@
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
@@ -41,36 +41,35 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
-inline void
-init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
bh->b_end_io = handler;
bh->b_private = private;
}
EXPORT_SYMBOL(init_buffer);
-static int sync_buffer(void *word)
+inline void touch_buffer(struct buffer_head *bh)
{
- struct block_device *bd;
- struct buffer_head *bh
- = container_of(word, struct buffer_head, b_state);
+ trace_block_touch_buffer(bh);
+ mark_page_accessed(bh->b_page);
+}
+EXPORT_SYMBOL(touch_buffer);
- smp_mb();
- bd = bh->b_bdev;
- if (bd)
- blk_run_address_space(bd->bd_inode->i_mapping);
+static int sleep_on_buffer(void *word)
+{
io_schedule();
return 0;
}
void __lock_buffer(struct buffer_head *bh)
{
- wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
+ wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);
@@ -78,19 +77,53 @@ EXPORT_SYMBOL(__lock_buffer);
void unlock_buffer(struct buffer_head *bh)
{
clear_bit_unlock(BH_Lock, &bh->b_state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);
/*
+ * Returns whether the page has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the PageDirty information is stale. If
+ * any of the buffers are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+ bool *dirty, bool *writeback)
+{
+ struct buffer_head *head, *bh;
+ *dirty = false;
+ *writeback = false;
+
+ BUG_ON(!PageLocked(page));
+
+ if (!page_has_buffers(page))
+ return;
+
+ if (PageWriteback(page))
+ *writeback = true;
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ if (buffer_locked(bh))
+ *writeback = true;
+
+ if (buffer_dirty(bh))
+ *dirty = true;
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
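A hedged sketch of a caller; the helper expects the page lock to be held (it BUG_ONs otherwise):

	bool dirty, writeback;

	lock_page(page);	/* the helper BUG_ONs if the page is not locked */
	buffer_check_dirty_writeback(page, &dirty, &writeback);
	unlock_page(page);
	/* dirty/writeback now report whether any buffer is dirty or under I/O */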
+
+/*
* Block until a buffer comes unlocked. This doesn't stop it
* from becoming locked again - you have to lock it yourself
* if you want to preserve its state.
*/
void __wait_on_buffer(struct buffer_head * bh)
{
- wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
+ wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);
@@ -194,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
int all_mapped = 1;
index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
- page = find_get_page(bd_mapping, index);
+ page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
if (!page)
goto out;
@@ -220,13 +253,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
* elsewhere, don't buffer_error if we had some unmapped buffers
*/
if (all_mapped) {
+ char b[BDEVNAME_SIZE];
+
printk("__find_get_block_slow() failed. "
"block=%llu, b_blocknr=%llu\n",
(unsigned long long)block,
(unsigned long long)bh->b_blocknr);
printk("b_state=0x%08lx, b_size=%zu\n",
bh->b_state, bh->b_size);
- printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
+ printk("device %s blocksize: %d\n", bdevname(bdev, b),
+ 1 << bd_inode->i_blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->private_lock);
@@ -235,51 +271,6 @@ out:
return ret;
}
-/* If invalidate_buffers() will trash dirty buffers, it means some kind
- of fs corruption is going on. Trashing dirty data always imply losing
- information that was supposed to be just stored on the physical layer
- by the user.
-
- Thus invalidate_buffers in general usage is not allwowed to trash
- dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
- be preserved. These buffers are simply skipped.
-
- We also skip buffers which are still in use. For example this can
- happen if a userspace program is reading the block device.
-
- NOTE: In the case where the user removed a removable-media-disk even if
- there's still dirty data not synced on disk (due a bug in the device driver
- or due an error of the user), by not destroying the dirty buffers we could
- generate corruption also on the next media inserted, thus a parameter is
- necessary to handle this case in the most safe way possible (trying
- to not corrupt also the new disk inserted with the data belonging to
- the old now corrupted disk). Also for the ramdisk the natural thing
- to do in order to release the ramdisk memory is to destroy dirty buffers.
-
- These are two special cases. Normal usage imply the device driver
- to issue a sync on the device (without waiting I/O completion) and
- then an invalidate_buffers call that doesn't trash dirty buffers.
-
- For handling cache coherency with the blkdev pagecache the 'update' case
- is been introduced. It is needed to re-read from disk any pinned
- buffer. NOTE: re-reading from disk is destructive so we can do it only
- when we assume nobody is changing the buffercache under our I/O and when
- we think the disk contains more recent information than the buffercache.
- The update == 1 pass marks the buffers we need to update, the update == 2
- pass does the actual I/O. */
-void invalidate_bdev(struct block_device *bdev)
-{
- struct address_space *mapping = bdev->bd_inode->i_mapping;
-
- if (mapping->nrpages == 0)
- return;
-
- invalidate_bh_lrus();
- lru_add_drain_all(); /* make sure all lru add caches are flushed */
- invalidate_mapping_pages(mapping, 0, -1);
-}
-EXPORT_SYMBOL(invalidate_bdev);
-
/*
* Kick the writeback threads then try to free up some ZONE_NORMAL memory.
*/
@@ -288,7 +279,7 @@ static void free_more_memory(void)
struct zone *zone;
int nid;
- wakeup_flusher_threads(1024);
+ wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
yield();
for_each_online_node(nid) {
@@ -605,7 +596,7 @@ void emergency_thaw_all(void)
*/
int sync_mapping_buffers(struct address_space *mapping)
{
- struct address_space *buffer_mapping = mapping->assoc_mapping;
+ struct address_space *buffer_mapping = mapping->private_data;
if (buffer_mapping == NULL || list_empty(&mapping->private_list))
return 0;
@@ -638,10 +629,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
struct address_space *buffer_mapping = bh->b_page->mapping;
mark_buffer_dirty(bh);
- if (!mapping->assoc_mapping) {
- mapping->assoc_mapping = buffer_mapping;
+ if (!mapping->private_data) {
+ mapping->private_data = buffer_mapping;
} else {
- BUG_ON(mapping->assoc_mapping != buffer_mapping);
+ BUG_ON(mapping->private_data != buffer_mapping);
}
if (!bh->b_assoc_map) {
spin_lock(&buffer_mapping->private_lock);
@@ -663,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
static void __set_page_dirty(struct page *page,
struct address_space *mapping, int warn)
{
- spin_lock_irq(&mapping->tree_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
@@ -749,10 +742,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
struct buffer_head *bh;
struct list_head tmp;
- struct address_space *mapping, *prev_mapping = NULL;
+ struct address_space *mapping;
int err = 0, err2;
+ struct blk_plug plug;
INIT_LIST_HEAD(&tmp);
+ blk_start_plug(&plug);
spin_lock(lock);
while (!list_empty(list)) {
@@ -775,7 +770,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* still in flight on potentially older
* contents.
*/
- write_dirty_buffer(bh, WRITE_SYNC_PLUG);
+ write_dirty_buffer(bh, WRITE_SYNC);
/*
* Kick off IO for the previous mapping. Note
@@ -783,16 +778,16 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* wait_on_buffer() will do that for us
* through sync_buffer().
*/
- if (prev_mapping && prev_mapping != mapping)
- blk_run_address_space(prev_mapping);
- prev_mapping = mapping;
-
brelse(bh);
spin_lock(lock);
}
}
}
+ spin_unlock(lock);
+ blk_finish_plug(&plug);
+ spin_lock(lock);
+
while (!list_empty(&tmp)) {
bh = BH_ENTRY(tmp.prev);
get_bh(bh);
@@ -836,7 +831,7 @@ void invalidate_inode_buffers(struct inode *inode)
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->assoc_mapping;
+ struct address_space *buffer_mapping = mapping->private_data;
spin_lock(&buffer_mapping->private_lock);
while (!list_empty(list))
@@ -859,7 +854,7 @@ int remove_inode_buffers(struct inode *inode)
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->assoc_mapping;
+ struct address_space *buffer_mapping = mapping->private_data;
spin_lock(&buffer_mapping->private_lock);
while (!list_empty(list)) {
@@ -898,19 +893,14 @@ try_again:
if (!bh)
goto no_grow;
- bh->b_bdev = NULL;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
- bh->b_state = 0;
- atomic_set(&bh->b_count, 0);
bh->b_size = size;
/* Link the buffer to its page */
set_bh_page(bh, page, offset);
-
- init_buffer(bh, NULL, NULL);
}
return head;
/*
@@ -959,16 +949,29 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
attach_page_buffers(page, head);
}
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+ sector_t retval = ~((sector_t)0);
+ loff_t sz = i_size_read(bdev->bd_inode);
+
+ if (sz) {
+ unsigned int sizebits = blksize_bits(size);
+ retval = (sz >> sizebits);
+ }
+ return retval;
+}
+
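blkdev_max_block() just converts the device's byte size into a count of whole blocks; any block number at or beyond that count lies past the end of the device, and init_page_buffers() below will refuse to mark it mapped. A self-contained illustration of the arithmetic (the 10000-byte device and 4096-byte block size are made-up values, and size_bits() is an equivalent of blksize_bits() for power-of-two sizes):

#include <stdio.h>
#include <stdint.h>

/* Equivalent of blksize_bits() for power-of-two block sizes. */
static unsigned int size_bits(unsigned int size)
{
	unsigned int bits = 0;

	while ((1u << bits) < size)
		bits++;
	return bits;
}

int main(void)
{
	uint64_t dev_bytes = 10000;          /* hypothetical device size  */
	unsigned int block_size = 4096;      /* hypothetical block size   */
	uint64_t end_block = dev_bytes >> size_bits(block_size);

	/* Blocks 0 and 1 fit entirely inside the device; block 2 starts at
	 * byte 8192 but would run past byte 10000, so end_block is 2 and
	 * block 2 stays unmapped. */
	printf("end_block = %llu\n", (unsigned long long)end_block);   /* 2 */
	return 0;
}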
/*
* Initialise the state of a blockdev page's buffers.
*/
-static void
+static sector_t
init_page_buffers(struct page *page, struct block_device *bdev,
sector_t block, int size)
{
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
int uptodate = PageUptodate(page);
+ sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
do {
if (!buffer_mapped(bh)) {
@@ -977,38 +980,57 @@ init_page_buffers(struct page *page, struct block_device *bdev,
bh->b_blocknr = block;
if (uptodate)
set_buffer_uptodate(bh);
- set_buffer_mapped(bh);
+ if (block < end_block)
+ set_buffer_mapped(bh);
}
block++;
bh = bh->b_this_page;
} while (bh != head);
+
+ /*
+ * Caller needs to validate requested block against end of device.
+ */
+ return end_block;
}
/*
* Create the page-cache page that contains the requested block.
*
- * This is user purely for blockdev mappings.
+ * This is used purely for blockdev mappings.
*/
-static struct page *
+static int
grow_dev_page(struct block_device *bdev, sector_t block,
- pgoff_t index, int size)
+ pgoff_t index, int size, int sizebits)
{
struct inode *inode = bdev->bd_inode;
struct page *page;
struct buffer_head *bh;
+ sector_t end_block;
+ int ret = 0; /* Will call free_more_memory() */
+ gfp_t gfp_mask;
+
+ gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
+ gfp_mask |= __GFP_MOVABLE;
+ /*
+ * XXX: __getblk_slow() can not really deal with failure and
+ * will endlessly loop on improvised global reclaim. Prefer
+ * looping in the allocator rather than here, at least that
+ * code knows what it's doing.
+ */
+ gfp_mask |= __GFP_NOFAIL;
- page = find_or_create_page(inode->i_mapping, index,
- (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
+ page = find_or_create_page(inode->i_mapping, index, gfp_mask);
if (!page)
- return NULL;
+ return ret;
BUG_ON(!PageLocked(page));
if (page_has_buffers(page)) {
bh = page_buffers(page);
if (bh->b_size == size) {
- init_page_buffers(page, bdev, block, size);
- return page;
+ end_block = init_page_buffers(page, bdev,
+ index << sizebits, size);
+ goto done;
}
if (!try_to_free_buffers(page))
goto failed;
@@ -1028,15 +1050,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
*/
spin_lock(&inode->i_mapping->private_lock);
link_dev_buffers(page, bh);
- init_page_buffers(page, bdev, block, size);
+ end_block = init_page_buffers(page, bdev, index << sizebits, size);
spin_unlock(&inode->i_mapping->private_lock);
- return page;
-
+done:
+ ret = (block < end_block) ? 1 : -ENXIO;
failed:
- BUG();
unlock_page(page);
page_cache_release(page);
- return NULL;
+ return ret;
}
/*
@@ -1046,7 +1067,6 @@ failed:
static int
grow_buffers(struct block_device *bdev, sector_t block, int size)
{
- struct page *page;
pgoff_t index;
int sizebits;
@@ -1070,14 +1090,9 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
bdevname(bdev, b));
return -EIO;
}
- block = index << sizebits;
+
/* Create a page with the proper size buffers.. */
- page = grow_dev_page(bdev, block, index, size);
- if (!page)
- return 0;
- unlock_page(page);
- page_cache_release(page);
- return 1;
+ return grow_dev_page(bdev, block, index, size, sizebits);
}
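With this change grow_dev_page() receives sizebits and recomputes the first block on the page as index << sizebits, instead of trusting a pre-shifted block argument. A self-contained sketch of that index arithmetic, assuming 4 KiB pages and a hypothetical 1 KiB block size (all values are illustrative):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12                /* assume 4 KiB pages for this sketch */

int main(void)
{
	unsigned int blkbits  = 10;                   /* hypothetical 1 KiB blocks */
	unsigned int sizebits = PAGE_SHIFT - blkbits; /* 2 -> 4 blocks per page    */
	uint64_t block = 13;                          /* block being grown         */
	uint64_t index = block >> sizebits;           /* page-cache index: 3       */
	uint64_t first = index << sizebits;           /* first block on page: 12   */

	printf("index=%llu first_block=%llu\n",
	       (unsigned long long)index, (unsigned long long)first);
	return 0;
}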
static struct buffer_head *
@@ -1096,7 +1111,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
}
for (;;) {
- struct buffer_head * bh;
+ struct buffer_head *bh;
int ret;
bh = __find_get_block(bdev, block, size);
@@ -1150,6 +1165,8 @@ void mark_buffer_dirty(struct buffer_head *bh)
{
WARN_ON_ONCE(!buffer_uptodate(bh));
+ trace_block_dirty_buffer(bh);
+
/*
* Very *carefully* optimize the it-is-already-dirty case.
*
@@ -1297,7 +1314,7 @@ static void bh_lru_install(struct buffer_head *bh)
}
while (out < BH_LRU_SIZE)
bhs[out++] = NULL;
- memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
+ memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
}
bh_lru_unlock();
@@ -1349,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
+ /* __find_get_block_slow will mark the page accessed */
bh = __find_get_block_slow(bdev, block);
if (bh)
bh_lru_install(bh);
- }
- if (bh)
+ } else
touch_buffer(bh);
+
return bh;
}
EXPORT_SYMBOL(__find_get_block);
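The lookup order is unchanged: the small per-CPU LRU of recently used buffer heads is consulted first, and only on a miss does the code fall back to the page-cache walk in __find_get_block_slow(), which is also where the page is now marked accessed. A rough user-space analogue of that check-the-small-cache-then-fall-back shape (the array cache, slow_lookup() and lookup() are invented purely for illustration):

#include <stdio.h>
#include <string.h>

#define SMALL_CACHE_SIZE 8

struct entry { int key; int value; };

static struct entry small_cache[SMALL_CACHE_SIZE];   /* analogue of the per-CPU bh LRU */
static int nr_cached;

static int slow_lookup(int key)          /* analogue of __find_get_block_slow() */
{
	return key * key;                /* pretend this is the expensive path */
}

static int lookup(int key)
{
	struct entry e;
	int i;

	/* Fast path: a hit in the small cache avoids the expensive walk. */
	for (i = 0; i < nr_cached; i++)
		if (small_cache[i].key == key)
			return small_cache[i].value;

	/* Miss: take the slow path, then install the result at the front,
	 * mirroring bh_lru_install() after __find_get_block_slow(). */
	e.key = key;
	e.value = slow_lookup(key);
	memmove(&small_cache[1], &small_cache[0],
		(SMALL_CACHE_SIZE - 1) * sizeof(small_cache[0]));
	small_cache[0] = e;
	if (nr_cached < SMALL_CACHE_SIZE)
		nr_cached++;
	return e.value;
}

int main(void)
{
	printf("%d\n", lookup(7));   /* miss: slow path, result cached */
	printf("%d\n", lookup(7));   /* hit: served from the small cache */
	return 0;
}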
@@ -1364,10 +1382,6 @@ EXPORT_SYMBOL(__find_get_block);
* which corresponds to the passed block_device, block and size. The
* returned buffer has its reference count incremented.
*
- * __getblk() cannot fail - it just keeps trying. If you pass it an
- * illegal block number, __getblk() will happily return a buffer_head
- * which represents the non-existent block. Very weird.
- *
* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
* attempt is failing. FIXME, perhaps?
*/
@@ -1432,10 +1446,23 @@ static void invalidate_bh_lru(void *arg)
}
put_cpu_var(bh_lrus);
}
+
+static bool has_bh_in_lru(int cpu, void *dummy)
+{
+ struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
+ int i;
+ for (i = 0; i < BH_LRU_SIZE; i++) {
+ if (b->bhs[i])
+ return 1;
+ }
+
+ return 0;
+}
+
void invalidate_bh_lrus(void)
{
- on_each_cpu(invalidate_bh_lru, NULL, 1);
+ on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
@@ -1457,27 +1484,39 @@ EXPORT_SYMBOL(set_bh_page);
/*
* Called when truncating a buffer on a page completely.
*/
+
+/* Bits that are cleared during an invalidate */
+#define BUFFER_FLAGS_DISCARD \
+ (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
+ 1 << BH_Delay | 1 << BH_Unwritten)
+
static void discard_buffer(struct buffer_head * bh)
{
+ unsigned long b_state, b_state_old;
+
lock_buffer(bh);
clear_buffer_dirty(bh);
bh->b_bdev = NULL;
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- clear_buffer_delay(bh);
- clear_buffer_unwritten(bh);
+ b_state = bh->b_state;
+ for (;;) {
+ b_state_old = cmpxchg(&bh->b_state, b_state,
+ (b_state & ~BUFFER_FLAGS_DISCARD));
+ if (b_state_old == b_state)
+ break;
+ b_state = b_state_old;
+ }
unlock_buffer(bh);
}
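The rewritten discard_buffer() clears all BUFFER_FLAGS_DISCARD bits in one atomic step with a compare-and-swap retry loop, rather than five separate clear_buffer_*() calls. A self-contained user-space sketch of that loop using the GCC/Clang __atomic builtins in place of the kernel's cmpxchg() (the flag values are made up):

#include <stdio.h>

#define FLAG_MAPPED    (1ul << 0)
#define FLAG_NEW       (1ul << 1)
#define FLAG_DIRTY     (1ul << 2)
#define FLAGS_DISCARD  (FLAG_MAPPED | FLAG_NEW)   /* bits to drop atomically */

int main(void)
{
	unsigned long state = FLAG_MAPPED | FLAG_NEW | FLAG_DIRTY;
	unsigned long old = state;   /* snapshot, like b_state in discard_buffer() */

	/* Retry until the word is swapped without anyone racing us: a failed
	 * compare-exchange refreshes "old" with the value actually observed,
	 * and the next attempt clears the discard bits from that fresh value. */
	while (!__atomic_compare_exchange_n(&state, &old, old & ~FLAGS_DISCARD,
					    0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
		;   /* loop and retry with the updated "old" */

	printf("state=%#lx\n", state);   /* FLAG_DIRTY only: 0x4 */
	return 0;
}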
/**
- * block_invalidatepage - invalidate part of all of a buffer-backed page
+ * block_invalidatepage - invalidate part or all of a buffer-backed page
*
* @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
* block_invalidatepage() is called when all or part of the page has become
- * invalidatedby a truncate operation.
+ * invalidated by a truncate operation.
*
* block_invalidatepage() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
@@ -1485,15 +1524,22 @@ static void discard_buffer(struct buffer_head * bh)
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
+ unsigned int stop = length + offset;
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
goto out;
+ /*
+ * Check for overflow
+ */
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
head = page_buffers(page);
bh = head;
do {
@@ -1501,6 +1547,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
next = bh->b_this_page;
/*
+	 * Are we still fully in range?
+ */
+ if (next_off > stop)
+ goto out;
+
+ /*
* is this block fully invalidated?
*/
if (offset <= curr_off)
@@ -1521,6 +1573,7 @@ out:
}
EXPORT_SYMBOL(block_invalidatepage);
+
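With the new @length argument, only buffers lying entirely inside [offset, offset + length) are discarded, and the walk stops as soon as a buffer's end would cross the stop point. A self-contained sketch of that bookkeeping, assuming a 4096-byte page with four 1024-byte buffers and a made-up invalidation range:

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 1024;          /* four buffers per 4096-byte page */
	unsigned int offset = 1024, length = 2048;
	unsigned int stop = offset + length;    /* 3072 */
	unsigned int curr_off = 0;
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int next_off = curr_off + blocksize;

		if (next_off > stop)            /* past the range: stop the walk */
			break;
		if (offset <= curr_off)         /* fully inside: would be discarded */
			printf("buffer %d [%u..%u) discarded\n", i, curr_off, next_off);
		curr_off = next_off;
	}
	/* Prints buffers 1 and 2: [1024..2048) and [2048..3072). Buffer 0 sits
	 * before the range and buffer 3 lies beyond stop=3072, so both are kept. */
	return 0;
}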
/*
* We attach and possibly dirty the buffers atomically wrt
* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -1589,6 +1642,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
EXPORT_SYMBOL(unmap_underlying_metadata);
/*
+ * Size is a power-of-two in the range 512..PAGE_SIZE,
+ * and the case we care about most is PAGE_SIZE.
+ *
+ * So this *could* possibly be written with those
+ * constraints in mind (relevant mostly if some
+ * architecture has a slow bit-scan instruction)
+ */
+static inline int block_size_bits(unsigned int blocksize)
+{
+ return ilog2(blocksize);
+}
+
+static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+{
+ BUG_ON(!PageLocked(page));
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+ return page_buffers(page);
+}
+
+/*
* NOTE! All mapped/uptodate combinations are valid:
*
* Mapped Uptodate Meaning
@@ -1614,14 +1689,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
* prevents this contention from occurring.
*
* If block_write_full_page() is called with wbc->sync_mode ==
- * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
- * causes the writes to be flagged as synchronous writes, but the
- * block device queue will NOT be unplugged, since usually many pages
- * will be pushed to the out before the higher-level caller actually
- * waits for the writes to be completed. The various wait functions,
- * such as wait_on_writeback_range() will ultimately call sync_page()
- * which will ultimately call blk_run_backing_dev(), which will end up
- * unplugging the device queue.
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * causes the writes to be flagged as synchronous writes.
*/
static int __block_write_full_page(struct inode *inode, struct page *page,
get_block_t *get_block, struct writeback_control *wbc,
@@ -1631,19 +1700,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
sector_t block;
sector_t last_block;
struct buffer_head *bh, *head;
- const unsigned blocksize = 1 << inode->i_blkbits;
+ unsigned int blocksize, bbits;
int nr_underway = 0;
int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC_PLUG : WRITE);
-
- BUG_ON(!PageLocked(page));
+ WRITE_SYNC : WRITE);
- last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize,
+ head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
- }
/*
* Be very careful. We have no exclusion from __set_page_dirty_buffers
@@ -1655,9 +1718,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
* handle that here by just cleaning them.
*/
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- head = page_buffers(page);
bh = head;
+ blocksize = bh->b_size;
+ bbits = block_size_bits(blocksize);
+
+ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ last_block = (i_size_read(inode) - 1) >> bbits;
/*
* Get all the dirty buffers mapped to disk addresses and
@@ -1848,12 +1914,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
BUG_ON(to > PAGE_CACHE_SIZE);
BUG_ON(from > to);
- blocksize = 1 << inode->i_blkbits;
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
- head = page_buffers(page);
+ head = create_page_buffers(page, inode, 0);
+ blocksize = head->b_size;
+ bbits = block_size_bits(blocksize);
- bbits = inode->i_blkbits;
block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1909,10 +1973,8 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
- if (unlikely(err)) {
+ if (unlikely(err))
page_zero_new_buffers(page, from, to);
- ClearPageUptodate(page);
- }
return err;
}
EXPORT_SYMBOL(__block_write_begin);
@@ -1925,11 +1987,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
unsigned blocksize;
struct buffer_head *bh, *head;
- blocksize = 1 << inode->i_blkbits;
+ bh = head = page_buffers(page);
+ blocksize = bh->b_size;
- for(bh = head = page_buffers(page), block_start = 0;
- bh != head || !block_start;
- block_start=block_end, bh = bh->b_this_page) {
+ block_start = 0;
+ do {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
@@ -1939,7 +2001,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
mark_buffer_dirty(bh);
}
clear_buffer_new(bh);
- }
+
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
/*
* If this is a partial write which happened to make all buffers
@@ -2061,10 +2126,9 @@ EXPORT_SYMBOL(generic_write_end);
* Returns true if all buffers which correspond to a file portion
* we want to read are uptodate.
*/
-int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
- unsigned long from)
+int block_is_partially_uptodate(struct page *page, unsigned long from,
+ unsigned long count)
{
- struct inode *inode = page->mapping->host;
unsigned block_start, block_end, blocksize;
unsigned to;
struct buffer_head *bh, *head;
@@ -2073,13 +2137,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
if (!page_has_buffers(page))
return 0;
- blocksize = 1 << inode->i_blkbits;
- to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+ head = page_buffers(page);
+ blocksize = head->b_size;
+ to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
to = from + to;
if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
return 0;
- head = page_buffers(page);
bh = head;
block_start = 0;
do {
@@ -2112,18 +2176,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
struct inode *inode = page->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
- unsigned int blocksize;
+ unsigned int blocksize, bbits;
int nr, i;
int fully_mapped = 1;
- BUG_ON(!PageLocked(page));
- blocksize = 1 << inode->i_blkbits;
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
- head = page_buffers(page);
+ head = create_page_buffers(page, inode, 0);
+ blocksize = head->b_size;
+ bbits = block_size_bits(blocksize);
- iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+ iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ lblock = (i_size_read(inode)+blocksize-1) >> bbits;
bh = head;
nr = 0;
i = 0;
@@ -2343,24 +2405,26 @@ EXPORT_SYMBOL(block_commit_write);
* page lock we can determine safely if the page is beyond EOF. If it is not
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
+ *
+ * Direct callers of this function should protect against filesystem freezing
+ * using sb_start_write() - sb_end_write() functions.
*/
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
{
struct page *page = vmf->page;
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
unsigned long end;
loff_t size;
- int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+ int ret;
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
(page_offset(page) > size)) {
- /* page got truncated out from underneath us */
- unlock_page(page);
- goto out;
+ /* We overload EFAULT to mean page got truncated */
+ ret = -EFAULT;
+ goto out_unlock;
}
/* page is wholly or partially inside EOF */
@@ -2373,18 +2437,35 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
if (!ret)
ret = block_commit_write(page, 0, end);
- if (unlikely(ret)) {
- unlock_page(page);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else /* -ENOSPC, -EIO, etc */
- ret = VM_FAULT_SIGBUS;
- } else
- ret = VM_FAULT_LOCKED;
-
-out:
+ if (unlikely(ret < 0))
+ goto out_unlock;
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+ return 0;
+out_unlock:
+ unlock_page(page);
return ret;
}
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
+{
+ int ret;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ sb_start_pagefault(sb);
+
+ /*
+ * Update file times before taking page lock. We may end up failing the
+ * fault so this update may be superfluous but who really cares...
+ */
+ file_update_time(vma->vm_file);
+
+ ret = __block_page_mkwrite(vma, vmf, get_block);
+ sb_end_pagefault(sb);
+ return block_page_mkwrite_return(ret);
+}
EXPORT_SYMBOL(block_page_mkwrite);
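After the split, a filesystem that needs no extra work around the fault can point its .page_mkwrite at block_page_mkwrite() and get the freeze protection and timestamp update for free, while direct callers of __block_page_mkwrite() must arrange freeze protection themselves as the comment above notes. A hedged sketch of the simple case; "myfs_get_block" and the ops table are placeholders, not part of this patch:

/* Hedged sketch: a buffer-backed filesystem wiring its mmap write-fault
 * path straight to block_page_mkwrite().  The wrapper already handles
 * sb_start_pagefault()/sb_end_pagefault() and file_update_time(). */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return block_page_mkwrite(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= myfs_page_mkwrite,
};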
/*
@@ -2810,10 +2891,9 @@ EXPORT_SYMBOL(block_truncate_page);
/*
* The generic ->writepage function for buffer-backed address_spaces
- * this form passes in the end_io handler used to finish the IO.
*/
-int block_write_full_page_endio(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc, bh_end_io_t *handler)
+int block_write_full_page(struct page *page, get_block_t *get_block,
+ struct writeback_control *wbc)
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
@@ -2823,7 +2903,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
/* Is the page fully inside i_size? */
if (page->index < end_index)
return __block_write_full_page(inode, page, get_block, wbc,
- handler);
+ end_buffer_async_write);
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2833,7 +2913,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
* they may have been added in ext3_writepage(). Make them
* freeable here, so the page does not leak.
*/
- do_invalidatepage(page, 0);
+ do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
@@ -2846,18 +2926,8 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
* writes to that region are not written out to the file."
*/
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
- return __block_write_full_page(inode, page, get_block, wbc, handler);
-}
-EXPORT_SYMBOL(block_write_full_page_endio);
-
-/*
- * The generic ->writepage function for buffer-backed address_spaces
- */
-int block_write_full_page(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc)
-{
- return block_write_full_page_endio(page, get_block, wbc,
- end_buffer_async_write);
+ return __block_write_full_page(inode, page, get_block, wbc,
+ end_buffer_async_write);
}
EXPORT_SYMBOL(block_write_full_page);
@@ -2889,7 +2959,57 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
bio_put(bio);
}
-int submit_bh(int rw, struct buffer_head * bh)
+/*
+ * This allows us to do IO even on the odd last sectors
+ * of a device, even if the bh block size is some multiple
+ * of the physical sector size.
+ *
+ * We'll just truncate the bio to the size of the device,
+ * and clear the end of the buffer head manually.
+ *
+ * Truly out-of-range accesses will turn into actual IO
+ * errors, this only handles the "we need to be able to
+ * do IO at the final sector" case.
+ */
+static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+{
+ sector_t maxsector;
+ unsigned bytes;
+
+ maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+ if (!maxsector)
+ return;
+
+ /*
+ * If the *whole* IO is past the end of the device,
+ * let it through, and the IO layer will turn it into
+ * an EIO.
+ */
+ if (unlikely(bio->bi_iter.bi_sector >= maxsector))
+ return;
+
+ maxsector -= bio->bi_iter.bi_sector;
+ bytes = bio->bi_iter.bi_size;
+ if (likely((bytes >> 9) <= maxsector))
+ return;
+
+ /* Uhhuh. We've got a bh that straddles the device size! */
+ bytes = maxsector << 9;
+
+ /* Truncate the bio.. */
+ bio->bi_iter.bi_size = bytes;
+ bio->bi_io_vec[0].bv_len = bytes;
+
+ /* ..and clear the end of the buffer for reads */
+ if ((rw & RW_MASK) == READ) {
+ void *kaddr = kmap_atomic(bh->b_page);
+ memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(bh->b_page);
+ }
+}
+
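guard_bh_eod() only matters when the buffer's first sector is inside the device but its last byte is not; in that case the bio is clipped to the sectors that actually exist and, for reads, the tail of the buffer is zeroed by hand. A self-contained illustration of the clipping arithmetic, using a made-up 10240-byte device and a 4096-byte buffer starting at sector 16:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dev_bytes  = 10240;            /* hypothetical 20-sector device   */
	uint64_t bi_sector  = 16;               /* bio starts inside the device    */
	unsigned int bytes  = 4096;             /* buffer/bio size: 8 sectors      */
	uint64_t maxsector  = dev_bytes >> 9;   /* 20 sectors                      */

	if (bi_sector < maxsector) {
		maxsector -= bi_sector;                  /* 4 sectors remain        */
		if ((bytes >> 9) > maxsector)
			bytes = maxsector << 9;          /* clip the bio to 2048 B  */
	}
	printf("clipped bio size = %u bytes\n", bytes);  /* 2048; the remaining    */
	return 0;                                        /* 2048 would be zeroed
							  * for a read */
}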
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
{
struct bio *bio;
int ret = 0;
@@ -2912,18 +3032,26 @@ int submit_bh(int rw, struct buffer_head * bh)
*/
bio = bio_alloc(GFP_NOIO, 1);
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_io_vec[0].bv_page = bh->b_page;
bio->bi_io_vec[0].bv_len = bh->b_size;
bio->bi_io_vec[0].bv_offset = bh_offset(bh);
bio->bi_vcnt = 1;
- bio->bi_idx = 0;
- bio->bi_size = bh->b_size;
+ bio->bi_iter.bi_size = bh->b_size;
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
+ bio->bi_flags |= bio_flags;
+
+ /* Take care of bh's that straddle the end of the device */
+ guard_bh_eod(rw, bio, bh);
+
+ if (buffer_meta(bh))
+ rw |= REQ_META;
+ if (buffer_prio(bh))
+ rw |= REQ_PRIO;
bio_get(bio);
submit_bio(rw, bio);
@@ -2934,6 +3062,12 @@ int submit_bh(int rw, struct buffer_head * bh)
bio_put(bio);
return ret;
}
+EXPORT_SYMBOL_GPL(_submit_bh);
+
+int submit_bh(int rw, struct buffer_head *bh)
+{
+ return _submit_bh(rw, bh, 0);
+}
EXPORT_SYMBOL(submit_bh);
/**
@@ -2955,7 +3089,7 @@ EXPORT_SYMBOL(submit_bh);
* until the buffer gets unlocked).
*
* ll_rw_block sets b_end_io to simple completion handler that marks
- * the buffer up-to-date (if approriate), unlocks the buffer and wakes
+ * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
* any waiters.
*
* All of the buffers must be for the same device, and must also be a
@@ -3138,17 +3272,6 @@ out:
}
EXPORT_SYMBOL(try_to_free_buffers);
-void block_sync_page(struct page *page)
-{
- struct address_space *mapping;
-
- smp_mb();
- mapping = page_mapping(page);
- if (mapping)
- blk_run_backing_dev(mapping->backing_dev_info, page);
-}
-EXPORT_SYMBOL(block_sync_page);
-
/*
* There are no bdflush tunables left. But distributions are
* still running obsolete flush daemons, so we terminate them here.
@@ -3179,13 +3302,13 @@ SYSCALL_DEFINE2(bdflush, int, func, long, data)
/*
* Buffer-head allocation
*/
-static struct kmem_cache *bh_cachep;
+static struct kmem_cache *bh_cachep __read_mostly;
/*
* Once the number of bh's in the machine exceeds this level, we start
* stripping them in writeback.
*/
-static int max_buffer_heads;
+static unsigned long max_buffer_heads;
int buffer_heads_over_limit;
@@ -3301,7 +3424,7 @@ EXPORT_SYMBOL(bh_submit_read);
void __init buffer_init(void)
{
- int nrpages;
+ unsigned long nrpages;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index a2603e7c0bb..d749731dc0e 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
cache->brun_percent < 100);
if (*args) {
- kerror("'bind' command doesn't take an argument");
+ pr_err("'bind' command doesn't take an argument");
return -EINVAL;
}
if (!cache->rootdirname) {
- kerror("No cache directory specified");
+ pr_err("No cache directory specified");
return -EINVAL;
}
/* don't permit already bound caches to be re-bound */
if (test_bit(CACHEFILES_READY, &cache->flags)) {
- kerror("Cache already bound");
+ pr_err("Cache already bound");
return -EBUSY;
}
@@ -124,13 +124,10 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
/* check parameters */
ret = -EOPNOTSUPP;
if (!root->d_inode ||
- !root->d_inode->i_op ||
!root->d_inode->i_op->lookup ||
!root->d_inode->i_op->mkdir ||
!root->d_inode->i_op->setxattr ||
!root->d_inode->i_op->getxattr ||
- !root->d_sb ||
- !root->d_sb->s_op ||
!root->d_sb->s_op->statfs ||
!root->d_sb->s_op->sync_fs)
goto error_unsupported;
@@ -231,9 +228,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
set_bit(CACHEFILES_READY, &cache->flags);
dput(root);
- printk(KERN_INFO "CacheFiles:"
- " File cache on %s registered\n",
- cache->cache.identifier);
+ pr_info("File cache on %s registered\n", cache->cache.identifier);
/* check how much space the cache has */
cachefiles_has_space(cache, 0, 0);
@@ -253,7 +248,7 @@ error_open_root:
kmem_cache_free(cachefiles_object_jar, fsdef);
error_root_object:
cachefiles_end_secure(cache, saved_cred);
- kerror("Failed to register: %d", ret);
+ pr_err("Failed to register: %d", ret);
return ret;
}
@@ -265,9 +260,8 @@ void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
_enter("");
if (test_bit(CACHEFILES_READY, &cache->flags)) {
- printk(KERN_INFO "CacheFiles:"
- " File cache on %s unregistering\n",
- cache->cache.identifier);
+ pr_info("File cache on %s unregistering\n",
+ cache->cache.identifier);
fscache_withdraw_cache(&cache->cache);
}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 0a1467b1551..b078d3081d6 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -315,8 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file,
static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
char *args)
{
- kerror("Free space limits must be in range"
- " 0%%<=stop<cull<run<100%%");
+ pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%");
return -EINVAL;
}
@@ -476,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- kerror("Empty directory specified");
+ pr_err("Empty directory specified");
return -EINVAL;
}
if (cache->rootdirname) {
- kerror("Second cache directory specified");
+ pr_err("Second cache directory specified");
return -EEXIST;
}
@@ -504,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- kerror("Empty security context specified");
+ pr_err("Empty security context specified");
return -EINVAL;
}
if (cache->secctx) {
- kerror("Second security context specified");
+ pr_err("Second security context specified");
return -EINVAL;
}
@@ -532,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- kerror("Empty tag specified");
+ pr_err("Empty tag specified");
return -EINVAL;
}
@@ -563,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
goto inval;
if (!test_bit(CACHEFILES_READY, &cache->flags)) {
- kerror("cull applied to unready cache");
+ pr_err("cull applied to unready cache");
return -EIO;
}
if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
- kerror("cull applied to dead cache");
+ pr_err("cull applied to dead cache");
return -EIO;
}
@@ -588,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
notdir:
path_put(&path);
- kerror("cull command requires dirfd to be a directory");
+ pr_err("cull command requires dirfd to be a directory");
return -ENOTDIR;
inval:
- kerror("cull command requires dirfd and filename");
+ pr_err("cull command requires dirfd and filename");
return -EINVAL;
}
@@ -615,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
return 0;
inval:
- kerror("debug command requires mask");
+ pr_err("debug command requires mask");
return -EINVAL;
}
@@ -635,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
goto inval;
if (!test_bit(CACHEFILES_READY, &cache->flags)) {
- kerror("inuse applied to unready cache");
+ pr_err("inuse applied to unready cache");
return -EIO;
}
if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
- kerror("inuse applied to dead cache");
+ pr_err("inuse applied to dead cache");
return -EIO;
}
@@ -660,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
notdir:
path_put(&path);
- kerror("inuse command requires dirfd to be a directory");
+ pr_err("inuse command requires dirfd to be a directory");
return -ENOTDIR;
inval:
- kerror("inuse command requires dirfd and filename");
+ pr_err("inuse command requires dirfd and filename");
return -EINVAL;
}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 37fe101a4e0..584743d456c 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -11,11 +11,8 @@
#include <linux/slab.h>
#include <linux/mount.h>
-#include <linux/buffer_head.h>
#include "internal.h"
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
struct cachefiles_lookup_data {
struct cachefiles_xattr *auxdata; /* auxiliary data */
char *key; /* key path */
@@ -42,12 +39,12 @@ static struct fscache_object *cachefiles_alloc_object(
_enter("{%s},%p,", cache->cache.identifier, cookie);
- lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
+ lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);
if (!lookup_data)
goto nomem_lookup_data;
/* create a new object record and a temporary leaf image */
- object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+ object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp);
if (!object)
goto nomem_object;
@@ -64,7 +61,7 @@ static struct fscache_object *cachefiles_alloc_object(
* - stick the length on the front and leave space on the back for the
* encoder
*/
- buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
+ buffer = kmalloc((2 + 512) + 3, cachefiles_gfp);
if (!buffer)
goto nomem_buffer;
@@ -149,8 +146,7 @@ static int cachefiles_lookup_object(struct fscache_object *_object)
if (ret < 0 && ret != -ETIMEDOUT) {
if (ret != -ENOBUFS)
- printk(KERN_WARNING
- "CacheFiles: Lookup failed error %d\n", ret);
+ pr_warn("Lookup failed error %d\n", ret);
fscache_object_lookup_error(&object->fscache);
}
@@ -197,7 +193,7 @@ struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
}
/*
- * update the auxilliary data for an object object on disk
+ * update the auxiliary data for an object object on disk
*/
static void cachefiles_update_object(struct fscache_object *_object)
{
@@ -213,20 +209,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
object = container_of(_object, struct cachefiles_object, fscache);
cache = container_of(object->fscache.cache, struct cachefiles_cache,
cache);
+
+ if (!fscache_use_cookie(_object)) {
+ _leave(" [relinq]");
+ return;
+ }
+
cookie = object->fscache.cookie;
if (!cookie->def->get_aux) {
+ fscache_unuse_cookie(_object);
_leave(" [no aux]");
return;
}
- auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
+ auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
if (!auxdata) {
+ fscache_unuse_cookie(_object);
_leave(" [nomem]");
return;
}
auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+ fscache_unuse_cookie(_object);
ASSERTCMP(auxlen, <, 511);
auxdata->len = auxlen + 1;
@@ -264,7 +269,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
#endif
/* delete retired objects */
- if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+ if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
_object != cache->cache.fsdef
) {
_debug("- retire object OBJ%x", object->fscache.debug_id);
@@ -372,6 +377,31 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
}
/*
+ * check whether the backing cache is consistent with FS-Cache
+ * - called by FS-Cache when it evaluates whether the cache needs to be invalidated
+ */
+static bool cachefiles_check_consistency(struct fscache_operation *op)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+ int ret;
+
+ _enter("{OBJ%x}", op->object->debug_id);
+
+ object = container_of(op->object, struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = cachefiles_check_auxdata(object);
+ cachefiles_end_secure(cache, saved_cred);
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
* notification the attributes on an object have changed
* - called with reads/writes excluded by FS-Cache
*/
@@ -418,14 +448,14 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
_debug("discard tail %llx", oi_size);
newattrs.ia_valid = ATTR_SIZE;
newattrs.ia_size = oi_size & PAGE_MASK;
- ret = notify_change(object->backer, &newattrs);
+ ret = notify_change(object->backer, &newattrs, NULL);
if (ret < 0)
goto truncate_failed;
}
newattrs.ia_valid = ATTR_SIZE;
newattrs.ia_size = ni_size;
- ret = notify_change(object->backer, &newattrs);
+ ret = notify_change(object->backer, &newattrs, NULL);
truncate_failed:
mutex_unlock(&object->backer->d_inode->i_mutex);
@@ -442,6 +472,54 @@ truncate_failed:
}
/*
+ * Invalidate an object
+ */
+static void cachefiles_invalidate_object(struct fscache_operation *op)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+ struct path path;
+ uint64_t ni_size;
+ int ret;
+
+ object = container_of(op->object, struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ op->object->cookie->def->get_attr(op->object->cookie->netfs_data,
+ &ni_size);
+
+ _enter("{OBJ%x},[%llu]",
+ op->object->debug_id, (unsigned long long)ni_size);
+
+ if (object->backer) {
+ ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+ fscache_set_store_limit(&object->fscache, ni_size);
+
+ path.dentry = object->backer;
+ path.mnt = cache->mnt;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = vfs_truncate(&path, 0);
+ if (ret == 0)
+ ret = vfs_truncate(&path, ni_size);
+ cachefiles_end_secure(cache, saved_cred);
+
+ if (ret != 0) {
+ fscache_set_store_limit(&object->fscache, 0);
+ if (ret == -EIO)
+ cachefiles_io_error_obj(object,
+ "Invalidate failed");
+ }
+ }
+
+ fscache_op_complete(op, true);
+ _leave("");
+}
+
+/*
* dissociate a cache from all the pages it was backing
*/
static void cachefiles_dissociate_pages(struct fscache_cache *cache)
@@ -456,6 +534,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
.lookup_complete = cachefiles_lookup_complete,
.grab_object = cachefiles_grab_object,
.update_object = cachefiles_update_object,
+ .invalidate_object = cachefiles_invalidate_object,
.drop_object = cachefiles_drop_object,
.put_object = cachefiles_put_object,
.sync_cache = cachefiles_sync_cache,
@@ -467,4 +546,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
.write_page = cachefiles_write_page,
.uncache_page = cachefiles_uncache_page,
.dissociate_pages = cachefiles_dissociate_pages,
+ .check_consistency = cachefiles_check_consistency,
};
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index bd6bc1bde2d..3d50998abf5 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -9,6 +9,13 @@
* 2 of the Licence, or (at your option) any later version.
*/
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "CacheFiles: " fmt
+
+
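Defining pr_fmt() before the pr_*() calls makes every message in these files pick up the "CacheFiles: " prefix automatically, which is what lets the patch drop the hand-rolled kerror() macro and the repeated "CacheFiles:" strings at each call site. A small user-space imitation of the mechanism, with fprintf standing in for printk and an unconditional trailing newline added for the demo:

#include <stdio.h>

#define pr_fmt(fmt) "CacheFiles: " fmt
#define pr_err(fmt, ...) fprintf(stderr, pr_fmt(fmt) "\n", ##__VA_ARGS__)

int main(void)
{
	/* Prints "CacheFiles: Failed to register: -22" without the call site
	 * ever spelling out the prefix. */
	pr_err("Failed to register: %d", -22);
	return 0;
}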
#include <linux/fscache-cache.h>
#include <linux/timer.h>
#include <linux/wait.h>
@@ -23,6 +30,8 @@ extern unsigned cachefiles_debug;
#define CACHEFILES_DEBUG_KLEAVE 2
#define CACHEFILES_DEBUG_KDEBUG 4
+#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+
/*
* node records
*/
@@ -233,6 +242,7 @@ extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
struct cachefiles_xattr *auxdata);
extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
struct cachefiles_xattr *auxdata);
+extern int cachefiles_check_auxdata(struct cachefiles_object *object);
extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
struct cachefiles_xattr *auxdata);
extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
@@ -242,11 +252,10 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
/*
* error handling
*/
-#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
#define cachefiles_io_error(___cache, FMT, ...) \
do { \
- kerror("I/O Error: " FMT, ##__VA_ARGS__); \
+ pr_err("I/O Error: " FMT, ##__VA_ARGS__); \
fscache_io_error(&(___cache)->cache); \
set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
} while (0)
@@ -307,8 +316,8 @@ do { \
#define ASSERT(X) \
do { \
if (unlikely(!(X))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
@@ -316,9 +325,9 @@ do { \
#define ASSERTCMP(X, OP, Y) \
do { \
if (unlikely(!((X) OP (Y)))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
- printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
(unsigned long)(X), (unsigned long)(Y)); \
BUG(); \
} \
@@ -327,8 +336,8 @@ do { \
#define ASSERTIF(C, X) \
do { \
if (unlikely((C) && !(X))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
@@ -336,9 +345,9 @@ do { \
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
if (unlikely((C) && !((X) OP (Y)))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
- printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
(unsigned long)(X), (unsigned long)(Y)); \
BUG(); \
} \
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index 81b8b2b3a67..33b58c60f2d 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -78,7 +78,7 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
_debug("max: %d", max);
- key = kmalloc(max, GFP_KERNEL);
+ key = kmalloc(max, cachefiles_gfp);
if (!key)
return NULL;
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 4bfa8cf43bf..180edfb45f6 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -68,8 +68,7 @@ static int __init cachefiles_init(void)
SLAB_HWCACHE_ALIGN,
cachefiles_object_init_once);
if (!cachefiles_object_jar) {
- printk(KERN_NOTICE
- "CacheFiles: Failed to allocate an object jar\n");
+ pr_notice("Failed to allocate an object jar\n");
goto error_object_jar;
}
@@ -77,7 +76,7 @@ static int __init cachefiles_init(void)
if (ret < 0)
goto error_proc;
- printk(KERN_INFO "CacheFiles: Loaded\n");
+ pr_info("Loaded\n");
return 0;
error_proc:
@@ -85,7 +84,7 @@ error_proc:
error_object_jar:
misc_deregister(&cachefiles_dev);
error_dev:
- kerror("failed to register: %d", ret);
+ pr_err("failed to register: %d", ret);
return ret;
}
@@ -96,7 +95,7 @@ fs_initcall(cachefiles_init);
*/
static void __exit cachefiles_exit(void)
{
- printk(KERN_INFO "CacheFiles: Unloading\n");
+ pr_info("Unloading\n");
cachefiles_proc_cleanup();
kmem_cache_destroy(cachefiles_object_jar);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index a0358c2189c..5bf2b41e66d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -35,44 +35,42 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
struct fscache_cookie *cookie;
unsigned keylen, loop;
- printk(KERN_ERR "%sobject: OBJ%x\n",
- prefix, object->fscache.debug_id);
- printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
- prefix, fscache_object_states[object->fscache.state],
+ pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id);
+ pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
+ prefix, object->fscache.state->name,
object->fscache.flags, work_busy(&object->fscache.work),
- object->fscache.events,
- object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
- printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
+ object->fscache.events, object->fscache.event_mask);
+ pr_err("%sops=%u inp=%u exc=%u\n",
prefix, object->fscache.n_ops, object->fscache.n_in_progress,
object->fscache.n_exclusive);
- printk(KERN_ERR "%sparent=%p\n",
+ pr_err("%sparent=%p\n",
prefix, object->fscache.parent);
spin_lock(&object->fscache.lock);
cookie = object->fscache.cookie;
if (cookie) {
- printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n",
+ pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n",
prefix,
object->fscache.cookie,
object->fscache.cookie->parent,
object->fscache.cookie->netfs_data,
object->fscache.cookie->flags);
- if (keybuf)
+ if (keybuf && cookie->def)
keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
CACHEFILES_KEYBUF_SIZE);
else
keylen = 0;
} else {
- printk(KERN_ERR "%scookie=NULL\n", prefix);
+ pr_err("%scookie=NULL\n", prefix);
keylen = 0;
}
spin_unlock(&object->fscache.lock);
if (keylen) {
- printk(KERN_ERR "%skey=[%u] '", prefix, keylen);
+ pr_err("%skey=[%u] '", prefix, keylen);
for (loop = 0; loop < keylen; loop++)
- printk("%02x", keybuf[loop]);
- printk("'\n");
+ pr_cont("%02x", keybuf[loop]);
+ pr_cont("'\n");
}
}
@@ -128,17 +126,15 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
found_dentry:
kdebug("preemptive burial: OBJ%x [%s] %p",
object->fscache.debug_id,
- fscache_object_states[object->fscache.state],
+ object->fscache.state->name,
dentry);
- if (object->fscache.state < FSCACHE_OBJECT_DYING) {
- printk(KERN_ERR "\n");
- printk(KERN_ERR "CacheFiles: Error:"
- " Can't preemptively bury live object\n");
+ if (fscache_object_is_live(&object->fscache)) {
+ pr_err("\n");
+ pr_err("Error: Can't preemptively bury live object\n");
cachefiles_printk_object(object, NULL);
} else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
- printk(KERN_ERR "CacheFiles: Error:"
- " Object already preemptively buried\n");
+ pr_err("Error: Object already preemptively buried\n");
}
write_unlock(&cache->active_lock);
@@ -161,7 +157,7 @@ try_again:
write_lock(&cache->active_lock);
if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
- printk(KERN_ERR "CacheFiles: Error: Object already active\n");
+ pr_err("Error: Object already active\n");
cachefiles_printk_object(object, NULL);
BUG();
}
@@ -193,10 +189,9 @@ try_again:
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
wait_for_old_object:
- if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
- printk(KERN_ERR "\n");
- printk(KERN_ERR "CacheFiles: Error:"
- " Unexpected object collision\n");
+ if (fscache_object_is_live(&object->fscache)) {
+ pr_err("\n");
+ pr_err("Error: Unexpected object collision\n");
cachefiles_printk_object(object, xobject);
BUG();
}
@@ -242,9 +237,8 @@ wait_for_old_object:
}
if (timeout <= 0) {
- printk(KERN_ERR "\n");
- printk(KERN_ERR "CacheFiles: Error: Overlong"
- " wait for old active object to go away\n");
+ pr_err("\n");
+ pr_err("Error: Overlong wait for old active object to go away\n");
cachefiles_printk_object(object, xobject);
goto requeue;
}
@@ -295,7 +289,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
if (ret < 0) {
cachefiles_io_error(cache, "Unlink security error");
} else {
- ret = vfs_unlink(dir->d_inode, rep);
+ ret = vfs_unlink(dir->d_inode, rep, NULL);
if (preemptive)
cachefiles_mark_object_buried(cache, rep);
@@ -392,12 +386,12 @@ try_again:
path.dentry = dir;
path_to_graveyard.mnt = cache->mnt;
path_to_graveyard.dentry = cache->graveyard;
- ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
+ ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
if (ret < 0) {
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
ret = vfs_rename(dir->d_inode, rep,
- cache->graveyard->d_inode, grave);
+ cache->graveyard->d_inode, grave, NULL, 0);
if (ret != 0 && ret != -ENOMEM)
cachefiles_io_error(cache,
"Rename failed with error %d", ret);
@@ -549,7 +543,7 @@ lookup_again:
next, next->d_inode, next->d_inode->i_ino);
} else if (!S_ISDIR(next->d_inode->i_mode)) {
- kerror("inode %lu is not a directory",
+ pr_err("inode %lu is not a directory",
next->d_inode->i_ino);
ret = -ENOBUFS;
goto error;
@@ -567,7 +561,7 @@ lookup_again:
if (ret < 0)
goto create_error;
start = jiffies;
- ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
+ ret = vfs_create(dir->d_inode, next, S_IFREG, true);
cachefiles_hist(cachefiles_create_histogram, start);
if (ret < 0)
goto create_error;
@@ -580,7 +574,7 @@ lookup_again:
} else if (!S_ISDIR(next->d_inode->i_mode) &&
!S_ISREG(next->d_inode->i_mode)
) {
- kerror("inode %lu is not a file or directory",
+ pr_err("inode %lu is not a file or directory",
next->d_inode->i_ino);
ret = -ENOBUFS;
goto error;
@@ -646,7 +640,8 @@ lookup_again:
* (this is used to keep track of culling, and atimes are only
* updated by read, write and readdir but not lookup or
* open) */
- touch_atime(cache->mnt, next);
+ path.dentry = next;
+ touch_atime(&path);
}
/* open a file interface onto a data file */
@@ -773,14 +768,13 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
ASSERT(subdir->d_inode);
if (!S_ISDIR(subdir->d_inode->i_mode)) {
- kerror("%s is not a directory", dirname);
+ pr_err("%s is not a directory", dirname);
ret = -EIO;
goto check_error;
}
ret = -EPERM;
- if (!subdir->d_inode->i_op ||
- !subdir->d_inode->i_op->setxattr ||
+ if (!subdir->d_inode->i_op->setxattr ||
!subdir->d_inode->i_op->getxattr ||
!subdir->d_inode->i_op->lookup ||
!subdir->d_inode->i_op->mkdir ||
@@ -801,13 +795,13 @@ check_error:
mkdir_error:
mutex_unlock(&dir->d_inode->i_mutex);
dput(subdir);
- kerror("mkdir %s failed with error %d", dirname, ret);
+ pr_err("mkdir %s failed with error %d", dirname, ret);
return ERR_PTR(ret);
lookup_error:
mutex_unlock(&dir->d_inode->i_mutex);
ret = PTR_ERR(subdir);
- kerror("Lookup %s failed with error %d", dirname, ret);
+ pr_err("Lookup %s failed with error %d", dirname, ret);
return ERR_PTR(ret);
nomem_d_alloc:
@@ -836,7 +830,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
// dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
/* look up the victim */
- mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
start = jiffies;
victim = lookup_one_len(filename, dir, strlen(filename));
@@ -897,7 +891,7 @@ lookup_error:
if (ret == -EIO) {
cachefiles_io_error(cache, "Lookup failed");
} else if (ret != -ENOMEM) {
- kerror("Internal error: %d", ret);
+ pr_err("Internal error: %d", ret);
ret = -EIO;
}
@@ -956,7 +950,7 @@ error:
}
if (ret != -ENOMEM) {
- kerror("Internal error: %d", ret);
+ pr_err("Internal error: %d", ret);
ret = -EIO;
}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 0e3c0924cc3..4b1fb5ca65b 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/file.h>
+#include <linux/swap.h>
#include "internal.h"
/*
@@ -77,25 +78,25 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
struct page *backpage = monitor->back_page, *backpage2;
int ret;
- kenter("{ino=%lx},{%lx,%lx}",
+ _enter("{ino=%lx},{%lx,%lx}",
object->backer->d_inode->i_ino,
backpage->index, backpage->flags);
/* skip if the page was truncated away completely */
if (backpage->mapping != bmapping) {
- kleave(" = -ENODATA [mapping]");
+ _leave(" = -ENODATA [mapping]");
return -ENODATA;
}
backpage2 = find_get_page(bmapping, backpage->index);
if (!backpage2) {
- kleave(" = -ENODATA [gone]");
+ _leave(" = -ENODATA [gone]");
return -ENODATA;
}
if (backpage != backpage2) {
put_page(backpage2);
- kleave(" = -ENODATA [different]");
+ _leave(" = -ENODATA [different]");
return -ENODATA;
}
@@ -114,7 +115,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
if (PageUptodate(backpage))
goto unlock_discard;
- kdebug("reissue read");
+ _debug("reissue read");
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
goto unlock_discard;
@@ -129,7 +130,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
}
/* it'll reappear on the todo list */
- kleave(" = -EINPROGRESS");
+ _leave(" = -EINPROGRESS");
return -EINPROGRESS;
unlock_discard:
@@ -137,7 +138,7 @@ unlock_discard:
spin_lock_irq(&object->work_lock);
list_del(&monitor->op_link);
spin_unlock_irq(&object->work_lock);
- kleave(" = %d", ret);
+ _leave(" = %d", ret);
return ret;
}
@@ -174,11 +175,13 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
_debug("- copy {%lu}", monitor->back_page->index);
recheck:
- if (PageUptodate(monitor->back_page)) {
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING,
+ &object->fscache.cookie->flags)) {
+ error = -ESTALE;
+ } else if (PageUptodate(monitor->back_page)) {
copy_highpage(monitor->netfs_page, monitor->back_page);
-
- pagevec_add(&pagevec, monitor->netfs_page);
- fscache_mark_pages_cached(monitor->op, &pagevec);
+ fscache_mark_page_cached(monitor->op,
+ monitor->netfs_page);
error = 0;
} else if (!PageError(monitor->back_page)) {
/* the page has probably been truncated */
@@ -198,6 +201,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
fscache_end_io(op, monitor->netfs_page, error);
page_cache_release(monitor->netfs_page);
+ fscache_retrieval_complete(op, 1);
fscache_put_retrieval(op);
kfree(monitor);
@@ -224,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
*/
static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
struct fscache_retrieval *op,
- struct page *netpage,
- struct pagevec *pagevec)
+ struct page *netpage)
{
struct cachefiles_one_read *monitor;
struct address_space *bmapping;
@@ -234,12 +237,10 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
_enter("");
- pagevec_reinit(pagevec);
-
_debug("read back %p{%lu,%d}",
netpage, netpage->index, page_count(netpage));
- monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+ monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
if (!monitor)
goto nomem;
@@ -258,31 +259,28 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
goto backing_page_already_present;
if (!newpage) {
- newpage = page_cache_alloc_cold(bmapping);
+ newpage = __page_cache_alloc(cachefiles_gfp |
+ __GFP_COLD);
if (!newpage)
goto nomem_monitor;
}
- ret = add_to_page_cache(newpage, bmapping,
- netpage->index, GFP_KERNEL);
+ ret = add_to_page_cache_lru(newpage, bmapping,
+ netpage->index, cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
goto nomem_page;
}
- /* we've installed a new backing page, so now we need to add it
- * to the LRU list and start it reading */
+ /* we've installed a new backing page, so now we need to start
+ * it reading */
installed_new_backing_page:
_debug("- new %p", newpage);
backpage = newpage;
newpage = NULL;
- page_cache_get(backpage);
- pagevec_add(pagevec, backpage);
- __pagevec_lru_add_file(pagevec);
-
read_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
@@ -335,11 +333,11 @@ backing_page_already_present:
backing_page_already_uptodate:
_debug("- uptodate");
- pagevec_add(pagevec, netpage);
- fscache_mark_pages_cached(op, pagevec);
+ fscache_mark_page_cached(op, netpage);
copy_highpage(netpage, backpage);
fscache_end_io(op, netpage, 0);
+ fscache_retrieval_complete(op, 1);
success:
_debug("success");
@@ -357,10 +355,13 @@ out:
read_error:
_debug("read error %d", ret);
- if (ret == -ENOMEM)
+ if (ret == -ENOMEM) {
+ fscache_retrieval_complete(op, 1);
goto out;
+ }
io_error:
cachefiles_io_error_obj(object, "Page read error on backing file");
+ fscache_retrieval_complete(op, 1);
ret = -ENOBUFS;
goto out;
@@ -370,6 +371,7 @@ nomem_monitor:
fscache_put_retrieval(monitor->op);
kfree(monitor);
nomem:
+ fscache_retrieval_complete(op, 1);
_leave(" = -ENOMEM");
return -ENOMEM;
}
@@ -408,7 +410,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
_enter("{%p},{%lx},,,", object, page->index);
if (!object->backer)
- return -ENOBUFS;
+ goto enobufs;
inode = object->backer->d_inode;
ASSERT(S_ISREG(inode->i_mode));
@@ -417,7 +419,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
/* calculate the shift required to use bmap */
if (inode->i_sb->s_blocksize > PAGE_SIZE)
- return -ENOBUFS;
+ goto enobufs;
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -444,19 +446,23 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
if (block) {
/* submit the apparently valid page to the backing fs to be
* read from disk */
- ret = cachefiles_read_backing_file_one(object, op, page,
- &pagevec);
+ ret = cachefiles_read_backing_file_one(object, op, page);
} else if (cachefiles_has_space(cache, 0, 1) == 0) {
/* there's space in the cache we can use */
- pagevec_add(&pagevec, page);
- fscache_mark_pages_cached(op, &pagevec);
+ fscache_mark_page_cached(op, page);
+ fscache_retrieval_complete(op, 1);
ret = -ENODATA;
} else {
- ret = -ENOBUFS;
+ goto enobufs;
}
_leave(" = %d", ret);
return ret;
+
+enobufs:
+ fscache_retrieval_complete(op, 1);
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
}
/*
@@ -465,19 +471,15 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
*/
static int cachefiles_read_backing_file(struct cachefiles_object *object,
struct fscache_retrieval *op,
- struct list_head *list,
- struct pagevec *mark_pvec)
+ struct list_head *list)
{
struct cachefiles_one_read *monitor = NULL;
struct address_space *bmapping = object->backer->d_inode->i_mapping;
- struct pagevec lru_pvec;
struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
int ret = 0;
_enter("");
- pagevec_init(&lru_pvec, 0);
-
list_for_each_entry_safe(netpage, _n, list, lru) {
list_del(&netpage->lru);
@@ -485,7 +487,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage, netpage->index, page_count(netpage));
if (!monitor) {
- monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+ monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
if (!monitor)
goto nomem;
@@ -500,31 +502,29 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
goto backing_page_already_present;
if (!newpage) {
- newpage = page_cache_alloc_cold(bmapping);
+ newpage = __page_cache_alloc(cachefiles_gfp |
+ __GFP_COLD);
if (!newpage)
goto nomem;
}
- ret = add_to_page_cache(newpage, bmapping,
- netpage->index, GFP_KERNEL);
+ ret = add_to_page_cache_lru(newpage, bmapping,
+ netpage->index,
+ cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
goto nomem;
}
- /* we've installed a new backing page, so now we need to add it
- * to the LRU list and start it reading */
+ /* we've installed a new backing page, so now we need
+ * to start it reading */
installed_new_backing_page:
_debug("- new %p", newpage);
backpage = newpage;
newpage = NULL;
- page_cache_get(backpage);
- if (!pagevec_add(&lru_pvec, backpage))
- __pagevec_lru_add_file(&lru_pvec);
-
reread_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
@@ -535,20 +535,17 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
monitor_backing_page:
_debug("- monitor add");
- ret = add_to_page_cache(netpage, op->mapping, netpage->index,
- GFP_KERNEL);
+ ret = add_to_page_cache_lru(netpage, op->mapping,
+ netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
+ fscache_retrieval_complete(op, 1);
continue;
}
goto nomem;
}
- page_cache_get(netpage);
- if (!pagevec_add(&lru_pvec, netpage))
- __pagevec_lru_add_file(&lru_pvec);
-
/* install a monitor */
page_cache_get(netpage);
monitor->netfs_page = netpage;
@@ -611,11 +608,12 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
backing_page_already_uptodate:
_debug("- uptodate");
- ret = add_to_page_cache(netpage, op->mapping, netpage->index,
- GFP_KERNEL);
+ ret = add_to_page_cache_lru(netpage, op->mapping,
+ netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
+ fscache_retrieval_complete(op, 1);
continue;
}
goto nomem;
@@ -626,16 +624,13 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
page_cache_release(backpage);
backpage = NULL;
- if (!pagevec_add(mark_pvec, netpage))
- fscache_mark_pages_cached(op, mark_pvec);
-
- page_cache_get(netpage);
- if (!pagevec_add(&lru_pvec, netpage))
- __pagevec_lru_add_file(&lru_pvec);
+ fscache_mark_page_cached(op, netpage);
+ /* the netpage is unlocked and marked up to date here */
fscache_end_io(op, netpage, 0);
page_cache_release(netpage);
netpage = NULL;
+ fscache_retrieval_complete(op, 1);
continue;
}
@@ -645,8 +640,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
out:
/* tidy up */
- pagevec_lru_add_file(&lru_pvec);
-
if (newpage)
page_cache_release(newpage);
if (netpage)
@@ -661,6 +654,7 @@ out:
list_for_each_entry_safe(netpage, _n, list, lru) {
list_del(&netpage->lru);
page_cache_release(netpage);
+ fscache_retrieval_complete(op, 1);
}
_leave(" = %d", ret);
@@ -669,15 +663,17 @@ out:
nomem:
_debug("nomem");
ret = -ENOMEM;
- goto out;
+ goto record_page_complete;
read_error:
_debug("read error %d", ret);
if (ret == -ENOMEM)
- goto out;
+ goto record_page_complete;
io_error:
cachefiles_io_error_obj(object, "Page read error on backing file");
ret = -ENOBUFS;
+record_page_complete:
+ fscache_retrieval_complete(op, 1);
goto out;
}
@@ -709,7 +705,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
*nr_pages);
if (!object->backer)
- return -ENOBUFS;
+ goto all_enobufs;
space = 1;
if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
@@ -722,7 +718,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
/* calculate the shift required to use bmap */
if (inode->i_sb->s_blocksize > PAGE_SIZE)
- return -ENOBUFS;
+ goto all_enobufs;
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -762,7 +758,10 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
nrbackpages++;
} else if (space && pagevec_add(&pagevec, page) == 0) {
fscache_mark_pages_cached(op, &pagevec);
+ fscache_retrieval_complete(op, 1);
ret = -ENODATA;
+ } else {
+ fscache_retrieval_complete(op, 1);
}
}
@@ -775,18 +774,18 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
/* submit the apparently valid pages to the backing fs to be read from
* disk */
if (nrbackpages > 0) {
- ret2 = cachefiles_read_backing_file(object, op, &backpages,
- &pagevec);
+ ret2 = cachefiles_read_backing_file(object, op, &backpages);
if (ret2 == -ENOMEM || ret2 == -EINTR)
ret = ret2;
}
- if (pagevec_count(&pagevec) > 0)
- fscache_mark_pages_cached(op, &pagevec);
-
_leave(" = %d [nr=%u%s]",
ret, *nr_pages, list_empty(pages) ? " empty" : "");
return ret;
+
+all_enobufs:
+ fscache_retrieval_complete(op, *nr_pages);
+ return -ENOBUFS;
}
/*
@@ -806,7 +805,6 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
- struct pagevec pagevec;
int ret;
object = container_of(op->op.object,
@@ -817,14 +815,12 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
_enter("%p,{%lx},", object, page->index);
ret = cachefiles_has_space(cache, 0, 1);
- if (ret == 0) {
- pagevec_init(&pagevec, 0);
- pagevec_add(&pagevec, page);
- fscache_mark_pages_cached(op, &pagevec);
- } else {
+ if (ret == 0)
+ fscache_mark_page_cached(op, page);
+ else
ret = -ENOBUFS;
- }
+ fscache_retrieval_complete(op, 1);
_leave(" = %d", ret);
return ret;
}
@@ -874,6 +870,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op,
ret = -ENOBUFS;
}
+ fscache_retrieval_complete(op, *nr_pages);
_leave(" = %d", ret);
return ret;
}
@@ -891,6 +888,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
struct cachefiles_cache *cache;
mm_segment_t old_fs;
struct file *file;
+ struct path path;
loff_t pos, eof;
size_t len;
void *data;
@@ -916,10 +914,9 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
/* write the page to the backing filesystem and let it store it in its
* own time */
- dget(object->backer);
- mntget(cache->mnt);
- file = dentry_open(object->backer, cache->mnt, O_RDWR,
- cache->cache_cred);
+ path.mnt = cache->mnt;
+ path.dentry = object->backer;
+ file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
} else {
@@ -942,12 +939,14 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
}
data = kmap(page);
+ file_start_write(file);
old_fs = get_fs();
set_fs(KERNEL_DS);
ret = file->f_op->write(
file, (const void __user *) data, len, &pos);
set_fs(old_fs);
kunmap(page);
+ file_end_write(file);
if (ret != len)
ret = -EIO;
}
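The hunks above thread fscache_retrieval_complete() through every success and error path of the CacheFiles read routines, so that each page handed to a retrieval operation is reported complete exactly once. A minimal userspace sketch of that invariant follows; all names below are hypothetical illustrations, not the CacheFiles API.

/* Sketch: every page submitted to a retrieval op must be reported complete
 * exactly once, whether it was read, already cached, or failed. */
#include <stdio.h>

struct retrieval_op {
	int outstanding;	/* pages not yet reported complete */
};

static void retrieval_complete(struct retrieval_op *op, int n)
{
	op->outstanding -= n;
}

/* Pretend to read one page; fail for odd indices to exercise error paths. */
static int read_one_page(struct retrieval_op *op, int index)
{
	if (index & 1) {
		retrieval_complete(op, 1);	/* error path still completes */
		return -1;
	}
	retrieval_complete(op, 1);		/* success path completes */
	return 0;
}

int main(void)
{
	struct retrieval_op op = { .outstanding = 4 };
	int i;

	for (i = 0; i < 4; i++)
		read_one_page(&op, i);

	/* Must reach zero: a missed completion would stall the op forever. */
	printf("outstanding = %d\n", op.outstanding);
	return op.outstanding != 0;
}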
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index 039b5011d83..396c18ea276 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -34,9 +34,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache)
ret = set_security_override_from_ctx(new, cache->secctx);
if (ret < 0) {
put_cred(new);
- printk(KERN_ERR "CacheFiles:"
- " Security denies permission to nominate"
- " security context: error %d\n",
+ pr_err("Security denies permission to nominate security context: error %d\n",
ret);
goto error;
}
@@ -59,16 +57,14 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
ret = security_inode_mkdir(root->d_inode, root, 0);
if (ret < 0) {
- printk(KERN_ERR "CacheFiles:"
- " Security denies permission to make dirs: error %d",
+ pr_err("Security denies permission to make dirs: error %d",
ret);
return ret;
}
ret = security_inode_create(root->d_inode, root, 0);
if (ret < 0)
- printk(KERN_ERR "CacheFiles:"
- " Security denies permission to create files: error %d",
+ pr_err("Security denies permission to create files: error %d",
ret);
return ret;
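The printk(KERN_ERR "CacheFiles: ...") calls above become pr_err() calls without the subsystem prefix; pr_err() expands its format through pr_fmt(), so the prefix is presumably supplied once by a pr_fmt() definition in the subsystem's internal header. A standalone sketch of that convention follows; the userspace pr_err() mock and the exact prefix string are assumptions, not the CacheFiles implementation.

/* Sketch of the pr_fmt() convention assumed by the pr_err() conversions. */
#include <stdio.h>

#define pr_fmt(fmt) "CacheFiles: " fmt
#define pr_err(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	/* Prints: CacheFiles: Security denies permission to make dirs: error -13 */
	pr_err("Security denies permission to make dirs: error %d\n", -13);
	return 0;
}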
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index e18b183b47e..1ad51ffbb27 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
}
if (ret != -EEXIST) {
- kerror("Can't set xattr on %*.*s [%lu] (err %d)",
+ pr_err("Can't set xattr on %*.*s [%lu] (err %d)",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
-ret);
@@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
if (ret == -ERANGE)
goto bad_type_length;
- kerror("Can't read xattr on %*.*s [%lu] (err %d)",
+ pr_err("Can't read xattr on %*.*s [%lu] (err %d)",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
-ret);
@@ -85,14 +85,14 @@ error:
return ret;
bad_type_length:
- kerror("Cache object %lu type xattr length incorrect",
+ pr_err("Cache object %lu type xattr length incorrect",
dentry->d_inode->i_ino);
ret = -EIO;
goto error;
bad_type:
xtype[2] = 0;
- kerror("Cache object %*.*s [%lu] type %s not %s",
+ pr_err("Cache object %*.*s [%lu] type %s not %s",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
xtype, type);
@@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
@@ -159,6 +157,43 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
}
/*
+ * check the consistency between the backing cache and the FS-Cache cookie
+ */
+int cachefiles_check_auxdata(struct cachefiles_object *object)
+{
+ struct cachefiles_xattr *auxbuf;
+ enum fscache_checkaux validity;
+ struct dentry *dentry = object->dentry;
+ ssize_t xlen;
+ int ret;
+
+ ASSERT(dentry);
+ ASSERT(dentry->d_inode);
+ ASSERT(object->fscache.cookie->def->check_aux);
+
+ auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+ if (!auxbuf)
+ return -ENOMEM;
+
+ xlen = vfs_getxattr(dentry, cachefiles_xattr_cache,
+ &auxbuf->type, 512 + 1);
+ ret = -ESTALE;
+ if (xlen < 1 ||
+ auxbuf->type != object->fscache.cookie->def->type)
+ goto error;
+
+ xlen--;
+ validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen);
+ if (validity != FSCACHE_CHECKAUX_OKAY)
+ goto error;
+
+ ret = 0;
+error:
+ kfree(auxbuf);
+ return ret;
+}
+
+/*
* check the state xattr on a cache file
* - return -ESTALE if the object should be deleted
*/
@@ -174,7 +209,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
ASSERT(dentry);
ASSERT(dentry->d_inode);
- auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+ auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
if (!auxbuf) {
_leave(" = -ENOMEM");
return -ENOMEM;
@@ -258,7 +293,7 @@ error:
return ret;
bad_type_length:
- kerror("Cache object %lu xattr length incorrect",
+ pr_err("Cache object %lu xattr length incorrect",
dentry->d_inode->i_ino);
ret = -EIO;
goto error;
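The new cachefiles_check_auxdata() above reads the stored xattr blob, checks its leading type byte against the cookie definition, and hands the remainder to fscache_check_aux(), treating any mismatch as stale. A userspace sketch of that shape follows; the blob layout and helper names here are assumptions for illustration only.

/* Sketch: verify a stored "type byte + auxiliary data" blob against the
 * currently expected values; report stale on any mismatch. */
#include <string.h>
#include <stdio.h>

enum checkaux { CHECKAUX_OKAY, CHECKAUX_OBSOLETE };

static enum checkaux check_aux(const void *data, size_t len,
			       const void *want, size_t want_len)
{
	if (len != want_len || memcmp(data, want, len) != 0)
		return CHECKAUX_OBSOLETE;
	return CHECKAUX_OKAY;
}

/* Returns 0 if the stored blob matches, -1 (stale) otherwise. */
static int check_auxdata(const unsigned char *blob, size_t blob_len,
			 unsigned char want_type,
			 const void *want_aux, size_t want_aux_len)
{
	if (blob_len < 1 || blob[0] != want_type)
		return -1;
	if (check_aux(blob + 1, blob_len - 1, want_aux, want_aux_len) !=
	    CHECKAUX_OKAY)
		return -1;
	return 0;
}

int main(void)
{
	unsigned char stored[] = { 0x01, 'a', 'u', 'x' };

	printf("fresh = %d\n", check_auxdata(stored, sizeof(stored), 0x01, "aux", 3));
	printf("stale = %d\n", check_auxdata(stored, sizeof(stored), 0x02, "aux", 3));
	return 0;
}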
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 9eb134ea6eb..264e9bf83ff 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,6 +1,6 @@
config CEPH_FS
- tristate "Ceph distributed file system (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
+ tristate "Ceph distributed file system"
+ depends on INET
select CEPH_LIB
select LIBCRC32C
select CRYPTO_AES
@@ -16,3 +16,25 @@ config CEPH_FS
If unsure, say N.
+if CEPH_FS
+config CEPH_FSCACHE
+ bool "Enable Ceph client caching support"
+ depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
+ help
+ Choose Y here to enable persistent, read-only local
+ caching support for Ceph clients using FS-Cache
+
+endif
+
+config CEPH_FS_POSIX_ACL
+ bool "Ceph POSIX Access Control Lists"
+ depends on CEPH_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index bd352125e82..85a4230b9bf 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -9,3 +9,5 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o
+ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
+ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
new file mode 100644
index 00000000000..469f2e8657e
--- /dev/null
+++ b/fs/ceph/acl.c
@@ -0,0 +1,194 @@
+/*
+ * linux/fs/ceph/acl.c
+ *
+ * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "super.h"
+
+static inline void ceph_set_cached_acl(struct inode *inode,
+ int type, struct posix_acl *acl)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
+ set_cached_acl(inode, type, acl);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
+static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
+ int type)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct posix_acl *acl = ACL_NOT_CACHED;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
+ acl = get_cached_acl(inode, type);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return acl;
+}
+
+struct posix_acl *ceph_get_acl(struct inode *inode, int type)
+{
+ int size;
+ const char *name;
+ char *value = NULL;
+ struct posix_acl *acl;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+
+ size = __ceph_getxattr(inode, name, "", 0);
+ if (size > 0) {
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = __ceph_getxattr(inode, name, value, size);
+ }
+
+ if (size > 0)
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ else if (size == -ERANGE || size == -ENODATA || size == 0)
+ acl = NULL;
+ else
+ acl = ERR_PTR(-EIO);
+
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ ceph_set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ int ret = 0, size = 0;
+ const char *name = NULL;
+ char *value = NULL;
+ struct iattr newattrs;
+ umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+ struct dentry *dentry;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &new_mode);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ acl = NULL;
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = acl ? -EINVAL : 0;
+ goto out;
+ }
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_NOFS);
+ if (!value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (ret < 0)
+ goto out_free;
+ }
+
+ dentry = d_find_alias(inode);
+ if (new_mode != old_mode) {
+ newattrs.ia_mode = new_mode;
+ newattrs.ia_valid = ATTR_MODE;
+ ret = ceph_setattr(dentry, &newattrs);
+ if (ret)
+ goto out_dput;
+ }
+
+ ret = __ceph_setxattr(dentry, name, value, size, 0);
+ if (ret) {
+ if (new_mode != old_mode) {
+ newattrs.ia_mode = old_mode;
+ newattrs.ia_valid = ATTR_MODE;
+ ceph_setattr(dentry, &newattrs);
+ }
+ goto out_dput;
+ }
+
+ ceph_set_cached_acl(inode, type, acl);
+
+out_dput:
+ dput(dentry);
+out_free:
+ kfree(value);
+out:
+ return ret;
+}
+
+int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *default_acl, *acl;
+ int error;
+
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
+
+ if (!default_acl && !acl)
+ cache_no_acl(inode);
+
+ if (default_acl) {
+ error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
+ return error;
+}
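ceph_set_acl() above applies the mode change first, then stores the ACL xattr, and restores the old mode if the xattr store fails. A small userspace sketch of that apply-then-roll-back pattern follows; the names are illustrative, not the Ceph API.

/* Sketch: commit the mode change, then the xattr; undo the mode on failure. */
#include <stdio.h>

struct fake_inode {
	unsigned int mode;
};

static int set_mode(struct fake_inode *inode, unsigned int mode)
{
	inode->mode = mode;
	return 0;
}

/* Pretend the xattr store fails when asked to. */
static int set_xattr(int should_fail)
{
	return should_fail ? -1 : 0;
}

static int set_acl(struct fake_inode *inode, unsigned int new_mode,
		   int xattr_fails)
{
	unsigned int old_mode = inode->mode;
	int ret;

	if (new_mode != old_mode) {
		ret = set_mode(inode, new_mode);
		if (ret)
			return ret;
	}

	ret = set_xattr(xattr_fails);
	if (ret && new_mode != old_mode)
		set_mode(inode, old_mode);	/* roll the mode back */
	return ret;
}

int main(void)
{
	struct fake_inode inode = { .mode = 0644 };

	set_acl(&inode, 0640, 1);	/* xattr store fails */
	printf("mode after failed set_acl = %o\n", inode.mode);	/* 644 */
	return 0;
}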
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 561438b6a50..90b3954d48e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -11,6 +11,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/osd_client.h>
/*
@@ -24,7 +25,7 @@
* context needs to be associated with the osd write during writeback.
*
* Similarly, struct ceph_inode_info maintains a set of counters to
- * count dirty pages on the inode. In the absense of snapshots,
+ * count dirty pages on the inode. In the absence of snapshots,
* i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
*
* When a snapshot is taken (that is, when the client receives
@@ -54,7 +55,12 @@
(CONGESTION_ON_THRESH(congestion_kb) - \
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
-
+static inline struct ceph_snap_context *page_snap_context(struct page *page)
+{
+ if (PagePrivate(page))
+ return (void *)page->private;
+ return NULL;
+}
/*
* Dirty a page. Optimistically adjust accounting, on the assumption
@@ -65,15 +71,16 @@ static int ceph_set_page_dirty(struct page *page)
struct address_space *mapping = page->mapping;
struct inode *inode;
struct ceph_inode_info *ci;
- int undo = 0;
struct ceph_snap_context *snapc;
+ int ret;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
- if (TestSetPageDirty(page)) {
+ if (PageDirty(page)) {
dout("%p set_page_dirty %p idx %lu -- already dirty\n",
mapping->host, page, page->index);
+ BUG_ON(!PagePrivate(page));
return 0;
}
@@ -87,12 +94,12 @@ static int ceph_set_page_dirty(struct page *page)
snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
/* dirty the head */
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_head_snapc == NULL)
ci->i_head_snapc = ceph_get_snap_context(snapc);
++ci->i_wrbuffer_ref_head;
if (ci->i_wrbuffer_ref == 0)
- igrab(inode);
+ ihold(inode);
++ci->i_wrbuffer_ref;
dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
"snapc %p seq %lld (%d snaps)\n",
@@ -100,37 +107,21 @@ static int ceph_set_page_dirty(struct page *page)
ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
snapc, snapc->seq, snapc->num_snaps);
- spin_unlock(&inode->i_lock);
-
- /* now adjust page */
- spin_lock_irq(&mapping->tree_lock);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, page->mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
-
- /*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
- */
- page->private = (unsigned long)snapc;
- SetPagePrivate(page);
- } else {
- dout("ANON set_page_dirty %p (raced truncate?)\n", page);
- undo = 1;
- }
-
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock(&ci->i_ceph_lock);
- if (undo)
- /* whoops, we failed to dirty the page */
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ /*
+ * Reference snap context in page->private. Also set
+ * PagePrivate so that we get invalidatepage callback.
+ */
+ BUG_ON(PagePrivate(page));
+ page->private = (unsigned long)snapc;
+ SetPagePrivate(page);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ ret = __set_page_dirty_nobuffers(page);
+ WARN_ON(!PageLocked(page));
+ WARN_ON(!page->mapping);
- BUG_ON(!PageDirty(page));
- return 1;
+ return ret;
}
/*
@@ -138,18 +129,26 @@ static int ceph_set_page_dirty(struct page *page)
* dirty page counters appropriately. Only called if there is private
* data on the page.
*/
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode;
struct ceph_inode_info *ci;
- struct ceph_snap_context *snapc = (void *)page->private;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(!page->private);
- BUG_ON(!PagePrivate(page));
- BUG_ON(!page->mapping);
+ struct ceph_snap_context *snapc = page_snap_context(page);
inode = page->mapping->host;
+ ci = ceph_inode(inode);
+
+ if (offset != 0 || length != PAGE_CACHE_SIZE) {
+ dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
+ inode, page, page->index, offset, length);
+ return;
+ }
+
+ ceph_invalidate_fscache_page(inode, page);
+
+ if (!PagePrivate(page))
+ return;
/*
* We can get non-dirty pages here due to races between
@@ -159,32 +158,28 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
if (!PageDirty(page))
pr_err("%p invalidatepage %p page not dirty\n", inode, page);
- if (offset == 0)
- ClearPageChecked(page);
+ ClearPageChecked(page);
- ci = ceph_inode(inode);
- if (offset == 0) {
- dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
- inode, page, page->index, offset);
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
- ceph_put_snap_context(snapc);
- page->private = 0;
- ClearPagePrivate(page);
- } else {
- dout("%p invalidatepage %p idx %lu partial dirty page\n",
- inode, page, page->index);
- }
+ dout("%p invalidatepage %p idx %lu full dirty page\n",
+ inode, page, page->index);
+
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ page->private = 0;
+ ClearPagePrivate(page);
}
-/* just a sanity check */
static int ceph_releasepage(struct page *page, gfp_t g)
{
struct inode *inode = page->mapping ? page->mapping->host : NULL;
dout("%p releasepage %p idx %lu\n", inode, page, page->index);
WARN_ON(PageDirty(page));
- WARN_ON(page->private);
- WARN_ON(PagePrivate(page));
- return 0;
+
+ /* Can we release the page from the cache? */
+ if (!ceph_release_fscache_page(page, g))
+ return 0;
+
+ return !PagePrivate(page);
}
/*
@@ -192,29 +187,39 @@ static int ceph_releasepage(struct page *page, gfp_t g)
*/
static int readpage_nounlock(struct file *filp, struct page *page)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc =
+ struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
int err = 0;
u64 len = PAGE_CACHE_SIZE;
+ err = ceph_readpage_from_fscache(inode, page);
+
+ if (err == 0)
+ goto out;
+
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
- page->index << PAGE_CACHE_SHIFT, &len,
+ (u64) page_offset(page), &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
if (err == -ENOENT)
err = 0;
if (err < 0) {
SetPageError(page);
+ ceph_fscache_readpage_cancel(inode, page);
goto out;
- } else if (err < PAGE_CACHE_SIZE) {
+ }
+ if (err < PAGE_CACHE_SIZE)
/* zero fill remainder of page */
zero_user_segment(page, err, PAGE_CACHE_SIZE);
- }
+ else
+ flush_dcache_page(page);
+
SetPageUptodate(page);
+ ceph_readpage_to_fscache(inode, page);
out:
return err < 0 ? err : 0;
@@ -228,102 +233,180 @@ static int ceph_readpage(struct file *filp, struct page *page)
}
/*
- * Build a vector of contiguous pages from the provided page list.
+ * Finish an async read(ahead) op.
*/
-static struct page **page_vector_from_list(struct list_head *page_list,
- unsigned *nr_pages)
+static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
- struct page **pages;
- struct page *page;
- int next_index, contig_pages = 0;
+ struct inode *inode = req->r_inode;
+ struct ceph_osd_data *osd_data;
+ int rc = req->r_result;
+ int bytes = le32_to_cpu(msg->hdr.data_len);
+ int num_pages;
+ int i;
- /* build page vector */
- pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
- if (!pages)
- return ERR_PTR(-ENOMEM);
+ dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
- BUG_ON(list_empty(page_list));
- next_index = list_entry(page_list->prev, struct page, lru)->index;
- list_for_each_entry_reverse(page, page_list, lru) {
- if (page->index == next_index) {
- dout("readpages page %d %p\n", contig_pages, page);
- pages[contig_pages] = page;
- contig_pages++;
- next_index++;
- } else {
- break;
+ /* unlock all pages, zeroing any data we didn't read */
+ osd_data = osd_req_op_extent_osd_data(req, 0);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = osd_data->pages[i];
+
+ if (rc < 0)
+ goto unlock;
+ if (bytes < (int)PAGE_CACHE_SIZE) {
+ /* zero (remainder of) page */
+ int s = bytes < 0 ? 0 : bytes;
+ zero_user_segment(page, s, PAGE_CACHE_SIZE);
}
+ dout("finish_read %p uptodate %p idx %lu\n", inode, page,
+ page->index);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ ceph_readpage_to_fscache(inode, page);
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ bytes -= PAGE_CACHE_SIZE;
}
- *nr_pages = contig_pages;
- return pages;
+ kfree(osd_data->pages);
+}
+
+static void ceph_unlock_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ unlock_page(pages[i]);
}
/*
- * Read multiple pages. Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
+ * start an async read(ahead) operation. return nr_pages we submitted
+ * a read for on success, or negative error code.
*/
-static int ceph_readpages(struct file *file, struct address_space *mapping,
- struct list_head *page_list, unsigned nr_pages)
+static int start_read(struct inode *inode, struct list_head *page_list, int max)
{
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
- int rc = 0;
- struct page **pages;
- loff_t offset;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct page *page = list_entry(page_list->prev, struct page, lru);
+ struct ceph_vino vino;
+ struct ceph_osd_request *req;
+ u64 off;
u64 len;
+ int i;
+ struct page **pages;
+ pgoff_t next_index;
+ int nr_pages = 0;
+ int ret;
- dout("readpages %p file %p nr_pages %d\n",
- inode, file, nr_pages);
-
- pages = page_vector_from_list(page_list, &nr_pages);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
+ off = (u64) page_offset(page);
- /* guess read extent */
- offset = pages[0]->index << PAGE_CACHE_SHIFT;
+ /* count pages */
+ next_index = page->index;
+ list_for_each_entry_reverse(page, page_list, lru) {
+ if (page->index != next_index)
+ break;
+ nr_pages++;
+ next_index++;
+ if (max && nr_pages == max)
+ break;
+ }
len = nr_pages << PAGE_CACHE_SHIFT;
- rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
- offset, &len,
- ci->i_truncate_seq, ci->i_truncate_size,
- pages, nr_pages, 0);
- if (rc == -ENOENT)
- rc = 0;
- if (rc < 0)
- goto out;
-
- for (; !list_empty(page_list) && len > 0;
- rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
- struct page *page =
- list_entry(page_list->prev, struct page, lru);
+ dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
+ off, len);
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
+ 1, CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ, NULL,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ /* build page vector */
+ nr_pages = calc_pages_for(0, len);
+ pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
+ ret = -ENOMEM;
+ if (!pages)
+ goto out;
+ for (i = 0; i < nr_pages; ++i) {
+ page = list_entry(page_list->prev, struct page, lru);
+ BUG_ON(PageLocked(page));
list_del(&page->lru);
- if (rc < (int)PAGE_CACHE_SIZE) {
- /* zero (remainder of) page */
- int s = rc < 0 ? 0 : rc;
- zero_user_segment(page, s, PAGE_CACHE_SIZE);
- }
-
- if (add_to_page_cache_lru(page, mapping, page->index,
+ dout("start_read %p adding %p idx %lu\n", inode, page,
+ page->index);
+ if (add_to_page_cache_lru(page, &inode->i_data, page->index,
GFP_NOFS)) {
+ ceph_fscache_uncache_page(inode, page);
page_cache_release(page);
- dout("readpages %p add_to_page_cache failed %p\n",
+ dout("start_read %p add_to_page_cache failed %p\n",
inode, page);
- continue;
+ nr_pages = i;
+ goto out_pages;
}
- dout("readpages %p adding %p idx %lu\n", inode, page,
- page->index);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- page_cache_release(page);
+ pages[i] = page;
}
- rc = 0;
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
+ req->r_callback = finish_read;
+ req->r_inode = inode;
+
+ ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
+
+ dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
+ ret = ceph_osdc_start_request(osdc, req, false);
+ if (ret < 0)
+ goto out_pages;
+ ceph_osdc_put_request(req);
+ return nr_pages;
+out_pages:
+ ceph_unlock_page_vector(pages, nr_pages);
+ ceph_release_page_vector(pages, nr_pages);
out:
- kfree(pages);
+ ceph_osdc_put_request(req);
+ return ret;
+}
+
+
+/*
+ * Read multiple pages. Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *page_list, unsigned nr_pages)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ int rc = 0;
+ int max = 0;
+
+ rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
+ &nr_pages);
+
+ if (rc == 0)
+ goto out;
+
+ if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+ max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+ >> PAGE_SHIFT;
+
+ dout("readpages %p file %p nr_pages %d max %d\n", inode,
+ file, nr_pages,
+ max);
+ while (!list_empty(page_list)) {
+ rc = start_read(inode, page_list, max);
+ if (rc < 0)
+ goto out;
+ BUG_ON(rc == 0);
+ }
+out:
+ ceph_fscache_readpages_cancel(inode, page_list);
+
+ dout("readpages %p file %p ret %d\n", inode, file, rc);
return rc;
}
@@ -338,7 +421,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
struct ceph_snap_context *snapc = NULL;
struct ceph_cap_snap *capsnap = NULL;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
capsnap->context, capsnap->dirty_pages);
@@ -354,7 +437,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
dout(" head snapc %p has %d dirty pages\n",
snapc, ci->i_wrbuffer_ref_head);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return snapc;
}
@@ -370,13 +453,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_inode_info *ci;
struct ceph_fs_client *fsc;
struct ceph_osd_client *osdc;
- loff_t page_off = page->index << PAGE_CACHE_SHIFT;
- int len = PAGE_CACHE_SIZE;
- loff_t i_size;
- int err = 0;
struct ceph_snap_context *snapc, *oldest;
- u64 snap_size = 0;
+ loff_t page_off = page_offset(page);
long writeback_stat;
+ u64 truncate_size, snap_size = 0;
+ u32 truncate_seq;
+ int err = 0, len = PAGE_CACHE_SIZE;
dout("writepage %p idx %lu\n", page, page->index);
@@ -390,7 +472,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
osdc = &fsc->client->osdc;
/* verify this is a writeable snap context */
- snapc = (void *)page->private;
+ snapc = page_snap_context(page);
if (snapc == NULL) {
dout("writepage %p page %p not dirty?\n", inode, page);
goto out;
@@ -398,7 +480,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
oldest = get_oldest_context(inode, &snap_size);
if (snapc->seq > oldest->seq) {
dout("writepage %p page %p snapc %p not writeable - noop\n",
- inode, page, (void *)page->private);
+ inode, page, snapc);
/* we should only noop if called by kswapd */
WARN_ON((current->flags & PF_MEMALLOC) == 0);
ceph_put_snap_context(oldest);
@@ -406,13 +488,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
ceph_put_snap_context(oldest);
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
/* is this a partial page at end of file? */
- if (snap_size)
- i_size = snap_size;
- else
- i_size = i_size_read(inode);
- if (i_size < page_off + len)
- len = i_size - page_off;
+ if (page_off >= snap_size) {
+ dout("%p page eof %llu\n", page, snap_size);
+ goto out;
+ }
+ if (snap_size < page_off + len)
+ len = snap_size - page_off;
dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
inode, page, page->index, page_off, len, snapc);
@@ -422,13 +511,14 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
+ ceph_readpage_to_fscache(inode, page);
+
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
page_off, len,
- ci->i_truncate_seq, ci->i_truncate_size,
- &inode->i_mtime,
- &page, 1, 0, 0, true);
+ truncate_seq, truncate_size,
+ &inode->i_mtime, &page, 1);
if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page);
SetPageError(page);
@@ -453,7 +543,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
int err;
struct inode *inode = page->mapping->host;
BUG_ON(!inode);
- igrab(inode);
+ ihold(inode);
err = writepage_nounlock(page, wbc);
unlock_page(page);
iput(inode);
@@ -478,7 +568,6 @@ static void ceph_release_pages(struct page **pages, int num)
pagevec_release(&pvec);
}
-
/*
* async writeback completion handler.
*
@@ -489,27 +578,24 @@ static void writepages_finish(struct ceph_osd_request *req,
struct ceph_msg *msg)
{
struct inode *inode = req->r_inode;
- struct ceph_osd_reply_head *replyhead;
- struct ceph_osd_op *op;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_data *osd_data;
unsigned wrote;
struct page *page;
+ int num_pages;
int i;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
- __s32 rc = -EIO;
- u64 bytes = 0;
+ int rc = req->r_result;
+ u64 bytes = req->r_ops[0].extent.length;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat;
unsigned issued = ceph_caps_issued(ci);
- /* parse reply */
- replyhead = msg->front.iov_base;
- WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
- op = (void *)(replyhead + 1);
- rc = le32_to_cpu(replyhead->result);
- bytes = le64_to_cpu(op->extent.length);
-
+ osd_data = osd_req_op_extent_osd_data(req, 0);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
if (rc >= 0) {
/*
* Assume we wrote the pages we originally sent. The
@@ -517,7 +603,7 @@ static void writepages_finish(struct ceph_osd_request *req,
* raced with a truncation and was adjusted at the osd,
* so don't believe the reply.
*/
- wrote = req->r_num_pages;
+ wrote = num_pages;
} else {
wrote = 0;
mapping_set_error(mapping, rc);
@@ -526,8 +612,8 @@ static void writepages_finish(struct ceph_osd_request *req,
inode, rc, bytes, wrote);
/* clean all pages */
- for (i = 0; i < req->r_num_pages; i++) {
- page = req->r_pages[i];
+ for (i = 0; i < num_pages; i++) {
+ page = osd_data->pages[i];
BUG_ON(!page);
WARN_ON(!PageUptodate(page));
@@ -538,7 +624,7 @@ static void writepages_finish(struct ceph_osd_request *req,
clear_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
- ceph_put_snap_context((void *)page->private);
+ ceph_put_snap_context(page_snap_context(page));
page->private = 0;
ClearPagePrivate(page);
dout("unlocking %d %p\n", i, page);
@@ -556,35 +642,18 @@ static void writepages_finish(struct ceph_osd_request *req,
unlock_page(page);
}
dout("%p wrote+cleaned %d pages\n", inode, wrote);
- ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+ ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
- ceph_release_pages(req->r_pages, req->r_num_pages);
- if (req->r_pages_from_pool)
- mempool_free(req->r_pages,
+ ceph_release_pages(osd_data->pages, num_pages);
+ if (osd_data->pages_from_pool)
+ mempool_free(osd_data->pages,
ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
else
- kfree(req->r_pages);
+ kfree(osd_data->pages);
ceph_osdc_put_request(req);
}
/*
- * allocate a page vec, either directly, or if necessary, via a the
- * mempool. we avoid the mempool if we can because req->r_num_pages
- * may be less than the maximum write size.
- */
-static void alloc_page_vec(struct ceph_fs_client *fsc,
- struct ceph_osd_request *req)
-{
- req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
- GFP_NOFS);
- if (!req->r_pages) {
- req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
- req->r_pages_from_pool = 1;
- WARN_ON(!req->r_pages);
- }
-}
-
-/*
* initiate async writeback
*/
static int ceph_writepages_start(struct address_space *mapping,
@@ -592,7 +661,8 @@ static int ceph_writepages_start(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_vino vino = ceph_vino(inode);
pgoff_t index, start, end;
int range_whole = 0;
int should_loop = 1;
@@ -604,24 +674,24 @@ static int ceph_writepages_start(struct address_space *mapping,
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
int do_sync;
- u64 snap_size = 0;
+ u64 truncate_size, snap_size;
+ u32 truncate_seq;
/*
* Include a 'sync' in the OSD request if this is a data
* integrity write (e.g., O_SYNC write or fsync()), or if our
* cap is being revoked.
*/
- do_sync = wbc->sync_mode == WB_SYNC_ALL;
- if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+ if ((wbc->sync_mode == WB_SYNC_ALL) ||
+ ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
do_sync = 1;
dout("writepages_start %p dosync=%d (mode=%s)\n",
inode, do_sync,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- fsc = ceph_inode_to_client(inode);
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
- pr_warning("writepage_start %p on forced umount\n", inode);
+ pr_warn("writepage_start %p on forced umount\n", inode);
return -EIO; /* we're in a forced umount, don't write! */
}
if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@ -650,6 +720,7 @@ static int ceph_writepages_start(struct address_space *mapping,
retry:
/* find oldest snap context with dirty data */
ceph_put_snap_context(snapc);
+ snap_size = 0;
snapc = get_oldest_context(inode, &snap_size);
if (!snapc) {
/* hmm, why does writepages get called when there
@@ -657,8 +728,18 @@ retry:
dout(" no snap context with dirty data?\n");
goto out;
}
+ if (snap_size == 0)
+ snap_size = i_size_read(inode);
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
+
+ spin_lock(&ci->i_ceph_lock);
+ truncate_seq = ci->i_truncate_seq;
+ truncate_size = ci->i_truncate_size;
+ if (!snap_size)
+ snap_size = i_size_read(inode);
+ spin_unlock(&ci->i_ceph_lock);
+
if (last_snapc && snapc != last_snapc) {
/* if we switched to a newer snapc, restart our scan at the
* start of the original file range. */
@@ -669,15 +750,16 @@ retry:
last_snapc = snapc;
while (!done && index <= end) {
+ int num_ops = do_sync ? 2 : 1;
unsigned i;
int first;
pgoff_t next;
int pvec_pages, locked_pages;
+ struct page **pages = NULL;
+ mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
int want;
u64 offset, len;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_op *op;
long writeback_stat;
next = 0;
@@ -726,11 +808,8 @@ get_more_pages:
dout("waiting on writeback %p\n", page);
wait_on_page_writeback(page);
}
- if ((snap_size && page_offset(page) > snap_size) ||
- (!snap_size &&
- page_offset(page) > i_size_read(inode))) {
- dout("%p page eof %llu\n", page, snap_size ?
- snap_size : i_size_read(inode));
+ if (page_offset(page) >= snap_size) {
+ dout("%p page eof %llu\n", page, snap_size);
done = 1;
unlock_page(page);
break;
@@ -742,7 +821,7 @@ get_more_pages:
}
/* only if matching snap context */
- pgsnapc = (void *)page->private;
+ pgsnapc = page_snap_context(page);
if (pgsnapc->seq > snapc->seq) {
dout("page snapc %p %lld > oldest %p %lld\n",
pgsnapc, pgsnapc->seq, snapc, snapc->seq);
@@ -758,28 +837,42 @@ get_more_pages:
break;
}
- /* ok */
+ /*
+ * We have something to write. If this is
+ * the first locked page this time through,
+ * allocate an osd request and a page array
+ * that it will use.
+ */
if (locked_pages == 0) {
+ BUG_ON(pages);
/* prepare async write request */
- offset = (unsigned long long)page->index
- << PAGE_CACHE_SHIFT;
+ offset = (u64)page_offset(page);
len = wsize;
req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout,
- ceph_vino(inode),
- offset, &len,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
- snapc, do_sync,
- ci->i_truncate_seq,
- ci->i_truncate_size,
- &inode->i_mtime, true, 1, 0);
- max_pages = req->r_num_pages;
-
- alloc_page_vec(fsc, req);
+ &ci->i_layout, vino,
+ offset, &len, num_ops,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, truncate_seq,
+ truncate_size, true);
+ if (IS_ERR(req)) {
+ rc = PTR_ERR(req);
+ unlock_page(page);
+ break;
+ }
+
req->r_callback = writepages_finish;
req->r_inode = inode;
+
+ max_pages = calc_pages_for(0, (u64)len);
+ pages = kmalloc(max_pages * sizeof (*pages),
+ GFP_NOFS);
+ if (!pages) {
+ pool = fsc->wb_pagevec_pool;
+ pages = mempool_alloc(pool, GFP_NOFS);
+ BUG_ON(!pages);
+ }
}
/* note position of first page in pvec */
@@ -797,7 +890,7 @@ get_more_pages:
}
set_page_writeback(page);
- req->r_pages[locked_pages] = page;
+ pages[locked_pages] = page;
locked_pages++;
next = page->index + 1;
}
@@ -826,22 +919,30 @@ get_more_pages:
pvec.nr -= i-first;
}
- /* submit the write */
- offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
- len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+ /* Format the osd request message and submit the write */
+
+ offset = page_offset(pages[0]);
+ len = min(snap_size - offset,
(u64)locked_pages << PAGE_CACHE_SHIFT);
dout("writepages got %d pages at %llu~%llu\n",
locked_pages, offset, len);
- /* revise final length, page count */
- req->r_num_pages = locked_pages;
- reqhead = req->r_request->front.iov_base;
- op = (void *)(reqhead + 1);
- op->extent.length = cpu_to_le64(len);
- op->payload_len = cpu_to_le32(len);
- req->r_request->hdr.data_len = cpu_to_le32(len);
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+ !!pool, false);
+
+ pages = NULL; /* request message now owns the pages array */
+ pool = NULL;
+
+ /* Update the write op length in case we changed it */
- ceph_osdc_start_request(&fsc->client->osdc, req, true);
+ osd_req_op_extent_update(req, 0, len);
+
+ vino = ceph_vino(inode);
+ ceph_osdc_build_request(req, offset, snapc, vino.snap,
+ &inode->i_mtime);
+
+ rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
+ BUG_ON(rc);
req = NULL;
/* continue? */
@@ -873,8 +974,6 @@ release_pvec_pages:
out:
if (req)
ceph_osdc_put_request(req);
- if (rc > 0)
- rc = 0; /* vfs expects us to return 0 */
ceph_put_snap_context(snapc);
dout("writepages done, rc = %d\n", rc);
return rc;
@@ -907,7 +1006,7 @@ static int ceph_update_writeable_page(struct file *file,
loff_t pos, unsigned len,
struct page *page)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
loff_t page_off = pos & PAGE_CACHE_MASK;
@@ -925,7 +1024,7 @@ retry_locked:
BUG_ON(!ci->i_snap_realm);
down_read(&mdsc->snap_rwsem);
BUG_ON(!ci->i_snap_realm->cached_context);
- snapc = (void *)page->private;
+ snapc = page_snap_context(page);
if (snapc && snapc != ci->i_head_snapc) {
/*
* this page is already dirty in another (older) snap
@@ -1016,7 +1115,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct page *page;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
int r;
@@ -1046,7 +1145,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1085,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
* never get called.
*/
static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t pos)
{
WARN_ON(1);
return -EINVAL;
@@ -1109,27 +1208,83 @@ const struct address_space_operations ceph_aops = {
/*
* vm ops
*/
+static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
+ loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
+ int want, got, ret;
+
+ dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
+ inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_CACHE;
+ while (1) {
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
+ if (ret == 0)
+ break;
+ if (ret != -ERESTARTSYS) {
+ WARN_ON(1);
+ return VM_FAULT_SIGBUS;
+ }
+ }
+ dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
+ inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
+
+ ret = filemap_fault(vma, vmf);
+
+ dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
+ inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
+ ceph_put_cap_refs(ci, got);
+
+ return ret;
+}
/*
* Reuse write_begin here for simplicity.
*/
static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
- struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- loff_t off = page->index << PAGE_CACHE_SHIFT;
- loff_t size, len;
- int ret;
+ struct page *page = vmf->page;
+ loff_t off = page_offset(page);
+ loff_t size = i_size_read(inode);
+ size_t len;
+ int want, got, ret;
- size = i_size_read(inode);
if (off + PAGE_CACHE_SIZE <= size)
len = PAGE_CACHE_SIZE;
else
len = size & ~PAGE_CACHE_MASK;
- dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
- off, len, page, page->index);
+ dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
+ inode, ceph_vinop(inode), off, len, size);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+ while (1) {
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len);
+ if (ret == 0)
+ break;
+ if (ret != -ERESTARTSYS) {
+ WARN_ON(1);
+ return VM_FAULT_SIGBUS;
+ }
+ }
+ dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
+ inode, off, len, ceph_cap_string(got));
+
+ /* Update time before taking page lock */
+ file_update_time(vma->vm_file);
lock_page(page);
@@ -1151,15 +1306,28 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
}
out:
- dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
- if (ret != VM_FAULT_LOCKED)
+ if (ret != VM_FAULT_LOCKED) {
unlock_page(page);
+ } else {
+ int dirty;
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
+ inode, off, len, ceph_cap_string(got), ret);
+ ceph_put_cap_refs(ci, got);
+
return ret;
}
static struct vm_operations_struct ceph_vmops = {
- .fault = filemap_fault,
+ .fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int ceph_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1170,6 +1338,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &ceph_vmops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
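start_read() above submits one OSD read per leading run of contiguous page indices, capped by a limit derived from the mount's rsize. A userspace sketch of that batching rule follows; the function and variable names are illustrative only.

/* Sketch: count the leading contiguous run in index[], at most max (0 = no cap). */
#include <stdio.h>

static int count_contiguous(const long *index, int n, int max)
{
	int run;
	long next;

	if (n == 0)
		return 0;

	next = index[0];
	for (run = 0; run < n; run++) {
		if (index[run] != next)
			break;
		next++;
		if (max && run + 1 == max)
			return run + 1;
	}
	return run;
}

int main(void)
{
	long idx[] = { 7, 8, 9, 11, 12 };

	/* First batch covers indices 7..9 (3 pages); 11 starts a new batch. */
	printf("batch = %d\n", count_contiguous(idx, 5, 4));
	return 0;
}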
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
new file mode 100644
index 00000000000..834f9f3723f
--- /dev/null
+++ b/fs/ceph/cache.c
@@ -0,0 +1,402 @@
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#include "super.h"
+#include "cache.h"
+
+struct ceph_aux_inode {
+ struct timespec mtime;
+ loff_t size;
+};
+
+struct fscache_netfs ceph_cache_netfs = {
+ .name = "ceph",
+ .version = 0,
+};
+
+static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t maxbuf)
+{
+ const struct ceph_fs_client* fsc = cookie_netfs_data;
+ uint16_t klen;
+
+ klen = sizeof(fsc->client->fsid);
+ if (klen > maxbuf)
+ return 0;
+
+ memcpy(buffer, &fsc->client->fsid, klen);
+ return klen;
+}
+
+static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
+ .name = "CEPH.fsid",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = ceph_fscache_session_get_key,
+};
+
+int ceph_fscache_register(void)
+{
+ return fscache_register_netfs(&ceph_cache_netfs);
+}
+
+void ceph_fscache_unregister(void)
+{
+ fscache_unregister_netfs(&ceph_cache_netfs);
+}
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
+{
+ fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
+ &ceph_fscache_fsid_object_def,
+ fsc, true);
+
+ if (fsc->fscache == NULL) {
+ pr_err("Unable to register fsid: %p fscache cookie", fsc);
+ return 0;
+ }
+
+ fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
+ if (fsc->revalidate_wq == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t maxbuf)
+{
+ const struct ceph_inode_info* ci = cookie_netfs_data;
+ uint16_t klen;
+
+ /* use ceph virtual inode (id + snapshot) */
+ klen = sizeof(ci->i_vino);
+ if (klen > maxbuf)
+ return 0;
+
+ memcpy(buffer, &ci->i_vino, klen);
+ return klen;
+}
+
+static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ struct ceph_aux_inode aux;
+ const struct ceph_inode_info* ci = cookie_netfs_data;
+ const struct inode* inode = &ci->vfs_inode;
+
+ memset(&aux, 0, sizeof(aux));
+ aux.mtime = inode->i_mtime;
+ aux.size = inode->i_size;
+
+ memcpy(buffer, &aux, sizeof(aux));
+
+ return sizeof(aux);
+}
+
+static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
+ uint64_t *size)
+{
+ const struct ceph_inode_info* ci = cookie_netfs_data;
+ const struct inode* inode = &ci->vfs_inode;
+
+ *size = inode->i_size;
+}
+
+static enum fscache_checkaux ceph_fscache_inode_check_aux(
+ void *cookie_netfs_data, const void *data, uint16_t dlen)
+{
+ struct ceph_aux_inode aux;
+ struct ceph_inode_info* ci = cookie_netfs_data;
+ struct inode* inode = &ci->vfs_inode;
+
+ if (dlen != sizeof(aux))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ memset(&aux, 0, sizeof(aux));
+ aux.mtime = inode->i_mtime;
+ aux.size = inode->i_size;
+
+ if (memcmp(data, &aux, sizeof(aux)) != 0)
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ dout("ceph inode 0x%p cached okay", ci);
+ return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
+{
+ struct ceph_inode_info* ci = cookie_netfs_data;
+ struct pagevec pvec;
+ pgoff_t first;
+ int loop, nr_pages;
+
+ pagevec_init(&pvec, 0);
+ first = 0;
+
+ dout("ceph inode 0x%p now uncached", ci);
+
+ while (1) {
+ nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
+ PAGEVEC_SIZE - pagevec_count(&pvec));
+
+ if (!nr_pages)
+ break;
+
+ for (loop = 0; loop < nr_pages; loop++)
+ ClearPageFsCache(pvec.pages[loop]);
+
+ first = pvec.pages[nr_pages - 1]->index + 1;
+
+ pvec.nr = nr_pages;
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
+ .name = "CEPH.inode",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .get_key = ceph_fscache_inode_get_key,
+ .get_attr = ceph_fscache_inode_get_attr,
+ .get_aux = ceph_fscache_inode_get_aux,
+ .check_aux = ceph_fscache_inode_check_aux,
+ .now_uncached = ceph_fscache_inode_now_uncached,
+};
+
+void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
+ struct ceph_inode_info* ci)
+{
+ struct inode* inode = &ci->vfs_inode;
+
+ /* No caching for filesystem */
+ if (fsc->fscache == NULL)
+ return;
+
+ /* Only cache for regular files that are read only */
+ if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
+ return;
+
+ /* Avoid multiple racing open requests */
+ mutex_lock(&inode->i_mutex);
+
+ if (ci->fscache)
+ goto done;
+
+ ci->fscache = fscache_acquire_cookie(fsc->fscache,
+ &ceph_fscache_inode_object_def,
+ ci, true);
+ fscache_check_consistency(ci->fscache);
+done:
+ mutex_unlock(&inode->i_mutex);
+
+}
+
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+ struct fscache_cookie* cookie;
+
+ if ((cookie = ci->fscache) == NULL)
+ return;
+
+ ci->fscache = NULL;
+
+ fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
+ fscache_relinquish_cookie(cookie, 0);
+}
+
+static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
+{
+ if (!error)
+ SetPageUptodate(page);
+}
+
+static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
+{
+ if (!error)
+ SetPageUptodate(page);
+
+ unlock_page(page);
+}
+
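+/* A cached read is only valid while we still hold the FILE_CACHE cap and
+ * the fscache generation matches the current read cache generation. */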
+static inline int cache_valid(struct ceph_inode_info *ci)
+{
+ return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
+ (ci->i_fscache_gen == ci->i_rdcache_gen));
+}
+
+
+/* Attempt to read from the fscache.
+ *
+ * This function is called from the readpage_nounlock context. DO NOT attempt to
+ * unlock the page here (or in the callback).
+ */
+int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!cache_valid(ci))
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_page(ci->fscache, page,
+ ceph_vfs_readpage_complete, NULL,
+ GFP_KERNEL);
+
+ switch (ret) {
+ case 0: /* Page found */
+ dout("page read submitted\n");
+ return 0;
+ case -ENOBUFS: /* Pages were not found, and can't be */
+ case -ENODATA: /* Pages were not found */
+ dout("page/inode not in cache\n");
+ return ret;
+ default:
+ dout("%s: unknown error ret = %i\n", __func__, ret);
+ return ret;
+ }
+}
+
+int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!cache_valid(ci))
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
+ ceph_vfs_readpage_complete_unlock,
+ NULL, mapping_gfp_mask(mapping));
+
+ switch (ret) {
+ case 0: /* All pages found */
+ dout("all-page read submitted\n");
+ return 0;
+ case -ENOBUFS: /* Some pages were not found, and can't be */
+ case -ENODATA: /* some pages were not found */
+ dout("page/inode not in cache\n");
+ return ret;
+ default:
+ dout("%s: unknown error ret = %i\n", __func__, ret);
+ return ret;
+ }
+}
+
+void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!PageFsCache(page))
+ return;
+
+ if (!cache_valid(ci))
+ return;
+
+ ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
+ if (ret)
+ fscache_uncache_page(ci->fscache, page);
+}
+
+void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (!PageFsCache(page))
+ return;
+
+ fscache_wait_on_page_write(ci->fscache, page);
+ fscache_uncache_page(ci->fscache, page);
+}
+
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+ if (fsc->revalidate_wq)
+ destroy_workqueue(fsc->revalidate_wq);
+
+ fscache_relinquish_cookie(fsc->fscache, 0);
+ fsc->fscache = NULL;
+}
+
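+/* Work item queued by ceph_queue_revalidate(): check that the fscache object
+ * is still consistent with the inode (invalidating it if not) and record the
+ * read cache generation that was validated. */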
+static void ceph_revalidate_work(struct work_struct *work)
+{
+ int issued;
+ u32 orig_gen;
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_revalidate_work);
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ orig_gen = ci->i_rdcache_gen;
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (!(issued & CEPH_CAP_FILE_CACHE)) {
+ dout("revalidate_work lost cache before validation %p\n",
+ inode);
+ goto out;
+ }
+
+ if (!fscache_check_consistency(ci->fscache))
+ fscache_invalidate(ci->fscache);
+
+ spin_lock(&ci->i_ceph_lock);
+ /* Update the new valid generation (backwards sanity check too) */
+ if (orig_gen > ci->i_fscache_gen) {
+ ci->i_fscache_gen = orig_gen;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+out:
+ iput(&ci->vfs_inode);
+}
+
+void ceph_queue_revalidate(struct inode *inode)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
+ return;
+
+ ihold(inode);
+
+ if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
+ &ci->i_revalidate_work)) {
+ dout("ceph_queue_revalidate %p\n", inode);
+ } else {
+ dout("ceph_queue_revalidate %p failed\n)", inode);
+ iput(inode);
+ }
+}
+
+void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+{
+ ci->fscache = NULL;
+ /* The first load is verified at cookie open time */
+ ci->i_fscache_gen = 1;
+ INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
+}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
new file mode 100644
index 00000000000..5ac591bd012
--- /dev/null
+++ b/fs/ceph/cache.h
@@ -0,0 +1,182 @@
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#ifndef _CEPH_CACHE_H
+#define _CEPH_CACHE_H
+
+#ifdef CONFIG_CEPH_FSCACHE
+
+extern struct fscache_netfs ceph_cache_netfs;
+
+int ceph_fscache_register(void);
+void ceph_fscache_unregister(void);
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
+
+void ceph_fscache_inode_init(struct ceph_inode_info *ci);
+void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
+ struct ceph_inode_info* ci);
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
+
+int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
+int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages);
+void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
+void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
+void ceph_queue_revalidate(struct inode *inode);
+
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ fscache_attr_changed(ci->fscache);
+}
+
+static inline void ceph_fscache_invalidate(struct inode *inode)
+{
+ fscache_invalidate(ceph_inode(inode)->fscache);
+}
+
+static inline void ceph_fscache_uncache_page(struct inode *inode,
+ struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return fscache_uncache_page(ci->fscache, page);
+}
+
+static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+ struct inode* inode = page->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return fscache_maybe_release_page(ci->fscache, page, gfp);
+}
+
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+ struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
+ __fscache_uncache_page(ci->fscache, page);
+}
+
+static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+ struct list_head *pages)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return fscache_readpages_cancel(ci->fscache, pages);
+}
+
+#else
+
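+/* No-op stubs for builds without CONFIG_CEPH_FSCACHE, so callers don't
+ * need #ifdefs. */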
+static inline int ceph_fscache_register(void)
+{
+ return 0;
+}
+
+static inline void ceph_fscache_unregister(void)
+{
+}
+
+static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
+{
+ return 0;
+}
+
+static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+}
+
+static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+{
+}
+
+static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
+ struct ceph_inode_info* ci)
+{
+}
+
+static inline void ceph_fscache_uncache_page(struct inode *inode,
+ struct page *pages)
+{
+}
+
+static inline int ceph_readpage_from_fscache(struct inode* inode,
+ struct page *page)
+{
+ return -ENOBUFS;
+}
+
+static inline int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return -ENOBUFS;
+}
+
+static inline void ceph_readpage_to_fscache(struct inode *inode,
+ struct page *page)
+{
+}
+
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
+static inline void ceph_fscache_invalidate(struct inode *inode)
+{
+}
+
+static inline void ceph_invalidate_fscache_page(struct inode *inode,
+ struct page *page)
+{
+}
+
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+}
+
+static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+ return 1;
+}
+
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+ struct page *page)
+{
+}
+
+static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+ struct list_head *pages)
+{
+}
+
+static inline void ceph_queue_revalidate(struct inode *inode)
+{
+}
+
+#endif
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6b61ded701e..1fde164b74b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -10,6 +10,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/decode.h>
#include <linux/ceph/messenger.h>
@@ -147,7 +148,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
spin_unlock(&mdsc->caps_list_lock);
}
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
int i;
@@ -155,7 +156,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
int have;
int alloc = 0;
LIST_HEAD(newcaps);
- int ret = 0;
dout("reserve caps ctx=%p need=%d\n", ctx, need);
@@ -174,14 +174,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
for (i = have; i < need; i++) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap) {
- ret = -ENOMEM;
- goto out_alloc_count;
- }
+ if (!cap)
+ break;
list_add(&cap->caps_item, &newcaps);
alloc++;
}
- BUG_ON(have + alloc != need);
+ /* we didn't manage to reserve as much as we needed */
+ if (have + alloc != need)
+ pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have + alloc);
spin_lock(&mdsc->caps_list_lock);
mdsc->caps_total_count += alloc;
@@ -197,13 +198,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
ctx, mdsc->caps_total_count, mdsc->caps_use_count,
mdsc->caps_reserve_count, mdsc->caps_avail_count);
- return 0;
-
-out_alloc_count:
- /* we didn't manage to reserve as much as we needed */
- pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have);
- return ret;
}
int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -227,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
return 0;
}
-static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
+struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx)
{
struct ceph_cap *cap = NULL;
@@ -236,8 +230,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
if (!ctx) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
if (cap) {
+ spin_lock(&mdsc->caps_list_lock);
mdsc->caps_use_count++;
mdsc->caps_total_count++;
+ spin_unlock(&mdsc->caps_list_lock);
}
return cap;
}
@@ -309,7 +305,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
/*
* Find ceph_cap for given mds, if any.
*
- * Called with i_lock held.
+ * Called with i_ceph_lock held.
*/
static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
{
@@ -332,9 +328,9 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
{
struct ceph_cap *cap;
- spin_lock(&ci->vfs_inode.i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
- spin_unlock(&ci->vfs_inode.i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return cap;
}
@@ -361,15 +357,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
int ceph_get_cap_mds(struct inode *inode)
{
+ struct ceph_inode_info *ci = ceph_inode(inode);
int mds;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
mds = __ceph_get_cap_mds(ceph_inode(inode));
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return mds;
}
/*
- * Called under i_lock.
+ * Called under i_ceph_lock.
*/
static void __insert_cap_node(struct ceph_inode_info *ci,
struct ceph_cap *new)
@@ -415,7 +412,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
*
* If I_FLUSH is set, leave the inode at the front of the list.
*
- * Caller holds i_lock
+ * Caller holds i_ceph_lock
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
@@ -457,7 +454,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
/*
* Cancel delayed work on cap.
*
- * Caller must hold i_lock.
+ * Caller must hold i_ceph_lock.
*/
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
@@ -483,11 +480,12 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* i_rdcache_gen.
*/
if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
- (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
+ (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
+ }
/*
- * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+ * if we are newly issued FILE_SHARED, mark dir not complete; we
* don't know what happened to this directory while we didn't
* have the cap.
*/
@@ -496,7 +494,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
ci->i_shared_gen++;
if (S_ISDIR(ci->vfs_inode.i_mode)) {
dout(" marking %p NOT complete\n", &ci->vfs_inode);
- ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ __ceph_dir_clear_complete(ci);
}
}
}
@@ -510,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* it is < 0. (This is so we can atomically add the cap and add an
* open file reference to it.)
*/
-int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned seq, unsigned mseq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation)
+void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap **new_cap)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *new_cap = NULL;
struct ceph_cap *cap;
int mds = session->s_mds;
int actual_wanted;
@@ -533,42 +530,44 @@ int ceph_add_cap(struct inode *inode,
if (fmode >= 0)
wanted |= ceph_caps_for_mode(fmode);
-retry:
- spin_lock(&inode->i_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
- if (new_cap) {
- cap = new_cap;
- new_cap = NULL;
- } else {
- spin_unlock(&inode->i_lock);
- new_cap = get_cap(mdsc, caps_reservation);
- if (new_cap == NULL)
- return -ENOMEM;
- goto retry;
- }
+ cap = *new_cap;
+ *new_cap = NULL;
cap->issued = 0;
cap->implemented = 0;
cap->mds = mds;
cap->mds_wanted = 0;
+ cap->mseq = 0;
cap->ci = ci;
__insert_cap_node(ci, cap);
- /* clear out old exporting info? (i.e. on cap import) */
- if (ci->i_cap_exporting_mds == mds) {
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
- }
-
/* add to session cap list */
cap->session = session;
spin_lock(&session->s_cap_lock);
list_add_tail(&cap->session_caps, &session->s_caps);
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
+ } else {
+ /*
+ * auth mds of the inode changed. we received the cap export
+ * message, but still haven't received the cap import message.
+ * handle_cap_export() updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
+ * a message that was sent before the cap import message. So
+ * don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != cap_id);
+ seq = cap->seq;
+ mseq = cap->mseq;
+ issued |= cap->issued;
+ flags |= CEPH_CAP_FLAG_AUTH;
+ }
}
if (!ci->i_snap_realm) {
@@ -607,10 +606,15 @@ retry:
__cap_delay_requeue(mdsc, ci);
}
- if (flags & CEPH_CAP_FLAG_AUTH)
- ci->i_auth_cap = cap;
- else if (ci->i_auth_cap == cap)
- ci->i_auth_cap = NULL;
+ if (flags & CEPH_CAP_FLAG_AUTH) {
+ if (ci->i_auth_cap == NULL ||
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
+ ci->i_auth_cap = cap;
+ cap->mds_wanted = wanted;
+ }
+ } else {
+ WARN_ON(ci->i_auth_cap == cap);
+ }
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
@@ -618,7 +622,10 @@ retry:
cap->cap_id = cap_id;
cap->issued = issued;
cap->implemented |= issued;
- cap->mds_wanted |= wanted;
+ if (ceph_seq_cmp(mseq, cap->mseq) > 0)
+ cap->mds_wanted = wanted;
+ else
+ cap->mds_wanted |= wanted;
cap->seq = seq;
cap->issue_seq = seq;
cap->mseq = mseq;
@@ -626,9 +633,6 @@ retry:
if (fmode >= 0)
__ceph_get_fmode(ci, fmode);
- spin_unlock(&inode->i_lock);
- wake_up_all(&ci->i_cap_wq);
- return 0;
}
/*
@@ -641,10 +645,10 @@ static int __cap_is_valid(struct ceph_cap *cap)
unsigned long ttl;
u32 gen;
- spin_lock(&cap->session->s_cap_lock);
+ spin_lock(&cap->session->s_gen_ttl_lock);
gen = cap->session->s_cap_gen;
ttl = cap->session->s_cap_ttl;
- spin_unlock(&cap->session->s_cap_lock);
+ spin_unlock(&cap->session->s_gen_ttl_lock);
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
@@ -663,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
*/
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
- int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+ int have = ci->i_snap_caps;
struct ceph_cap *cap;
struct rb_node *p;
@@ -679,6 +683,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (implemented)
*implemented |= cap->implemented;
}
+ /*
+ * exclude caps issued by a non-auth MDS that are being revoked
+ * by the auth MDS. The non-auth MDS should be revoking/exporting
+ * these caps, but the message is delayed.
+ */
+ if (ci->i_auth_cap) {
+ cap = ci->i_auth_cap;
+ have &= ~cap->implemented | cap->issued;
+ }
return have;
}
@@ -765,7 +778,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (touch) {
struct rb_node *q;
- /* touch this + preceeding caps */
+ /* touch this + preceding caps */
__touch_cap(cap);
for (q = rb_first(&ci->i_caps); q != p;
q = rb_next(q)) {
@@ -786,23 +799,29 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/*
* Return true if mask caps are currently being revoked by an MDS.
*/
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask)
{
- struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct rb_node *p;
- int ret = 0;
- spin_lock(&inode->i_lock);
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (__cap_is_valid(cap) &&
- (cap->implemented & ~cap->issued & mask)) {
- ret = 1;
- break;
- }
+ if (cap != ocap &&
+ (cap->implemented & ~cap->issued & mask))
+ return 1;
}
- spin_unlock(&inode->i_lock);
+ return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_caps_revoking_other(ci, NULL, mask);
+ spin_unlock(&ci->i_ceph_lock);
dout("ceph_caps_revoking %p %s = %d\n", inode,
ceph_cap_string(mask), ret);
return ret;
@@ -819,7 +838,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
- if (ci->i_wrbuffer_ref)
+ if (ci->i_wb_ref || ci->i_wrbuffer_ref)
used |= CEPH_CAP_FILE_BUFFER;
return used;
}
@@ -850,26 +869,41 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap))
continue;
- mds_wanted |= cap->mds_wanted;
+ if (cap == ci->i_auth_cap)
+ mds_wanted |= cap->mds_wanted;
+ else
+ mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
}
return mds_wanted;
}
/*
- * called under i_lock
+ * called under i_ceph_lock
*/
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
{
- return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+ return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+int ceph_is_any_caps(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_is_any_caps(ci);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return ret;
}
/*
* Remove a cap. Take steps to deal with a racing iterate_session_caps.
*
- * caller should hold i_lock.
+ * caller should hold i_ceph_lock.
* caller will not hold session s_mutex if called from destroy_inode.
*/
-void __ceph_remove_cap(struct ceph_cap *cap)
+void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
struct ceph_mds_session *session = cap->session;
struct ceph_inode_info *ci = cap->ci;
@@ -881,6 +915,16 @@ void __ceph_remove_cap(struct ceph_cap *cap)
/* remove from session list */
spin_lock(&session->s_cap_lock);
+ /*
+ * s_cap_reconnect is protected by s_cap_lock. no one changes
+ * s_cap_gen while session is in the reconnect state.
+ */
+ if (queue_release &&
+ (!session->s_cap_reconnect ||
+ cap->cap_gen == session->s_cap_gen))
+ __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
+ cap->mseq, cap->issue_seq);
+
if (session->s_cap_iterator == cap) {
/* not yet, we are iterating over this very cap */
dout("__ceph_remove_cap delaying %p removal from session %p\n",
@@ -928,7 +972,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
u64 time_warp_seq,
- uid_t uid, gid_t gid, mode_t mode,
+ kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
u64 follows)
@@ -944,7 +988,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
seq, issue_seq, mseq, follows, size, max_size,
xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
if (!msg)
return -ENOMEM;
@@ -972,8 +1016,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
ceph_encode_timespec(&fc->atime, atime);
fc->time_warp_seq = cpu_to_le32(time_warp_seq);
- fc->uid = cpu_to_le32(uid);
- fc->gid = cpu_to_le32(gid);
+ fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
+ fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));
fc->mode = cpu_to_le32(mode);
fc->xattr_version = cpu_to_le64(xattr_version);
@@ -987,15 +1031,14 @@ static int send_cap_msg(struct ceph_mds_session *session,
return 0;
}
-static void __queue_cap_release(struct ceph_mds_session *session,
- u64 ino, u64 cap_id, u32 migrate_seq,
- u32 issue_seq)
+void __queue_cap_release(struct ceph_mds_session *session,
+ u64 ino, u64 cap_id, u32 migrate_seq,
+ u32 issue_seq)
{
struct ceph_msg *msg;
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
- spin_lock(&session->s_cap_lock);
BUG_ON(!session->s_num_cap_releases);
msg = list_first_entry(&session->s_cap_releases,
struct ceph_msg, list_head);
@@ -1005,7 +1048,7 @@ static void __queue_cap_release(struct ceph_mds_session *session,
BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
head = msg->front.iov_base;
- head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+ le32_add_cpu(&head->num, 1);
item = msg->front.iov_base + msg->front.iov_len;
item->ino = cpu_to_le64(ino);
item->cap_id = cpu_to_le64(cap_id);
@@ -1024,12 +1067,11 @@ static void __queue_cap_release(struct ceph_mds_session *session,
(int)CEPH_CAPS_PER_RELEASE,
(int)msg->front.iov_len);
}
- spin_unlock(&session->s_cap_lock);
}
/*
* Queue cap releases when an inode is dropped from our cache. Since
- * inode is about to be destroyed, there is no need for i_lock.
+ * inode is about to be destroyed, there is no need for i_ceph_lock.
*/
void ceph_queue_caps_release(struct inode *inode)
{
@@ -1039,18 +1081,14 @@ void ceph_queue_caps_release(struct inode *inode)
p = rb_first(&ci->i_caps);
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
- struct ceph_mds_session *session = cap->session;
-
- __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
- cap->mseq, cap->issue_seq);
p = rb_next(p);
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, true);
}
}
/*
* Send a cap msg on the given inode. Update our caps state, then
- * drop i_lock and send the message.
+ * drop i_ceph_lock and send the message.
*
* Make note of max_size reported/requested from mds, revoked caps
* that have now been implemented.
@@ -1062,13 +1100,13 @@ void ceph_queue_caps_release(struct inode *inode)
* Return non-zero if delayed release, or we experienced an error
* such that the caller should requeue + retry later.
*
- * called with i_lock, then drops it.
+ * called with i_ceph_lock, then drops it.
* caller should hold snap_rwsem (read), s_mutex.
*/
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
int op, int used, int want, int retain, int flushing,
unsigned *pflush_tid)
- __releases(cap->ci->vfs_inode->i_lock)
+ __releases(cap->ci->i_ceph_lock)
{
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->vfs_inode;
@@ -1078,9 +1116,9 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
u64 size, max_size;
struct timespec mtime, atime;
int wake = 0;
- mode_t mode;
- uid_t uid;
- gid_t gid;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
struct ceph_mds_session *session;
u64 xattr_version = 0;
struct ceph_buffer *xattr_blob = NULL;
@@ -1171,7 +1209,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
xattr_version = ci->i_xattrs.version;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
@@ -1199,13 +1237,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* Unless @again is true, skip cap_snaps that were already sent to
* the MDS (i.e., during this session).
*
- * Called under i_lock. Takes s_mutex as needed.
+ * Called under i_ceph_lock. Takes s_mutex as needed.
*/
void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession,
int again)
- __releases(ci->vfs_inode->i_lock)
- __acquires(ci->vfs_inode->i_lock)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
int mds;
@@ -1262,7 +1300,7 @@ retry:
session = NULL;
}
if (!session) {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
mutex_lock(&mdsc->mutex);
session = __ceph_lookup_mds_session(mdsc, mds);
mutex_unlock(&mdsc->mutex);
@@ -1276,7 +1314,7 @@ retry:
* deletion or migration. retry, and we'll
* get a better @mds value next time.
*/
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
goto retry;
}
@@ -1286,7 +1324,7 @@ retry:
list_del_init(&capsnap->flushing_item);
list_add_tail(&capsnap->flushing_item,
&session->s_cap_snaps_flushing);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
inode, capsnap, capsnap->follows, capsnap->flush_tid);
@@ -1303,7 +1341,7 @@ retry:
next_follows = capsnap->follows + 1;
ceph_put_cap_snap(capsnap);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
goto retry;
}
@@ -1323,18 +1361,17 @@ out:
static void ceph_flush_snaps(struct ceph_inode_info *ci)
{
- struct inode *inode = &ci->vfs_inode;
-
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__ceph_flush_snaps(ci, NULL, 0);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
- * Mark caps dirty. If inode is newly dirty, add to the global dirty
- * list.
+ * Mark caps dirty. If inode is newly dirty, return the dirty flags.
+ * Caller is then responsible for calling __mark_inode_dirty with the
+ * returned flags value.
*/
-void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
{
struct ceph_mds_client *mdsc =
ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
@@ -1350,14 +1387,15 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
if (!ci->i_head_snapc)
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
- dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
- ci->i_head_snapc);
+ dout(" inode %p now dirty snapc %p auth cap %p\n",
+ &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ WARN_ON(!ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
spin_unlock(&mdsc->cap_dirty_lock);
if (ci->i_flushing_caps == 0) {
- igrab(inode);
+ ihold(inode);
dirty |= I_DIRTY_SYNC;
}
}
@@ -1365,16 +1403,15 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
- if (dirty)
- __mark_inode_dirty(inode, dirty);
__cap_delay_requeue(mdsc, ci);
+ return dirty;
}
/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
*
- * Called under i_lock.
+ * Called under i_ceph_lock.
*/
static int __mark_caps_flushing(struct inode *inode,
struct ceph_mds_session *session)
@@ -1422,9 +1459,9 @@ static int try_nonblocking_invalidate(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
u32 invalidating_gen = ci->i_rdcache_gen;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
invalidate_mapping_pages(&inode->i_data, 0, -1);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (inode->i_data.nrpages == 0 &&
invalidating_gen == ci->i_rdcache_gen) {
@@ -1456,7 +1493,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
- int file_wanted, used;
+ int file_wanted, used, cap_used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list
@@ -1471,7 +1508,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
if (mdsc->stopping)
is_delayed = 1;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
@@ -1481,7 +1518,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
__ceph_flush_snaps(ci, &session, 0);
goto retry_locked;
retry:
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
retry_locked:
file_wanted = __ceph_caps_file_wanted(ci);
used = __ceph_caps_used(ci);
@@ -1559,9 +1596,14 @@ retry_locked:
/* NOTE: no side-effects allowed, until we take s_mutex */
+ cap_used = used;
+ if (ci->i_auth_cap && cap != ci->i_auth_cap)
+ cap_used &= ~ci->i_auth_cap->issued;
+
revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+ dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
cap->mds, cap, ceph_cap_string(cap->issued),
+ ceph_cap_string(cap_used),
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
@@ -1589,7 +1631,7 @@ retry_locked:
}
/* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & used) == 0) {
+ if (revoking && (revoking & cap_used) == 0) {
dout("completed revocation of %s\n",
ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
@@ -1635,7 +1677,7 @@ ack:
if (mutex_trylock(&session->s_mutex) == 0) {
dout("inverting session/ino locks on %p\n",
session);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (took_snap_rwsem) {
up_read(&mdsc->snap_rwsem);
took_snap_rwsem = 0;
@@ -1649,7 +1691,7 @@ ack:
if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
dout("inverting snap/in locks on %p\n",
inode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
down_read(&mdsc->snap_rwsem);
took_snap_rwsem = 1;
goto retry;
@@ -1665,10 +1707,10 @@ ack:
mds = cap->mds; /* remember mds, so we don't repeat */
sent++;
- /* __send_cap drops i_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
- retain, flushing, NULL);
- goto retry; /* retake i_lock and restart our cap scan. */
+ /* __send_cap drops i_ceph_lock */
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+ want, retain, flushing, NULL);
+ goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
/*
@@ -1682,7 +1724,7 @@ ack:
else if (!is_delayed || force_requeue)
__cap_delay_requeue(mdsc, ci);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (queue_invalidate)
ceph_queue_invalidate(inode);
@@ -1696,16 +1738,15 @@ ack:
/*
* Try to flush dirty caps back to the auth mds.
*/
-static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
- unsigned *flush_tid)
+static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- int unlock_session = session ? 0 : 1;
int flushing = 0;
+ struct ceph_mds_session *session = NULL;
retry:
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
goto out;
@@ -1716,32 +1757,33 @@ retry:
int want = __ceph_caps_wanted(ci);
int delayed;
- if (!session) {
- spin_unlock(&inode->i_lock);
+ if (!session || session != cap->session) {
+ spin_unlock(&ci->i_ceph_lock);
+ if (session)
+ mutex_unlock(&session->s_mutex);
session = cap->session;
mutex_lock(&session->s_mutex);
goto retry;
}
- BUG_ON(session != cap->session);
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
flushing = __mark_caps_flushing(inode, session);
- /* __send_cap drops i_lock */
+ /* __send_cap drops i_ceph_lock */
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
cap->issued | cap->implemented, flushing,
flush_tid);
if (!delayed)
goto out_unlocked;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
}
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
out_unlocked:
- if (session && unlock_session)
+ if (session)
mutex_unlock(&session->s_mutex);
return flushing;
}
@@ -1754,7 +1796,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid)
struct ceph_inode_info *ci = ceph_inode(inode);
int i, ret = 1;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_CAP_BITS; i++)
if ((ci->i_flushing_caps & (1 << i)) &&
ci->i_cap_flush_tid[i] <= tid) {
@@ -1762,7 +1804,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid)
ret = 0;
break;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ret;
}
@@ -1810,7 +1852,7 @@ out:
spin_unlock(&ci->i_unsafe_lock);
}
-int ceph_fsync(struct file *file, int datasync)
+int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1821,11 +1863,12 @@ int ceph_fsync(struct file *file, int datasync)
dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
sync_write_wait(inode);
- ret = filemap_write_and_wait(inode->i_mapping);
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0)
return ret;
+ mutex_lock(&inode->i_mutex);
- dirty = try_flush_caps(inode, NULL, &flush_tid);
+ dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
/*
@@ -1840,6 +1883,7 @@ int ceph_fsync(struct file *file, int datasync)
}
dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+ mutex_unlock(&inode->i_mutex);
return ret;
}
@@ -1859,7 +1903,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
dout("write_inode %p wait=%d\n", inode, wait);
if (wait) {
- dirty = try_flush_caps(inode, NULL, &flush_tid);
+ dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
caps_are_flushed(inode, flush_tid));
@@ -1867,10 +1911,10 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (__ceph_caps_dirty(ci))
__cap_delay_requeue_front(mdsc, ci);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
return err;
}
@@ -1893,7 +1937,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (cap && cap->session == session) {
dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
@@ -1903,7 +1947,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
pr_err("%p auth cap %p not mds%d ???\n", inode,
cap, session->s_mds);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
}
@@ -1920,7 +1964,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_cap *cap;
int delayed = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (cap && cap->session == session) {
dout("kick_flushing_caps %p cap %p %s\n", inode,
@@ -1931,14 +1975,14 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap->issued | cap->implemented,
ci->i_flushing_caps, NULL);
if (delayed) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
} else {
pr_err("%p auth cap %p not mds%d ???\n", inode,
cap, session->s_mds);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
}
}
@@ -1951,24 +1995,31 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
struct ceph_cap *cap;
int delayed = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
__ceph_flush_snaps(ci, &session, 1);
+
if (ci->i_flushing_caps) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &cap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
cap->issued | cap->implemented,
ci->i_flushing_caps, NULL);
if (delayed) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
} else {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
}
@@ -1977,7 +2028,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
* Take references to capabilities we hold, so that we don't release
* them to the MDS prematurely.
*
- * Protected by i_lock.
+ * Protected by i_ceph_lock.
*/
static void __take_cap_refs(struct ceph_inode_info *ci, int got)
{
@@ -1990,11 +2041,11 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
if (got & CEPH_CAP_FILE_WR)
ci->i_wr_ref++;
if (got & CEPH_CAP_FILE_BUFFER) {
- if (ci->i_wrbuffer_ref == 0)
- igrab(&ci->vfs_inode);
- ci->i_wrbuffer_ref++;
- dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
- &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
+ if (ci->i_wb_ref == 0)
+ ihold(&ci->vfs_inode);
+ ci->i_wb_ref++;
+ dout("__take_cap_refs %p wb %d -> %d (?)\n",
+ &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2015,7 +2066,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
dout("get_cap_refs %p need %s want %s\n", inode,
ceph_cap_string(need), ceph_cap_string(want));
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
/* make sure file is actually open */
file_wanted = __ceph_caps_file_wanted(ci);
@@ -2027,11 +2078,20 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
goto out;
}
- if (need & CEPH_CAP_FILE_WR) {
+ /* finish pending truncate */
+ while (ci->i_truncate_pending) {
+ spin_unlock(&ci->i_ceph_lock);
+ __ceph_do_pending_vmtruncate(inode);
+ spin_lock(&ci->i_ceph_lock);
+ }
+
+ have = __ceph_caps_issued(ci, &implemented);
+
+ if (have & need & CEPH_CAP_FILE_WR) {
if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
- if (endoff > ci->i_wanted_max_size) {
+ if (endoff > ci->i_requested_max_size) {
*check_max = 1;
ret = 1;
}
@@ -2046,13 +2106,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
goto out;
}
}
- have = __ceph_caps_issued(ci, &implemented);
-
- /*
- * disallow writes while a truncate is pending
- */
- if (ci->i_truncate_pending)
- have &= ~CEPH_CAP_FILE_WR;
if ((have & need) == need) {
/*
@@ -2076,7 +2129,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
ceph_cap_string(have), ceph_cap_string(need));
}
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("get_cap_refs %p ret %d got %s\n", inode,
ret, ceph_cap_string(*got));
return ret;
@@ -2093,16 +2146,19 @@ static void check_max_size(struct inode *inode, loff_t endoff)
int check = 0;
/* do we need to explicitly request a larger max_size? */
- spin_lock(&inode->i_lock);
- if ((endoff >= ci->i_max_size ||
- endoff > (inode->i_size << 1)) &&
- endoff > ci->i_wanted_max_size) {
+ spin_lock(&ci->i_ceph_lock);
+ if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
dout("write %p at large endoff %llu, req max_size\n",
inode, endoff);
ci->i_wanted_max_size = endoff;
- check = 1;
}
- spin_unlock(&inode->i_lock);
+ /* duplicate ceph_check_caps()'s logic */
+ if (ci->i_auth_cap &&
+ (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
+ ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size)
+ check = 1;
+ spin_unlock(&ci->i_ceph_lock);
if (check)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
@@ -2139,9 +2195,9 @@ retry:
*/
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
- spin_lock(&ci->vfs_inode.i_lock);
+ spin_lock(&ci->i_ceph_lock);
__take_cap_refs(ci, caps);
- spin_unlock(&ci->vfs_inode.i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -2159,7 +2215,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
int last = 0, put = 0, flushsnaps = 0, wake = 0;
struct ceph_cap_snap *capsnap;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (had & CEPH_CAP_PIN)
--ci->i_pin_ref;
if (had & CEPH_CAP_FILE_RD)
@@ -2169,12 +2225,12 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (--ci->i_rdcache_ref == 0)
last++;
if (had & CEPH_CAP_FILE_BUFFER) {
- if (--ci->i_wrbuffer_ref == 0) {
+ if (--ci->i_wb_ref == 0) {
last++;
put++;
}
- dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
- inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
+ dout("put_cap_refs %p wb %d -> %d (?)\n",
+ inode, ci->i_wb_ref+1, ci->i_wb_ref);
}
if (had & CEPH_CAP_FILE_WR)
if (--ci->i_wr_ref == 0) {
@@ -2192,7 +2248,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
}
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
last ? " last" : "", put ? " put" : "");
@@ -2224,7 +2280,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
int found = 0;
struct ceph_cap_snap *capsnap = NULL;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_wrbuffer_ref -= nr;
last = !ci->i_wrbuffer_ref;
@@ -2273,7 +2329,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (last) {
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
@@ -2287,41 +2343,88 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
}
/*
+ * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
+ */
+static void invalidate_aliases(struct inode *inode)
+{
+ struct dentry *dn, *prev = NULL;
+
+ dout("invalidate_aliases inode %p\n", inode);
+ d_prune_aliases(inode);
+ /*
+ * For non-directory inode, d_find_alias() only returns
+ * hashed dentry. After calling d_invalidate(), the
+ * dentry becomes unhashed.
+ *
+ * For directory inode, d_find_alias() can return
+ * unhashed dentry. But directory inode should have
+ * one alias at most.
+ */
+ while ((dn = d_find_alias(inode))) {
+ if (dn == prev) {
+ dput(dn);
+ break;
+ }
+ d_invalidate(dn);
+ if (prev)
+ dput(prev);
+ prev = dn;
+ }
+ if (prev)
+ dput(prev);
+}
+
+/*
* Handle a cap GRANT message from the MDS. (Note that a GRANT may
* actually be a revocation if it specifies a smaller cap set.)
*
- * caller holds s_mutex and i_lock, we drop both.
- *
- * return value:
- * 0 - ok
- * 1 - check_caps on auth cap only (writeback)
- * 2 - check_caps (ack revoke)
+ * caller holds s_mutex and i_ceph_lock, we drop both.
*/
-static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+static void handle_cap_grant(struct ceph_mds_client *mdsc,
+ struct inode *inode, struct ceph_mds_caps *grant,
+ void *snaptrace, int snaptrace_len,
+ struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap,
- struct ceph_buffer *xattr_buf)
- __releases(inode->i_lock)
+ struct ceph_cap *cap, int issued)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
- int issued, implemented, used, wanted, dirty;
+ int used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
struct timespec mtime, atime, ctime;
int check_caps = 0;
- int wake = 0;
- int writeback = 0;
- int revoked_rdcache = 0;
- int queue_invalidate = 0;
+ bool wake = 0;
+ bool writeback = 0;
+ bool queue_trunc = 0;
+ bool queue_invalidate = 0;
+ bool queue_revalidate = 0;
+ bool deleted_inode = 0;
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, mds, seq, ceph_cap_string(newcaps));
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
inode->i_size);
+
+ /*
+ * auth mds of the inode changed. we received the cap export message,
+ * but still haven't received the cap import message. handle_cap_export
+ * updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
+ * that was sent before the cap import message. So don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
+ seq = cap->seq;
+ newcaps |= cap->issued;
+ }
+
/*
* If CACHE is being revoked, and we have no dirty buffers,
* try to invalidate (once). (If there are dirty buffers, we
@@ -2330,9 +2433,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!ci->i_wrbuffer_ref) {
- if (try_nonblocking_invalidate(inode) == 0) {
- revoked_rdcache = 1;
- } else {
+ if (try_nonblocking_invalidate(inode)) {
/* there were locked pages.. invalidate later
in a separate thread. */
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
@@ -2340,27 +2441,33 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
}
+
+ ceph_fscache_invalidate(inode);
}
/* side effects now are allowed */
-
- issued = __ceph_caps_issued(ci, &implemented);
- issued |= implemented | __ceph_caps_dirty(ci);
-
cap->cap_gen = session->s_cap_gen;
+ cap->seq = seq;
__check_cap_issue(ci, cap, newcaps);
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(grant->mode);
- inode->i_uid = le32_to_cpu(grant->uid);
- inode->i_gid = le32_to_cpu(grant->gid);
+ inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
+ inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- inode->i_uid, inode->i_gid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
- inode->i_nlink = le32_to_cpu(grant->nlink);
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0) {
+ set_nlink(inode, le32_to_cpu(grant->nlink));
+ if (inode->i_nlink == 0 &&
+ (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ deleted_inode = 1;
+ }
if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
int len = le32_to_cpu(grant->xattr_len);
@@ -2373,29 +2480,44 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_buffer_put(ci->i_xattrs.blob);
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
ci->i_xattrs.version = version;
+ ceph_forget_all_cached_acls(inode);
}
}
- /* size/ctime/mtime/atime? */
- ceph_fill_file_size(inode, issued,
- le32_to_cpu(grant->truncate_seq),
- le64_to_cpu(grant->truncate_size), size);
- ceph_decode_timespec(&mtime, &grant->mtime);
- ceph_decode_timespec(&atime, &grant->atime);
- ceph_decode_timespec(&ctime, &grant->ctime);
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
- &atime);
-
- /* max size increase? */
- if (max_size != ci->i_max_size) {
- dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
- ci->i_max_size = max_size;
- if (max_size >= ci->i_wanted_max_size) {
- ci->i_wanted_max_size = 0; /* reset */
- ci->i_requested_max_size = 0;
+ /* Do we need to revalidate our fscache cookie? Don't bother on the
+ * first cache cap as we already validate at cookie creation time. */
+ if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
+ queue_revalidate = 1;
+
+ if (newcaps & CEPH_CAP_ANY_RD) {
+ /* ctime/mtime/atime? */
+ ceph_decode_timespec(&mtime, &grant->mtime);
+ ceph_decode_timespec(&atime, &grant->atime);
+ ceph_decode_timespec(&ctime, &grant->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(grant->time_warp_seq),
+ &ctime, &mtime, &atime);
+ }
+
+ if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+ /* file layout may have changed */
+ ci->i_layout = grant->layout;
+ /* size/truncate_seq? */
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(grant->truncate_seq),
+ le64_to_cpu(grant->truncate_size),
+ size);
+ /* max size increase? */
+ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+ dout("max_size %lld -> %llu\n",
+ ci->i_max_size, max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
+ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ }
+ wake = 1;
}
- wake = 1;
}
/* check cap bits */
@@ -2410,14 +2532,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
dout("mds wanted %s -> %s\n",
ceph_cap_string(le32_to_cpu(grant->wanted)),
ceph_cap_string(wanted));
- grant->wanted = cpu_to_le32(wanted);
+ /* imported cap may not have correct mds_wanted */
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+ check_caps = 1;
}
- cap->seq = seq;
-
- /* file layout may have changed */
- ci->i_layout = grant->layout;
-
/* revocation, grant, or no-op? */
if (cap->issued & ~newcaps) {
int revoking = cap->issued & ~newcaps;
@@ -2444,6 +2563,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
} else {
dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
ceph_cap_string(newcaps));
+ /* non-auth MDS is revoking the newly granted caps? */
+ if (cap == ci->i_auth_cap &&
+ __ceph_caps_revoking_other(ci, cap, newcaps))
+ check_caps = 2;
+
cap->issued = newcaps;
cap->implemented |= newcaps; /* add bits only, to
* avoid stepping on a
@@ -2452,7 +2576,25 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
BUG_ON(cap->issued & ~cap->implemented);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace,
+ snaptrace + snaptrace_len, false);
+ downgrade_write(&mdsc->snap_rwsem);
+ kick_flushing_inode_caps(mdsc, session, inode);
+ up_read(&mdsc->snap_rwsem);
+ if (newcaps & ~issued)
+ wake = 1;
+ }
+
+ if (queue_trunc) {
+ ceph_queue_vmtruncate(inode);
+ ceph_queue_revalidate(inode);
+ } else if (queue_revalidate)
+ ceph_queue_revalidate(inode);
+
if (writeback)
/*
* queue inode for writeback: we can't actually call
@@ -2462,6 +2604,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_queue_writeback(inode);
if (queue_invalidate)
ceph_queue_invalidate(inode);
+ if (deleted_inode)
+ invalidate_aliases(inode);
if (wake)
wake_up_all(&ci->i_cap_wq);
@@ -2482,7 +2626,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_caps *m,
struct ceph_mds_session *session,
struct ceph_cap *cap)
- __releases(inode->i_lock)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -2538,7 +2682,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
wake_up_all(&ci->i_cap_wq);
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (drop)
iput(inode);
}
@@ -2561,7 +2705,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
inode, ci, session->s_mds, follows);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->follows == follows) {
if (capsnap->flush_tid != flush_tid) {
@@ -2584,7 +2728,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
capsnap, capsnap->follows);
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (drop)
iput(inode);
}
@@ -2597,7 +2741,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
static void handle_cap_trunc(struct inode *inode,
struct ceph_mds_caps *trunc,
struct ceph_mds_session *session)
- __releases(inode->i_lock)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
@@ -2616,10 +2760,12 @@ static void handle_cap_trunc(struct inode *inode,
inode, mds, seq, truncate_size, truncate_seq);
queue_trunc = ceph_fill_file_size(inode, issued,
truncate_seq, truncate_size, size);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
- if (queue_trunc)
+ if (queue_trunc) {
ceph_queue_vmtruncate(inode);
+ ceph_fscache_invalidate(inode);
+ }
}
/*
@@ -2631,101 +2777,200 @@ static void handle_cap_trunc(struct inode *inode,
* caller holds s_mutex
*/
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
- struct ceph_mds_session *session,
- int *open_target_sessions)
+ struct ceph_mds_cap_peer *ph,
+ struct ceph_mds_session *session)
{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *tsession = NULL;
+ struct ceph_cap *cap, *tcap, *new_cap = NULL;
struct ceph_inode_info *ci = ceph_inode(inode);
- int mds = session->s_mds;
+ u64 t_cap_id;
unsigned mseq = le32_to_cpu(ex->migrate_seq);
- struct ceph_cap *cap = NULL, *t;
- struct rb_node *p;
- int remember = 1;
+ unsigned t_seq, t_mseq;
+ int target, issued;
+ int mds = session->s_mds;
- dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
+ if (ph) {
+ t_cap_id = le64_to_cpu(ph->cap_id);
+ t_seq = le32_to_cpu(ph->seq);
+ t_mseq = le32_to_cpu(ph->mseq);
+ target = le32_to_cpu(ph->mds);
+ } else {
+ t_cap_id = t_seq = t_mseq = 0;
+ target = -1;
+ }
- spin_lock(&inode->i_lock);
+ dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
+ inode, ci, mds, mseq, target);
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
+ goto out_unlock;
- /* make sure we haven't seen a higher mseq */
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- t = rb_entry(p, struct ceph_cap, ci_node);
- if (ceph_seq_cmp(t->mseq, mseq) > 0) {
- dout(" higher mseq on cap from mds%d\n",
- t->session->s_mds);
- remember = 0;
+ if (target < 0) {
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
+ }
+
+ /*
+ * now we know we haven't received the cap import message yet
+	 * because the exported cap still exists.
+ */
+
+ issued = cap->issued;
+ WARN_ON(issued != cap->implemented);
+
+ tcap = __get_cap_for_mds(ci, target);
+ if (tcap) {
+ /* already have caps from the target */
+ if (tcap->cap_id != t_cap_id ||
+ ceph_seq_cmp(tcap->seq, t_seq) < 0) {
+ dout(" updating import cap %p mds%d\n", tcap, target);
+ tcap->cap_id = t_cap_id;
+ tcap->seq = t_seq - 1;
+ tcap->issue_seq = t_seq - 1;
+ tcap->mseq = t_mseq;
+ tcap->issued |= issued;
+ tcap->implemented |= issued;
+ if (cap == ci->i_auth_cap)
+ ci->i_auth_cap = tcap;
+ if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &tcap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
}
- if (t->session->s_mds == mds)
- cap = t;
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
+ } else if (tsession) {
+		/* add placeholder for the export target */
+ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+ ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+ t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
}
- if (cap) {
- if (remember) {
- /* make note */
- ci->i_cap_exporting_mds = mds;
- ci->i_cap_exporting_mseq = mseq;
- ci->i_cap_exporting_issued = cap->issued;
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&session->s_mutex);
- /*
- * make sure we have open sessions with all possible
- * export targets, so that we get the matching IMPORT
- */
- *open_target_sessions = 1;
+ /* open target session */
+ tsession = ceph_mdsc_open_export_target_session(mdsc, target);
+ if (!IS_ERR(tsession)) {
+ if (mds > target) {
+ mutex_lock(&session->s_mutex);
+ mutex_lock_nested(&tsession->s_mutex,
+ SINGLE_DEPTH_NESTING);
+ } else {
+ mutex_lock(&tsession->s_mutex);
+ mutex_lock_nested(&session->s_mutex,
+ SINGLE_DEPTH_NESTING);
}
- __ceph_remove_cap(cap);
+ ceph_add_cap_releases(mdsc, tsession);
+ new_cap = ceph_get_cap(mdsc, NULL);
+ } else {
+ WARN_ON(1);
+ tsession = NULL;
+ target = -1;
}
- /* else, we already released it */
+ goto retry;
- spin_unlock(&inode->i_lock);
+out_unlock:
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&session->s_mutex);
+ if (tsession) {
+ mutex_unlock(&tsession->s_mutex);
+ ceph_put_mds_session(tsession);
+ }
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
}
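Worth noting about the hunk above: once the exported cap is found but the target session is not yet open, both s_mutex locks end up held at the same time, so the code orders the two acquisitions by mds rank and marks the second one with SINGLE_DEPTH_NESTING for lockdep. A generic sketch of that ordering rule, assuming the two sessions are distinct (lock_two_sessions is a hypothetical helper, not something the patch adds):

static void lock_two_sessions(struct ceph_mds_session *s1,
			      struct ceph_mds_session *s2)
{
	/* deterministic order by mds rank avoids ABBA deadlocks */
	if (s1->s_mds > s2->s_mds) {
		mutex_lock(&s1->s_mutex);
		mutex_lock_nested(&s2->s_mutex, SINGLE_DEPTH_NESTING);
	} else {
		mutex_lock(&s2->s_mutex);
		mutex_lock_nested(&s1->s_mutex, SINGLE_DEPTH_NESTING);
	}
}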
/*
- * Handle cap IMPORT. If there are temp bits from an older EXPORT,
- * clean them up.
+ * Handle cap IMPORT.
*
- * caller holds s_mutex.
+ * caller holds s_mutex. acquires i_ceph_lock
*/
static void handle_cap_import(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *im,
+ struct ceph_mds_cap_peer *ph,
struct ceph_mds_session *session,
- void *snaptrace, int snaptrace_len)
+ struct ceph_cap **target_cap, int *old_issued)
+ __acquires(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap, *ocap, *new_cap = NULL;
int mds = session->s_mds;
- unsigned issued = le32_to_cpu(im->caps);
+ int issued;
+ unsigned caps = le32_to_cpu(im->caps);
unsigned wanted = le32_to_cpu(im->wanted);
unsigned seq = le32_to_cpu(im->seq);
unsigned mseq = le32_to_cpu(im->migrate_seq);
u64 realmino = le64_to_cpu(im->realm);
u64 cap_id = le64_to_cpu(im->cap_id);
+ u64 p_cap_id;
+ int peer;
- if (ci->i_cap_exporting_mds >= 0 &&
- ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d"
- " - cleared exporting from mds%d\n",
- inode, ci, mds, mseq,
- ci->i_cap_exporting_mds);
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
+ if (ph) {
+ p_cap_id = le64_to_cpu(ph->cap_id);
+ peer = le32_to_cpu(ph->mds);
} else {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
+ p_cap_id = 0;
+ peer = -1;
+ }
+
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
+ inode, ci, mds, mseq, peer);
+
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (!new_cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ new_cap = ceph_get_cap(mdsc, NULL);
+ goto retry;
+ }
+ cap = new_cap;
+ } else {
+ if (new_cap) {
+ ceph_put_cap(mdsc, new_cap);
+ new_cap = NULL;
+ }
}
- down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
- false);
- downgrade_write(&mdsc->snap_rwsem);
- ceph_add_cap(inode, session, cap_id, -1,
- issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
- NULL /* no caps context */);
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+
+ ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+ realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
+
+ ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
+ if (ocap && ocap->cap_id == p_cap_id) {
+ dout(" remove export cap %p mds%d flags %d\n",
+ ocap, peer, ph->flags);
+ if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
+ (ocap->seq != le32_to_cpu(ph->seq) ||
+ ocap->mseq != le32_to_cpu(ph->mseq))) {
+ pr_err("handle_cap_import: mismatched seq/mseq: "
+ "ino (%llx.%llx) mds%d seq %d mseq %d "
+ "importer mds%d has peer seq %d mseq %d\n",
+ ceph_vinop(inode), peer, ocap->seq,
+ ocap->mseq, mds, le32_to_cpu(ph->seq),
+ le32_to_cpu(ph->mseq));
+ }
+ __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ }
/* make sure we re-request max_size, if necessary */
- spin_lock(&inode->i_lock);
+ ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
- spin_unlock(&inode->i_lock);
+
+ *old_issued = issued;
+ *target_cap = cap;
}
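The import path above also shows the usual preallocate-and-retry idiom: ceph_get_cap() can sleep, so when no cap exists for this MDS the spinlock is dropped, a cap is allocated, and the lookup is repeated under the lock. A minimal sketch of the same idiom with hypothetical names and types:

struct thing *lookup_or_create(struct table *tbl, u64 key)
{
	struct thing *t, *prealloc = NULL;

retry:
	spin_lock(&tbl->lock);
	t = __lookup(tbl, key);			/* must run under the lock */
	if (!t) {
		if (!prealloc) {
			spin_unlock(&tbl->lock);
			prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
			if (!prealloc)
				return NULL;
			goto retry;		/* re-check under the lock */
		}
		prealloc->key = key;
		__insert(tbl, prealloc);
		t = prealloc;
		prealloc = NULL;
	}
	spin_unlock(&tbl->lock);
	kfree(prealloc);			/* unused if we lost the race */
	return t;
}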
/*
@@ -2740,10 +2985,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_mds_client *mdsc = session->s_mdsc;
struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
+ struct ceph_inode_info *ci;
struct ceph_cap *cap;
struct ceph_mds_caps *h;
+ struct ceph_mds_cap_peer *peer = NULL;
int mds = session->s_mds;
- int op;
+ int op, issued;
u32 seq, mseq;
struct ceph_vino vino;
u64 cap_id;
@@ -2752,12 +2999,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
void *snaptrace;
size_t snaptrace_len;
void *flock;
+ void *end;
u32 flock_len;
- int open_target_sessions = 0;
dout("handle_caps from mds%d\n", mds);
/* decode */
+ end = msg->front.iov_base + msg->front.iov_len;
tid = le64_to_cpu(msg->hdr.tid);
if (msg->front.iov_len < sizeof(*h))
goto bad;
@@ -2775,32 +3023,50 @@ void ceph_handle_caps(struct ceph_mds_session *session,
snaptrace_len = le32_to_cpu(h->snap_trace_len);
if (le16_to_cpu(msg->hdr.version) >= 2) {
- void *p, *end;
-
- p = snaptrace + snaptrace_len;
- end = msg->front.iov_base + msg->front.iov_len;
+ void *p = snaptrace + snaptrace_len;
ceph_decode_32_safe(&p, end, flock_len, bad);
+ if (p + flock_len > end)
+ goto bad;
flock = p;
} else {
flock = NULL;
flock_len = 0;
}
+ if (le16_to_cpu(msg->hdr.version) >= 3) {
+ if (op == CEPH_CAP_OP_IMPORT) {
+ void *p = flock + flock_len;
+ if (p + sizeof(*peer) > end)
+ goto bad;
+ peer = p;
+ } else if (op == CEPH_CAP_OP_EXPORT) {
+ /* recorded in unused fields */
+ peer = (void *)&h->size;
+ }
+ }
+
mutex_lock(&session->s_mutex);
session->s_seq++;
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
+ if (op == CEPH_CAP_OP_IMPORT)
+ ceph_add_cap_releases(mdsc, session);
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
+ ci = ceph_inode(inode);
dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
vino.snap, inode);
if (!inode) {
dout(" i don't have ino %llx\n", vino.ino);
- if (op == CEPH_CAP_OP_IMPORT)
+ if (op == CEPH_CAP_OP_IMPORT) {
+ spin_lock(&session->s_cap_lock);
__queue_cap_release(session, vino.ino, cap_id,
mseq, seq);
+ spin_unlock(&session->s_cap_lock);
+ }
goto flush_cap_releases;
}
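The decode changes above add explicit bounds checks against the end of the message buffer before the flock blob and the cap peer struct are dereferenced. A standalone illustration of the same defensive pattern (plain C; byte-order handling omitted for brevity):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* advance *p past a length-prefixed blob, refusing to read beyond 'end' */
static int decode_blob(const uint8_t **p, const uint8_t *end,
		       const uint8_t **blob, uint32_t *blob_len)
{
	uint32_t len;

	if (end - *p < (ptrdiff_t)sizeof(len))
		return -1;			/* truncated length field */
	memcpy(&len, *p, sizeof(len));
	*p += sizeof(len);
	if ((size_t)(end - *p) < len)
		return -1;			/* payload overruns the buffer */
	*blob = *p;
	*blob_len = len;
	*p += len;
	return 0;
}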
@@ -2811,31 +3077,35 @@ void ceph_handle_caps(struct ceph_mds_session *session,
goto done;
case CEPH_CAP_OP_EXPORT:
- handle_cap_export(inode, h, session, &open_target_sessions);
- goto done;
+ handle_cap_export(inode, h, peer, session);
+ goto done_unlocked;
case CEPH_CAP_OP_IMPORT:
- handle_cap_import(mdsc, inode, h, session,
- snaptrace, snaptrace_len);
- ceph_check_caps(ceph_inode(inode), 0, session);
+ handle_cap_import(mdsc, inode, h, peer, session,
+ &cap, &issued);
+ handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
+ msg->middle, session, cap, issued);
goto done_unlocked;
}
/* the rest require a cap */
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ceph_inode(inode), mds);
if (!cap) {
dout(" no cap on %p ino %llx.%llx from mds%d\n",
inode, ceph_ino(inode), ceph_snap(inode), mds);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
goto flush_cap_releases;
}
- /* note that each of these drops i_lock for us */
+ /* note that each of these drops i_ceph_lock for us */
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
- handle_cap_grant(inode, h, session, cap, msg->middle);
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+ handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle,
+ session, cap, issued);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
@@ -2847,7 +3117,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
break;
default:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
ceph_cap_op_name(op));
}
@@ -2868,8 +3138,6 @@ done:
done_unlocked:
if (inode)
iput(inode);
- if (open_target_sessions)
- ceph_mdsc_open_export_target_sessions(mdsc, session);
return;
bad:
@@ -2910,47 +3178,24 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
*/
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
- struct ceph_inode_info *ci, *nci = NULL;
- struct inode *inode, *ninode = NULL;
- struct list_head *p, *n;
+ struct ceph_inode_info *ci;
+ struct inode *inode;
dout("flush_dirty_caps\n");
spin_lock(&mdsc->cap_dirty_lock);
- list_for_each_safe(p, n, &mdsc->cap_dirty) {
- if (nci) {
- ci = nci;
- inode = ninode;
- ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
- dout("flush_dirty_caps inode %p (was next inode)\n",
- inode);
- } else {
- ci = list_entry(p, struct ceph_inode_info,
- i_dirty_item);
- inode = igrab(&ci->vfs_inode);
- BUG_ON(!inode);
- dout("flush_dirty_caps inode %p\n", inode);
- }
- if (n != &mdsc->cap_dirty) {
- nci = list_entry(n, struct ceph_inode_info,
- i_dirty_item);
- ninode = igrab(&nci->vfs_inode);
- BUG_ON(!ninode);
- nci->i_ceph_flags |= CEPH_I_NOFLUSH;
- dout("flush_dirty_caps next inode %p, noflush\n",
- ninode);
- } else {
- nci = NULL;
- ninode = NULL;
- }
+ while (!list_empty(&mdsc->cap_dirty)) {
+ ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
+ i_dirty_item);
+ inode = &ci->vfs_inode;
+ ihold(inode);
+ dout("flush_dirty_caps %p\n", inode);
spin_unlock(&mdsc->cap_dirty_lock);
- if (inode) {
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
- NULL);
- iput(inode);
- }
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+ iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
spin_unlock(&mdsc->cap_dirty_lock);
+ dout("flush_dirty_caps done\n");
}
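The rewritten ceph_flush_dirty_caps() above replaces the old look-ahead/igrab scheme with a simpler head-of-list loop: pin the first entry, drop cap_dirty_lock for the work that may sleep, then re-take the lock. A generic sketch of that idiom (names are hypothetical; the per-item work is expected to unlink the entry, as ceph_check_caps() does when it flushes):

static void drain_dirty_list(struct list_head *head, spinlock_t *lock)
{
	struct item *it;

	spin_lock(lock);
	while (!list_empty(head)) {
		it = list_first_entry(head, struct item, node);
		item_get(it);		/* pin it across the unlocked window */
		spin_unlock(lock);
		item_flush(it);		/* may sleep; removes it from head */
		item_put(it);
		spin_lock(lock);
	}
	spin_unlock(lock);
}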
/*
@@ -2963,13 +3208,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
struct inode *inode = &ci->vfs_inode;
int last = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
BUG_ON(ci->i_nr_by_mode[fmode] == 0);
if (--ci->i_nr_by_mode[fmode] == 0)
last++;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (last && ci->i_vino.snap == CEPH_NOSNAP)
ceph_check_caps(ci, 0, NULL);
@@ -2992,7 +3237,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
int used, dirty;
int ret = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
@@ -3010,21 +3255,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
(cap->issued & unless) == 0)) {
if ((cap->issued & drop) &&
(cap->issued & unless) == 0) {
- dout("encode_inode_release %p cap %p %s -> "
- "%s\n", inode, cap,
+ int wanted = __ceph_caps_wanted(ci);
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+ wanted |= cap->mds_wanted;
+ dout("encode_inode_release %p cap %p "
+ "%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop));
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
+
cap->issued &= ~drop;
cap->implemented &= ~drop;
- if (ci->i_ceph_flags & CEPH_I_NODELAY) {
- int wanted = __ceph_caps_wanted(ci);
- dout(" wanted %s -> %s (act %s)\n",
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(cap->mds_wanted &
- ~wanted),
- ceph_cap_string(wanted));
- cap->mds_wanted &= wanted;
- }
+ cap->mds_wanted = wanted;
} else {
dout("encode_inode_release %p cap %p %s"
" (force)\n", inode, cap,
@@ -3036,7 +3279,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
rel->seq = cpu_to_le32(cap->seq);
rel->issue_seq = cpu_to_le32(cap->issue_seq),
rel->mseq = cpu_to_le32(cap->mseq);
- rel->caps = cpu_to_le32(cap->issued);
+ rel->caps = cpu_to_le32(cap->implemented);
rel->wanted = cpu_to_le32(cap->mds_wanted);
rel->dname_len = 0;
rel->dname_seq = 0;
@@ -3047,7 +3290,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
inode, cap, ceph_cap_string(cap->issued));
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ret;
}
@@ -3062,7 +3305,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
/*
* force an record for the directory caps if we have a dentry lease.
- * this is racy (can't take i_lock and d_lock together), but it
+ * this is racy (can't take i_ceph_lock and d_lock together), but it
* doesn't have to be perfect; the mds will revoke anything we don't
* release.
*/
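A recurring change throughout this file is the switch from inode->i_lock to the filesystem-private ci->i_ceph_lock. That private lock lives in the containing ceph_inode_info, which is recovered from the embedded VFS inode with container_of(); a minimal sketch of the layout with hypothetical names:

struct example_inode_info {
	spinlock_t i_private_lock;	/* guards fs-private fields only */
	/* ... fs-private state ... */
	struct inode vfs_inode;		/* embedded VFS inode */
};

static inline struct example_inode_info *EXAMPLE_I(struct inode *inode)
{
	return container_of(inode, struct example_inode_info, vfs_inode);
}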
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 0dba6915712..5a743ac141a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -71,9 +71,9 @@ static int mdsc_show(struct seq_file *s, void *p)
seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
if (req->r_got_unsafe)
- seq_printf(s, "\t(unsafe)");
+ seq_puts(s, "\t(unsafe)");
else
- seq_printf(s, "\t");
+ seq_puts(s, "\t");
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
@@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
+ } else {
+ seq_printf(s, " #%llx", req->r_ino1.ino);
}
if (req->r_old_dentry) {
@@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
seq_printf(s, " #%llx/%.*s (%s)",
- ceph_ino(req->r_old_dentry->d_parent->d_inode),
+ req->r_old_dentry_dir ?
+ ceph_ino(req->r_old_dentry_dir) : 0,
req->r_old_dentry->d_name.len,
req->r_old_dentry->d_name.name,
path ? path : "");
@@ -116,7 +119,7 @@ static int mdsc_show(struct seq_file *s, void *p)
seq_printf(s, " %s", req->r_path2);
}
- seq_printf(s, "\n");
+ seq_puts(s, "\n");
}
mutex_unlock(&mdsc->mutex);
@@ -201,6 +204,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
int err = -ENOMEM;
dout("ceph_fs_debugfs_init\n");
+ BUG_ON(!fsc->client->debugfs_dir);
fsc->debugfs_congestion_kb =
debugfs_create_file("writeback_congestion_kb",
0600,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1a867a3601a..c29d6ae6887 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,14 +40,6 @@ int ceph_init_dentry(struct dentry *dentry)
if (dentry->d_fsdata)
return 0;
- if (dentry->d_parent == NULL || /* nfs fh_to_dentry */
- ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
- d_set_d_op(dentry, &ceph_dentry_ops);
- else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
- d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
- else
- d_set_d_op(dentry, &ceph_snap_dentry_ops);
-
di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
if (!di)
return -ENOMEM; /* oh well */
@@ -58,16 +50,41 @@ int ceph_init_dentry(struct dentry *dentry)
kmem_cache_free(ceph_dentry_cachep, di);
goto out_unlock;
}
+
+ if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+ d_set_d_op(dentry, &ceph_dentry_ops);
+ else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+ d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
+ else
+ d_set_d_op(dentry, &ceph_snap_dentry_ops);
+
di->dentry = dentry;
di->lease_session = NULL;
- dentry->d_fsdata = di;
dentry->d_time = jiffies;
+ /* avoid reordering d_fsdata setup so that the check above is safe */
+ smp_mb();
+ dentry->d_fsdata = di;
ceph_dentry_lru_add(dentry);
out_unlock:
spin_unlock(&dentry->d_lock);
return 0;
}
+struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
+{
+ struct inode *inode = NULL;
+
+ if (!dentry)
+ return NULL;
+
+ spin_lock(&dentry->d_lock);
+ if (!IS_ROOT(dentry)) {
+ inode = dentry->d_parent->d_inode;
+ ihold(inode);
+ }
+ spin_unlock(&dentry->d_lock);
+ return inode;
+}
/*
@@ -83,6 +100,14 @@ static unsigned fpos_off(loff_t p)
return p & 0xffffffff;
}
+static int fpos_cmp(loff_t l, loff_t r)
+{
+ int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
+ if (v)
+ return v;
+ return (int)(fpos_off(l) - fpos_off(r));
+}
+
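fpos_cmp() above compares the fragment part of a readdir position before the offset part. For reference, a standalone sketch of the encoding those helpers assume: the directory fragment in the high 32 bits and the intra-fragment offset in the low 32 bits (the split is inferred from fpos_off() masking with 0xffffffff):

#include <stdint.h>
#include <stdio.h>

static uint64_t make_fpos(uint32_t frag, uint32_t off)
{
	return ((uint64_t)frag << 32) | off;
}

static uint32_t get_frag(uint64_t p) { return p >> 32; }
static uint32_t get_off(uint64_t p)  { return p & 0xffffffff; }

int main(void)
{
	uint64_t pos = make_fpos(0x1000, 2);

	printf("frag %#x off %u\n",
	       (unsigned)get_frag(pos), (unsigned)get_off(pos));
	return 0;
}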
/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
@@ -90,15 +115,15 @@ static unsigned fpos_off(loff_t p)
* falling back to a "normal" sync readdir if any dentries in the dir
* are dropped.
*
- * I_COMPLETE tells indicates we have all dentries in the dir. It is
+ * Complete dir indicates that we have all dentries in the dir. It is
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
-static int __dcache_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx,
+ u32 shared_gen)
{
- struct ceph_file_info *fi = filp->private_data;
- struct dentry *parent = filp->f_dentry;
+ struct ceph_file_info *fi = file->private_data;
+ struct dentry *parent = file->f_dentry;
struct inode *dir = parent->d_inode;
struct list_head *p;
struct dentry *dentry, *last;
@@ -109,14 +134,14 @@ static int __dcache_readdir(struct file *filp,
last = fi->dentry;
fi->dentry = NULL;
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
- last);
+ dout("__dcache_readdir %p v%u at %llu (last %p)\n",
+ dir, shared_gen, ctx->pos, last);
spin_lock(&parent->d_lock);
/* start at beginning? */
- if (filp->f_pos == 2 || last == NULL ||
- filp->f_pos < ceph_dentry(last)->offset) {
+ if (ctx->pos == 2 || last == NULL ||
+ fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
@@ -133,18 +158,19 @@ more:
d_unhashed(dentry) ? "!hashed" : "hashed",
parent->d_subdirs.prev, parent->d_subdirs.next);
if (p == &parent->d_subdirs) {
- fi->at_end = 1;
+ fi->flags |= CEPH_F_ATEND;
goto out_unlock;
}
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!d_unhashed(dentry) && dentry->d_inode &&
+ if (di->lease_shared_gen == shared_gen &&
+ !d_unhashed(dentry) && dentry->d_inode &&
ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
- filp->f_pos <= di->offset)
+ fpos_cmp(ctx->pos, di->offset) <= 0)
break;
dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
dentry->d_name.len, dentry->d_name.name, di->offset,
- filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+ ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
!dentry->d_inode ? " null" : "");
spin_unlock(&dentry->d_lock);
p = p->prev;
@@ -156,36 +182,34 @@ more:
spin_unlock(&dentry->d_lock);
spin_unlock(&parent->d_lock);
- dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+ /* make sure a dentry wasn't dropped while we didn't have parent lock */
+ if (!ceph_dir_is_complete(dir)) {
+ dout(" lost dir complete on %p; falling back to mds\n", dir);
+ dput(dentry);
+ err = -EAGAIN;
+ goto out;
+ }
+
+ dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- filp->f_pos = di->offset;
- err = filldir(dirent, dentry->d_name.name,
- dentry->d_name.len, di->offset,
+ if (!dir_emit(ctx, dentry->d_name.name,
+ dentry->d_name.len,
ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
- dentry->d_inode->i_mode >> 12);
-
- if (last) {
- if (err < 0) {
+ dentry->d_inode->i_mode >> 12)) {
+ if (last) {
/* remember our position */
fi->dentry = last;
- fi->next_offset = di->offset;
- } else {
- dput(last);
+ fi->next_offset = fpos_off(di->offset);
}
+ dput(dentry);
+ return 0;
}
- last = dentry;
- if (err < 0)
- goto out;
-
- filp->f_pos++;
+ ctx->pos = di->offset + 1;
- /* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
- dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
- err = -EAGAIN;
- goto out;
- }
+ if (last)
+ dput(last);
+ last = dentry;
spin_lock(&parent->d_lock);
p = p->prev; /* advance to next dentry */
@@ -218,63 +242,64 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
return 0;
}
-static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
- struct ceph_file_info *fi = filp->private_data;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned frag = fpos_frag(filp->f_pos);
- int off = fpos_off(filp->f_pos);
+ unsigned frag = fpos_frag(ctx->pos);
+ int off = fpos_off(ctx->pos);
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = fsc->mount_options->max_readdir;
- const int max_bytes = fsc->mount_options->max_readdir_bytes;
- dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
- if (fi->at_end)
+ dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
+ if (fi->flags & CEPH_F_ATEND)
return 0;
/* always start with . and .. */
- if (filp->f_pos == 0) {
+ if (ctx->pos == 0) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
- fi->dir_release_count = ci->i_release_count;
+ fi->dir_release_count = atomic_read(&ci->i_release_count);
dout("readdir off 0 -> '.'\n");
- if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+ if (!dir_emit(ctx, ".", 1,
ceph_translate_ino(inode->i_sb, inode->i_ino),
- inode->i_mode >> 12) < 0)
+ inode->i_mode >> 12))
return 0;
- filp->f_pos = 1;
+ ctx->pos = 1;
off = 1;
}
- if (filp->f_pos == 1) {
- ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino;
+ if (ctx->pos == 1) {
+ ino_t ino = parent_ino(file->f_dentry);
dout("readdir off 1 -> '..'\n");
- if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+ if (!dir_emit(ctx, "..", 2,
ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12) < 0)
+ inode->i_mode >> 12))
return 0;
- filp->f_pos = 2;
+ ctx->pos = 2;
off = 2;
}
/* can we use the dcache? */
- spin_lock(&inode->i_lock);
- if ((filp->f_pos == 2 || fi->dentry) &&
+ spin_lock(&ci->i_ceph_lock);
+ if ((ctx->pos == 2 || fi->dentry) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ __ceph_dir_is_complete(ci) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
- spin_unlock(&inode->i_lock);
- err = __dcache_readdir(filp, dirent, filldir);
+ u32 shared_gen = ci->i_shared_gen;
+ spin_unlock(&ci->i_ceph_lock);
+ err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
return err;
+ frag = fpos_frag(ctx->pos);
+ off = fpos_off(ctx->pos);
} else {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
if (fi->dentry) {
err = note_last_dentry(fi, fi->dentry->d_name.name,
@@ -300,16 +325,19 @@ more:
fi->last_readdir = NULL;
}
- /* requery frag tree, as the frag topology may have changed */
- frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-
dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
ceph_vinop(inode), frag, fi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
- req->r_inode = igrab(inode);
- req->r_dentry = dget(filp->f_dentry);
+ err = ceph_alloc_readdir_reply_buffer(req, inode);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_dentry = dget(file->f_dentry);
/* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS;
req->r_direct_hash = ceph_frag_value(frag);
@@ -317,9 +345,6 @@ more:
req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
- req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
- req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
- req->r_num_caps = max_entries + 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0) {
ceph_mdsc_put_request(req);
@@ -332,10 +357,21 @@ more:
if (!req->r_did_prepopulate) {
dout("readdir !did_prepopulate");
- fi->dir_release_count--; /* preclude I_COMPLETE */
+ /* preclude from marking dir complete */
+ fi->dir_release_count--;
}
/* note next offset and last dentry name */
+ rinfo = &req->r_reply_info;
+ if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (ceph_frag_is_leftmost(frag))
+ fi->next_offset = 2;
+ else
+ fi->next_offset = 0;
+ off = fi->next_offset;
+ }
+ fi->frag = frag;
fi->offset = fi->next_offset;
fi->last_readdir = req;
@@ -347,7 +383,6 @@ more:
else
fi->next_offset = 0;
} else {
- rinfo = &req->r_reply_info;
err = note_last_dentry(fi,
rinfo->dir_dname[rinfo->dir_nr-1],
rinfo->dir_dname_len[rinfo->dir_nr-1]);
@@ -360,15 +395,16 @@ more:
rinfo = &fi->last_readdir->r_reply_info;
dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
rinfo->dir_nr, off, fi->offset);
- while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
- u64 pos = ceph_make_fpos(frag, off);
+
+ ctx->pos = ceph_make_fpos(frag, off);
+ while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
struct ceph_mds_reply_inode *in =
rinfo->dir_in[off - fi->offset].in;
struct ceph_vino vino;
ino_t ino;
dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
- off, off - fi->offset, rinfo->dir_nr, pos,
+ off, off - fi->offset, rinfo->dir_nr, ctx->pos,
rinfo->dir_dname_len[off - fi->offset],
rinfo->dir_dname[off - fi->offset], in);
BUG_ON(!in);
@@ -376,16 +412,15 @@ more:
vino.ino = le64_to_cpu(in->ino);
vino.snap = le64_to_cpu(in->snapid);
ino = ceph_vino_to_ino(vino);
- if (filldir(dirent,
+ if (!dir_emit(ctx,
rinfo->dir_dname[off - fi->offset],
rinfo->dir_dname_len[off - fi->offset],
- pos,
- ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
+ ceph_translate_ino(inode->i_sb, ino), ftype)) {
dout("filldir stopping us...\n");
return 0;
}
off++;
- filp->f_pos = pos + 1;
+ ctx->pos++;
}
if (fi->last_name) {
@@ -398,30 +433,29 @@ more:
if (!ceph_frag_is_rightmost(frag)) {
frag = ceph_frag_next(frag);
off = 0;
- filp->f_pos = ceph_make_fpos(frag, off);
+ ctx->pos = ceph_make_fpos(frag, off);
dout("readdir next frag is %x\n", frag);
goto more;
}
- fi->at_end = 1;
+ fi->flags |= CEPH_F_ATEND;
/*
* if dir_release_count still matches the dir, no dentries
* were released during the whole readdir, and we should have
* the complete dir contents in our cache.
*/
- spin_lock(&inode->i_lock);
- if (ci->i_release_count == fi->dir_release_count) {
+ spin_lock(&ci->i_ceph_lock);
+ if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
dout(" marking %p complete\n", inode);
- /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
- ci->i_max_offset = filp->f_pos;
+ __ceph_dir_set_complete(ci, fi->dir_release_count);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
- dout("readdir %p filp %p done.\n", inode, filp);
+ dout("readdir %p file %p done.\n", inode, file);
return 0;
}
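The readdir rework above is part of the filldir-to-iterate conversion: instead of a callback plus filp->f_pos, the handler receives a struct dir_context, calls dir_emit() until it returns false, and keeps ctx->pos current itself. The overall shape, sketched for a hypothetical filesystem with a fixed name table:

static int examplefs_iterate(struct file *file, struct dir_context *ctx)
{
	static const char * const names[] = { "alpha", "beta" };
	loff_t i;

	if (!dir_emit_dots(file, ctx))		/* "." and "..", pos -> 2 */
		return 0;
	for (i = ctx->pos - 2; i < (loff_t)ARRAY_SIZE(names); i++) {
		/* inode numbers are arbitrary in this sketch */
		if (!dir_emit(ctx, names[i], strlen(names[i]),
			      100 + i, DT_REG))
			return 0;		/* buffer full; resume later */
		ctx->pos++;
	}
	return 0;
}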
-static void reset_readdir(struct ceph_file_info *fi)
+static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
{
if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir);
@@ -429,35 +463,43 @@ static void reset_readdir(struct ceph_file_info *fi)
}
kfree(fi->last_name);
fi->last_name = NULL;
- fi->next_offset = 2; /* compensate for . and .. */
+ if (ceph_frag_is_leftmost(frag))
+ fi->next_offset = 2; /* compensate for . and .. */
+ else
+ fi->next_offset = 0;
if (fi->dentry) {
dput(fi->dentry);
fi->dentry = NULL;
}
- fi->at_end = 0;
+ fi->flags &= ~CEPH_F_ATEND;
}
-static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
+static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
- loff_t old_offset = offset;
+ loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval;
mutex_lock(&inode->i_mutex);
- switch (origin) {
+ retval = -EINVAL;
+ switch (whence) {
case SEEK_END:
offset += inode->i_size + 2; /* FIXME */
break;
case SEEK_CUR:
offset += file->f_pos;
+ case SEEK_SET:
+ break;
+ default:
+ goto out;
}
- retval = -EINVAL;
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+
+ if (offset >= 0) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
- fi->at_end = 0;
+ fi->flags &= ~CEPH_F_ATEND;
}
retval = offset;
@@ -466,36 +508,29 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
* seek to new frag, or seek prior to current chunk.
*/
if (offset == 0 ||
- fpos_frag(offset) != fpos_frag(old_offset) ||
+ fpos_frag(offset) != fi->frag ||
fpos_off(offset) < fi->offset) {
dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi);
+ reset_readdir(fi, fpos_frag(offset));
}
/* bump dir_release_count if we did a forward seek */
- if (offset > old_offset)
+ if (fpos_cmp(offset, old_offset) > 0)
fi->dir_release_count--;
}
+out:
mutex_unlock(&inode->i_mutex);
return retval;
}
/*
- * Process result of a lookup/open request.
- *
- * Mainly, make sure we return the final req->r_dentry (if it already
- * existed) in place of the original VFS-provided dentry when they
- * differ.
- *
- * Gracefully handle the case where the MDS replies with -ENOENT and
- * no trace (which it may do, at its discretion, e.g., if it doesn't
- * care to issue a lease on the negative dentry).
+ * Handle lookups for the hidden .snap directory.
*/
-struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
- struct dentry *dentry, int err)
+int ceph_handle_snapdir(struct ceph_mds_request *req,
+ struct dentry *dentry, int err)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *parent = dentry->d_parent->d_inode;
+ struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
/* .snap dir? */
if (err == -ENOENT &&
@@ -509,7 +544,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
d_add(dentry, inode);
err = 0;
}
+ return err;
+}
+/*
+ * Figure out final result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+ struct dentry *dentry, int err)
+{
if (err == -ENOENT) {
/* no trace? */
err = 0;
@@ -544,7 +595,7 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
* the MDS so that it gets our 'caps wanted' value in a single op.
*/
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -562,35 +613,26 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
if (err < 0)
return ERR_PTR(err);
- /* open (but not create!) intent? */
- if (nd &&
- (nd->flags & LOOKUP_OPEN) &&
- (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
- !(nd->intent.open.flags & O_CREAT)) {
- int mode = nd->intent.open.create_mode & ~current->fs->umask;
- return ceph_lookup_open(dir, dentry, nd, mode, 1);
- }
-
/* can we conclude ENOENT locally? */
if (dentry->d_inode == NULL) {
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
- spin_lock(&dir->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
!is_root_ceph_dentry(dir, dentry) &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ __ceph_dir_is_complete(ci) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
- spin_unlock(&dir->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout(" dir %p complete, -ENOENT\n", dir);
d_add(dentry, NULL);
di->lease_shared_gen = ci->i_shared_gen;
return NULL;
}
- spin_unlock(&dir->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
op = ceph_snap(dir) == CEPH_SNAPDIR ?
@@ -604,6 +646,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
req->r_locked_dir = dir;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ err = ceph_handle_snapdir(req, dentry, err);
dentry = ceph_finish_lookup(req, dentry, err);
ceph_mdsc_put_request(req); /* will dput(dentry) */
dout("lookup result=%p\n", dentry);
@@ -616,7 +659,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
*/
int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
{
- struct dentry *result = ceph_lookup(dir, dentry, NULL);
+ struct dentry *result = ceph_lookup(dir, dentry, 0);
if (result && !IS_ERR(result)) {
/*
@@ -634,7 +677,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
}
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
- int mode, dev_t rdev)
+ umode_t mode, dev_t rdev)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -644,7 +687,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
- dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
+ dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
dir, dentry, mode, rdev);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
if (IS_ERR(req)) {
@@ -662,31 +705,18 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
- if (err)
+
+ if (!err)
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ else
d_drop(dentry);
return err;
}
-static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
+static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
- dout("create in dir %p dentry %p name '%.*s'\n",
- dir, dentry, dentry->d_name.len, dentry->d_name.name);
-
- if (ceph_snap(dir) != CEPH_NOSNAP)
- return -EROFS;
-
- if (nd) {
- BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
- dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
- /* hrm, what should i do here if we get aliased? */
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- return 0;
- }
-
- /* fall back to mknod */
- return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
+ return ceph_mknod(dir, dentry, mode, 0);
}
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
@@ -716,12 +746,14 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
- if (err)
+ if (!err)
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ else
d_drop(dentry);
return err;
}
-static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -735,7 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
dentry->d_name.len, dentry->d_name.name, dentry);
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
+ dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
goto out;
@@ -757,7 +789,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
out:
- if (err < 0)
+ if (!err)
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ else
d_drop(dentry);
return err;
}
@@ -782,15 +816,19 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
+ req->r_old_dentry = dget(old_dentry);
req->r_locked_dir = dir;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ /* release LINK_SHARED on source inode (mds will lock it) */
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
err = ceph_mdsc_do_request(mdsc, dir, req);
- if (err)
+ if (err) {
d_drop(dentry);
- else if (!req->r_reply_info.head->is_dentry)
- d_instantiate(dentry, igrab(old_dentry->d_inode));
+ } else if (!req->r_reply_info.head->is_dentry) {
+ ihold(old_dentry->d_inode);
+ d_instantiate(dentry, old_dentry->d_inode);
+ }
ceph_mdsc_put_request(req);
return err;
}
@@ -806,12 +844,12 @@ static int drop_caps_for_unlink(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (inode->i_nlink == 1) {
drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
ci->i_ceph_flags |= CEPH_I_NODELAY;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return drop;
}
@@ -835,7 +873,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
dout("unlink/rmdir dir %p dn %p inode %p\n",
dir, dentry, inode);
- op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
+ op = S_ISDIR(dentry->d_inode->i_mode) ?
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
@@ -876,9 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ ihold(old_dir);
req->r_dentry = dget(new_dentry);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
+ req->r_old_dentry_dir = old_dir;
req->r_locked_dir = new_dir;
req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -896,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
* to do it here.
*/
- /* d_move screws up d_subdirs order */
- ceph_i_clear(new_dir, CEPH_I_COMPLETE);
-
d_move(old_dentry, new_dentry);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(new_dentry);
+
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(old_dir);
+ ceph_dir_clear_complete(new_dir);
+
}
ceph_mdsc_put_request(req);
return err;
@@ -937,12 +979,12 @@ static int dentry_lease_is_valid(struct dentry *dentry)
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di && di->lease_session) {
+ if (di->lease_session) {
s = di->lease_session;
- spin_lock(&s->s_cap_lock);
+ spin_lock(&s->s_gen_ttl_lock);
gen = s->s_cap_gen;
ttl = s->s_cap_ttl;
- spin_unlock(&s->s_cap_lock);
+ spin_unlock(&s->s_gen_ttl_lock);
if (di->lease_gen == gen &&
time_before(jiffies, dentry->d_time) &&
@@ -979,10 +1021,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
int valid = 0;
- spin_lock(&dir->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_shared_gen == di->lease_shared_gen)
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
- spin_unlock(&dir->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
dir, (unsigned)ci->i_shared_gen, dentry,
(unsigned)di->lease_shared_gen, valid);
@@ -992,38 +1034,45 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
/*
* Check if cached dentry can be trusted.
*/
-static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
+ int valid = 0;
struct inode *dir;
- if (nd && nd->flags & LOOKUP_RCU)
+ if (flags & LOOKUP_RCU)
return -ECHILD;
- dir = dentry->d_parent->d_inode;
-
dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
ceph_dentry(dentry)->offset);
+ dir = ceph_get_dentry_parent_inode(dentry);
+
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- goto out_touch;
+ valid = 1;
+ } else if (dentry->d_inode &&
+ ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
+ valid = 1;
+ } else if (dentry_lease_is_valid(dentry) ||
+ dir_lease_is_valid(dir, dentry)) {
+ if (dentry->d_inode)
+ valid = ceph_is_any_caps(dentry->d_inode);
+ else
+ valid = 1;
}
- if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
- goto out_touch;
-
- if (dentry_lease_is_valid(dentry) ||
- dir_lease_is_valid(dir, dentry))
- goto out_touch;
- dout("d_revalidate %p invalid\n", dentry);
- d_drop(dentry);
- return 0;
-out_touch:
- ceph_dentry_lru_touch(dentry);
- return 1;
+ dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
+ if (valid) {
+ ceph_dentry_lru_touch(dentry);
+ } else {
+ ceph_dir_clear_complete(dir);
+ d_drop(dentry);
+ }
+ iput(dir);
+ return valid;
}
/*
@@ -1034,17 +1083,15 @@ static void ceph_d_release(struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
dout("d_release %p\n", dentry);
- if (di) {
- ceph_dentry_lru_del(dentry);
- if (di->lease_session)
- ceph_put_mds_session(di->lease_session);
- kmem_cache_free(ceph_dentry_cachep, di);
- dentry->d_fsdata = NULL;
- }
+ ceph_dentry_lru_del(dentry);
+ if (di->lease_session)
+ ceph_put_mds_session(di->lease_session);
+ kmem_cache_free(ceph_dentry_cachep, di);
+ dentry->d_fsdata = NULL;
}
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
/*
* Eventually, we'll want to revalidate snapped metadata
@@ -1053,7 +1100,30 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
return 1;
}
+/*
+ * When the VFS prunes a dentry from the cache, we need to clear the
+ * complete flag on the parent directory.
+ *
+ * Called under dentry->d_lock.
+ */
+static void ceph_d_prune(struct dentry *dentry)
+{
+ dout("ceph_d_prune %p\n", dentry);
+
+ /* do we have a valid parent? */
+ if (IS_ROOT(dentry))
+ return;
+ /* if we are not hashed, we don't affect dir's completeness */
+ if (d_unhashed(dentry))
+ return;
+
+ /*
+ * we hold d_lock, so d_parent is stable, and d_fsdata is never
+ * cleared until d_release
+ */
+ ceph_dir_clear_complete(dentry->d_parent->d_inode);
+}
/*
* read() on a dir. This weird interface hack only works if mounted
@@ -1063,19 +1133,20 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
loff_t *ppos)
{
struct ceph_file_info *cf = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
int left;
+ const int bufsize = 1024;
if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
return -EISDIR;
if (!cf->dir_info) {
- cf->dir_info = kmalloc(1024, GFP_NOFS);
+ cf->dir_info = kmalloc(bufsize, GFP_NOFS);
if (!cf->dir_info)
return -ENOMEM;
cf->dir_info_len =
- sprintf(cf->dir_info,
+ snprintf(cf->dir_info, bufsize,
"entries: %20lld\n"
" files: %20lld\n"
" subdirs: %20lld\n"
@@ -1109,9 +1180,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
* an fsync() on a dir will wait for any uncommitted directory
* operations to commit.
*/
-static int ceph_dir_fsync(struct file *file, int datasync)
+static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct list_head *head = &ci->i_unsafe_dirops;
struct ceph_mds_request *req;
@@ -1119,6 +1191,11 @@ static int ceph_dir_fsync(struct file *file, int datasync)
int ret = 0;
dout("dir_fsync %p\n", inode);
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+ mutex_lock(&inode->i_mutex);
+
spin_lock(&ci->i_unsafe_lock);
if (list_empty(head))
goto out;
@@ -1130,6 +1207,7 @@ static int ceph_dir_fsync(struct file *file, int datasync)
do {
ceph_mdsc_get_request(req);
spin_unlock(&ci->i_unsafe_lock);
+
dout("dir_fsync %p wait on tid %llu (until %llu)\n",
inode, req->r_tid, last_tid);
if (req->r_timeout) {
@@ -1142,9 +1220,9 @@ static int ceph_dir_fsync(struct file *file, int datasync)
} else {
wait_for_completion(&req->r_safe_completion);
}
- spin_lock(&ci->i_unsafe_lock);
ceph_mdsc_put_request(req);
+ spin_lock(&ci->i_unsafe_lock);
if (ret || list_empty(head))
break;
req = list_entry(head->next,
@@ -1152,6 +1230,8 @@ static int ceph_dir_fsync(struct file *file, int datasync)
} while (req->r_tid < last_tid);
out:
spin_unlock(&ci->i_unsafe_lock);
+ mutex_unlock(&inode->i_mutex);
+
return ret;
}
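The fsync change above follows the ranged ->fsync prototype, under which the handler itself writes back and waits on the page range and takes i_mutex, work the VFS used to do on its behalf. A bare-bones skeleton of that shape (illustrative only, for a hypothetical filesystem):

static int examplefs_fsync(struct file *file, loff_t start, loff_t end,
			   int datasync)
{
	struct inode *inode = file_inode(file);
	int ret;

	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);
	/* filesystem-specific metadata commit / wait goes here */
	mutex_unlock(&inode->i_mutex);
	return ret;
}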
@@ -1167,13 +1247,11 @@ void ceph_dentry_lru_add(struct dentry *dn)
dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_add_tail(&di->lru, &mdsc->dentry_lru);
- mdsc->num_dentry++;
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_add_tail(&di->lru, &mdsc->dentry_lru);
+ mdsc->num_dentry++;
+ spin_unlock(&mdsc->dentry_lru_lock);
}
void ceph_dentry_lru_touch(struct dentry *dn)
@@ -1183,12 +1261,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
dn->d_name.len, dn->d_name.name, di->offset);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_move_tail(&di->lru, &mdsc->dentry_lru);
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_move_tail(&di->lru, &mdsc->dentry_lru);
+ spin_unlock(&mdsc->dentry_lru_lock);
}
void ceph_dentry_lru_del(struct dentry *dn)
@@ -1198,22 +1274,19 @@ void ceph_dentry_lru_del(struct dentry *dn)
dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
- if (di) {
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_del_init(&di->lru);
- mdsc->num_dentry--;
- spin_unlock(&mdsc->dentry_lru_lock);
- }
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_del_init(&di->lru);
+ mdsc->num_dentry--;
+ spin_unlock(&mdsc->dentry_lru_lock);
}
/*
* Return name hash for a given dentry. This is dependent on
* the parent directory's hash function.
*/
-unsigned ceph_dentry_hash(struct dentry *dn)
+unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
{
- struct inode *dir = dn->d_parent->d_inode;
struct ceph_inode_info *dci = ceph_inode(dir);
switch (dci->i_dir_layout.dl_dir_hash) {
@@ -1229,7 +1302,7 @@ unsigned ceph_dentry_hash(struct dentry *dn)
const struct file_operations ceph_dir_fops = {
.read = ceph_read_dir,
- .readdir = ceph_readdir,
+ .iterate = ceph_readdir,
.llseek = ceph_dir_llseek,
.open = ceph_open,
.release = ceph_release,
@@ -1246,6 +1319,8 @@ const struct inode_operations ceph_dir_iops = {
.getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
.removexattr = ceph_removexattr,
+ .get_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
.mknod = ceph_mknod,
.symlink = ceph_symlink,
.mkdir = ceph_mkdir,
@@ -1254,11 +1329,13 @@ const struct inode_operations ceph_dir_iops = {
.rmdir = ceph_unlink,
.rename = ceph_rename,
.create = ceph_create,
+ .atomic_open = ceph_atomic_open,
};
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
.d_release = ceph_d_release,
+ .d_prune = ceph_d_prune,
};
const struct dentry_operations ceph_snapdir_dentry_ops = {
@@ -1268,4 +1345,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = {
const struct dentry_operations ceph_snap_dentry_ops = {
.d_release = ceph_d_release,
+ .d_prune = ceph_d_prune,
};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e41056174bf..8d7d782f438 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -8,23 +8,6 @@
#include "mds_client.h"
/*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architectures doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to this more reliable, but in the non-connectable fh
- * case, we won't every work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
-
-/*
* Basic fh
*/
struct ceph_nfs_fh {
@@ -32,22 +15,18 @@ struct ceph_nfs_fh {
} __attribute__ ((packed));
/*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
+ * Larger fh that includes parent ino.
*/
struct ceph_nfs_confh {
u64 ino, parent_ino;
- u32 parent_name_hash;
} __attribute__ ((packed));
-static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
- int connectable)
+static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
+ struct inode *parent_inode)
{
int type;
struct ceph_nfs_fh *fh = (void *)rawfh;
struct ceph_nfs_confh *cfh = (void *)rawfh;
- struct dentry *parent = dentry->d_parent;
- struct inode *inode = dentry->d_inode;
int connected_handle_length = sizeof(*cfh)/4;
int handle_length = sizeof(*fh)/4;
@@ -55,176 +34,217 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EINVAL;
- if (*max_len >= connected_handle_length) {
- dout("encode_fh %p connectable\n", dentry);
- cfh->ino = ceph_ino(dentry->d_inode);
- cfh->parent_ino = ceph_ino(parent->d_inode);
- cfh->parent_name_hash = ceph_dentry_hash(parent);
+ if (parent_inode && (*max_len < connected_handle_length)) {
*max_len = connected_handle_length;
- type = 2;
- } else if (*max_len >= handle_length) {
- if (connectable) {
- *max_len = connected_handle_length;
- return 255;
- }
- dout("encode_fh %p\n", dentry);
- fh->ino = ceph_ino(dentry->d_inode);
+ return FILEID_INVALID;
+ } else if (*max_len < handle_length) {
*max_len = handle_length;
- type = 1;
+ return FILEID_INVALID;
+ }
+
+ if (parent_inode) {
+ dout("encode_fh %llx with parent %llx\n",
+ ceph_ino(inode), ceph_ino(parent_inode));
+ cfh->ino = ceph_ino(inode);
+ cfh->parent_ino = ceph_ino(parent_inode);
+ *max_len = connected_handle_length;
+ type = FILEID_INO32_GEN_PARENT;
} else {
+ dout("encode_fh %llx\n", ceph_ino(inode));
+ fh->ino = ceph_ino(inode);
*max_len = handle_length;
- return 255;
+ type = FILEID_INO32_GEN;
}
return type;
}
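The rewritten encode_fh above follows the standard exportfs contract: *max_len is counted in 4-byte words, a too-small buffer makes the handler report the size it needs and return FILEID_INVALID, and the type codes FILEID_INO32_GEN / FILEID_INO32_GEN_PARENT tell fh_to_dentry()/fh_to_parent() how to interpret the blob. A sketch of that contract for a hypothetical filesystem:

struct examplefs_fid {
	u64 ino;
	u64 parent_ino;
} __attribute__ ((packed));

static int examplefs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
			       struct inode *parent)
{
	struct examplefs_fid *fid = (void *)fh;
	int len = parent ? sizeof(*fid) / 4 : sizeof(u64) / 4;

	if (*max_len < len) {
		*max_len = len;		/* tell the caller how much we need */
		return FILEID_INVALID;
	}

	fid->ino = inode->i_ino;
	if (parent)
		fid->parent_ino = parent->i_ino;
	*max_len = len;
	return parent ? FILEID_INO32_GEN_PARENT : FILEID_INO32_GEN;
}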
-/*
- * convert regular fh to dentry
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
- struct ceph_nfs_fh *fh)
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
struct dentry *dentry;
struct ceph_vino vino;
int err;
- dout("__fh_to_dentry %llx\n", fh->ino);
- vino.ino = fh->ino;
+ vino.ino = ino;
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(-ESTALE);
+ if (!inode) {
+ struct ceph_mds_request *req;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+
+ req->r_ino1 = vino;
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ ihold(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ESTALE);
+ }
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
- fh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
-
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
+ dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
return dentry;
}
/*
- * convert connectable fh to dentry
+ * convert regular fh to dentry
*/
-static struct dentry *__cfh_to_dentry(struct super_block *sb,
- struct ceph_nfs_confh *cfh)
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_nfs_fh *fh = (void *)fid->raw;
+
+ if (fh_type != FILEID_INO32_GEN &&
+ fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*fh) / 4)
+ return NULL;
+
+ dout("fh_to_dentry %llx\n", fh->ino);
+ return __fh_to_dentry(sb, fh->ino);
+}
+
+static struct dentry *__get_parent(struct super_block *sb,
+ struct dentry *child, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_request *req;
struct inode *inode;
struct dentry *dentry;
- struct ceph_vino vino;
int err;
- dout("__cfh_to_dentry %llx (%llx/%x)\n",
- cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
-
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode) {
- struct ceph_mds_request *req;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
- USE_ANY_MDS);
- if (IS_ERR(req))
- return ERR_CAST(req);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
- req->r_ino1 = vino;
- req->r_ino2.ino = cfh->parent_ino;
- req->r_ino2.snap = CEPH_NOSNAP;
- req->r_path2 = kmalloc(16, GFP_NOFS);
- snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
- req->r_num_caps = 1;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- ceph_mdsc_put_request(req);
- inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(err ? err : -ESTALE);
+ if (child) {
+ req->r_inode = child->d_inode;
+ ihold(child->d_inode);
+ } else {
+ req->r_ino1 = (struct ceph_vino) {
+ .ino = ino,
+ .snap = CEPH_NOSNAP,
+ };
}
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ ihold(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
+ dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
+ child ? ceph_ino(child->d_inode) : ino,
+ dentry, ceph_vinop(inode));
return dentry;
}
-static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+static struct dentry *ceph_get_parent(struct dentry *child)
{
- if (fh_type == 1)
- return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
- else
- return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
+ /* don't re-export snaps */
+ if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
+ return ERR_PTR(-EINVAL);
+
+ dout("get_parent %p ino %llx.%llx\n",
+ child, ceph_vinop(child->d_inode));
+ return __get_parent(child->d_sb, child, 0);
}
/*
- * get parent, if possible.
- *
- * FIXME: we could do better by querying the mds to discover the
- * parent.
+ * convert regular fh to parent
*/
static struct dentry *ceph_fh_to_parent(struct super_block *sb,
- struct fid *fid,
+ struct fid *fid,
int fh_len, int fh_type)
{
struct ceph_nfs_confh *cfh = (void *)fid->raw;
- struct ceph_vino vino;
- struct inode *inode;
struct dentry *dentry;
- int err;
- if (fh_type == 1)
- return ERR_PTR(-ESTALE);
+ if (fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*cfh) / 4)
+ return NULL;
- pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
- cfh->parent_name_hash);
+ dout("fh_to_parent %llx\n", cfh->parent_ino);
+ dentry = __get_parent(sb, NULL, cfh->ino);
+ if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
+ dentry = __fh_to_dentry(sb, cfh->parent_ino);
+ return dentry;
+}
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(-ESTALE);
+static int ceph_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct ceph_mds_client *mdsc;
+ struct ceph_mds_request *req;
+ int err;
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
- iput(inode);
- return dentry;
- }
- err = ceph_init_dentry(dentry);
- if (err < 0) {
- iput(inode);
- return ERR_PTR(err);
+ mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ mutex_lock(&parent->d_inode->i_mutex);
+
+ req->r_inode = child->d_inode;
+ ihold(child->d_inode);
+ req->r_ino2 = ceph_vino(parent->d_inode);
+ req->r_locked_dir = parent->d_inode;
+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+
+ mutex_unlock(&parent->d_inode->i_mutex);
+
+ if (!err) {
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ memcpy(name, rinfo->dname, rinfo->dname_len);
+ name[rinfo->dname_len] = 0;
+ dout("get_name %p ino %llx.%llx name %s\n",
+ child, ceph_vinop(child->d_inode), name);
+ } else {
+ dout("get_name %p ino %llx.%llx err %d\n",
+ child, ceph_vinop(child->d_inode), err);
}
- dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
- return dentry;
+
+ ceph_mdsc_put_request(req);
+ return err;
}
const struct export_operations ceph_export_ops = {
.encode_fh = ceph_encode_fh,
.fh_to_dentry = ceph_fh_to_dentry,
.fh_to_parent = ceph_fh_to_parent,
+ .get_parent = ceph_get_parent,
+ .get_name = ceph_get_name,
};
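Aside on the reworked handle decoding above: the fh_len passed to fh_to_dentry()/fh_to_parent() is counted in 4-byte words, not bytes, which is why the new checks compare against sizeof(*fh) / 4 before trusting the handle contents. A minimal standalone sketch of that arithmetic (the struct layouts here are illustrative assumptions, not the kernel's ceph_nfs_fh / ceph_nfs_confh definitions, and the sketch is not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Illustrative handle payloads for the sketch only. */
struct fh_plain  { uint64_t ino; };
struct fh_parent { uint64_t ino; uint64_t parent_ino; };

/* fh_len is counted in 4-byte words, hence the sizeof(*fh) / 4 checks. */
static int fh_long_enough(size_t fh_len_words, size_t struct_bytes)
{
	return fh_len_words >= struct_bytes / 4;
}

int main(void)
{
	printf("plain fh needs %zu words\n", sizeof(struct fh_plain) / 4);    /* 2 */
	printf("parent fh needs %zu words\n", sizeof(struct fh_parent) / 4);  /* 4 */
	printf("2-word handle ok for parent lookup? %d\n",
	       fh_long_enough(2, sizeof(struct fh_parent)));                  /* 0 */
	return 0;
}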
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 159b512d5a2..302085100c2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -4,11 +4,15 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/file.h>
+#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
+#include <linux/falloc.h>
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
/*
* Ceph file operations
@@ -54,7 +58,6 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
req->r_fmode = ceph_flags_to_mode(flags);
req->r_args.open.flags = cpu_to_le32(flags);
req->r_args.open.mode = cpu_to_le32(create_mode);
- req->r_args.open.preferred = cpu_to_le32(-1);
out:
return req;
}
@@ -67,9 +70,23 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
struct ceph_file_info *cf;
int ret = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
+ /* First file open request creates the cookie; we want to keep
+ * this cookie around for the lifetime of the inode so as not to
+ * have to worry about fscache register / revoke / operation
+ * races.
+ *
+ * Also, if we know the operation is going to invalidate data
+ * (non-readonly) just nuke the cache right away.
+ */
+ ceph_fscache_register_inode_cookie(mdsc->fsc, ci);
+ if ((fmode & CEPH_FILE_MODE_WR))
+ ceph_fscache_invalidate(inode);
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
@@ -107,9 +124,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
}
/*
- * If the filp already has private_data, that means the file was
- * already opened by intent during lookup, and we do nothing.
- *
* If we already have the requisite capabilities, we can satisfy
* the open request locally (no need to request new caps from the
* MDS). We do, however, need to inform the MDS (asynchronously)
@@ -122,7 +136,7 @@ int ceph_open(struct inode *inode, struct file *file)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct ceph_file_info *cf = file->private_data;
- struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+ struct inode *parent_inode = NULL;
int err;
int flags, fmode, wanted;
@@ -147,9 +161,9 @@ int ceph_open(struct inode *inode, struct file *file)
/* trivially open snapdir */
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__ceph_get_fmode(ci, fmode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -158,7 +172,7 @@ int ceph_open(struct inode *inode, struct file *file)
* write) or any MDS (for read). Update wanted set
* asynchronously.
*/
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (__ceph_is_any_real_caps(ci) &&
(((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
int mds_wanted = __ceph_caps_mds_wanted(ci);
@@ -168,7 +182,7 @@ int ceph_open(struct inode *inode, struct file *file)
inode, fmode, ceph_cap_string(wanted),
ceph_cap_string(issued));
__ceph_get_fmode(ci, fmode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
/* adjust wanted? */
if ((issued & wanted) != wanted &&
@@ -180,10 +194,11 @@ int ceph_open(struct inode *inode, struct file *file)
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
(ci->i_snap_caps & wanted) == wanted) {
__ceph_get_fmode(ci, fmode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
- spin_unlock(&inode->i_lock);
+
+ spin_unlock(&ci->i_ceph_lock);
dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
req = prepare_open_request(inode->i_sb, flags, 0);
@@ -191,9 +206,14 @@ int ceph_open(struct inode *inode, struct file *file)
err = PTR_ERR(req);
goto out;
}
- req->r_inode = igrab(inode);
+ req->r_inode = inode;
+ ihold(inode);
+
req->r_num_caps = 1;
+ if (flags & O_CREAT)
+ parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ iput(parent_inode);
if (!err)
err = ceph_init_file(inode, file, req->r_fmode);
ceph_mdsc_put_request(req);
@@ -204,36 +224,34 @@ out:
/*
- * Do a lookup + open with a single request.
- *
- * If this succeeds, but some subsequent check in the vfs
- * may_open() fails, the struct *file gets cleaned up (i.e.
- * ceph_release gets called). So fear not!
- */
-/*
- * flags
- * path_lookup_open -> LOOKUP_OPEN
- * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
+ * Do a lookup + open with a single request. If we get a non-existent
+ * file or symlink, return 1 so the VFS can retry.
*/
-struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd, int mode,
- int locked_dir)
+int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags, umode_t mode,
+ int *opened)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
- struct file *file = nd->intent.open.file;
- struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
struct ceph_mds_request *req;
+ struct dentry *dn;
int err;
- int flags = nd->intent.open.flags - 1; /* silly vfs! */
- dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
- dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
+ dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n",
+ dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
+
+ if (dentry->d_name.len > NAME_MAX)
+ return -ENAMETOOLONG;
+
+ err = ceph_init_dentry(dentry);
+ if (err < 0)
+ return err;
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
if (IS_ERR(req))
- return ERR_CAST(req);
+ return PTR_ERR(req);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
if (flags & O_CREAT) {
@@ -241,16 +259,44 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
}
req->r_locked_dir = dir; /* caller holds dir->i_mutex */
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- dentry = ceph_finish_lookup(req, dentry, err);
- if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ err = ceph_mdsc_do_request(mdsc,
+ (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
+ req);
+ if (err)
+ goto out_err;
+
+ err = ceph_handle_snapdir(req, dentry, err);
+ if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
- if (!err)
- err = ceph_init_file(req->r_dentry->d_inode, file,
- req->r_fmode);
+
+ if (d_unhashed(dentry)) {
+ dn = ceph_finish_lookup(req, dentry, err);
+ if (IS_ERR(dn))
+ err = PTR_ERR(dn);
+ } else {
+ /* we were given a hashed negative dentry */
+ dn = NULL;
+ }
+ if (err)
+ goto out_err;
+ if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
+ /* make vfs retry on splice, ENOENT, or symlink */
+ dout("atomic_open finish_no_open on dn %p\n", dn);
+ err = finish_no_open(file, dn);
+ } else {
+ dout("atomic_open finish_open on dn %p\n", dn);
+ if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ *opened |= FILE_CREATED;
+ }
+ err = finish_open(file, dentry, ceph_open, opened);
+ }
+out_err:
+ if (!req->r_err && req->r_target_inode)
+ ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
- dout("ceph_lookup_open result=%p\n", dentry);
- return dentry;
+ dout("atomic_open result=%d\n", err);
+ return err;
}
int ceph_release(struct inode *inode, struct file *file)
@@ -282,15 +328,14 @@ int ceph_release(struct inode *inode, struct file *file)
static int striped_read(struct inode *inode,
u64 off, u64 len,
struct page **pages, int num_pages,
- int *checkeof, bool align_to_pages,
+ int *checkeof, bool o_direct,
unsigned long buf_align)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- u64 pos, this_len;
+ u64 pos, this_len, left;
int io_align, page_align;
- int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
- int left, pages_left;
+ int pages_left;
int read;
struct page **page_pos;
int ret;
@@ -307,7 +352,7 @@ static int striped_read(struct inode *inode,
io_align = off & ~PAGE_MASK;
more:
- if (align_to_pages)
+ if (o_direct)
page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
else
page_align = pos & ~PAGE_MASK;
@@ -317,50 +362,44 @@ more:
ci->i_truncate_seq,
ci->i_truncate_size,
page_pos, pages_left, page_align);
- hit_stripe = this_len < left;
- was_short = ret >= 0 && ret < this_len;
if (ret == -ENOENT)
ret = 0;
- dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
+ hit_stripe = this_len < left;
+ was_short = ret >= 0 && ret < this_len;
+ dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
- if (ret > 0) {
- int didpages =
- ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
-
- if (read < pos - off) {
- dout(" zero gap %llu to %llu\n", off + read, pos);
- ceph_zero_page_vector_range(page_off + read,
- pos - off - read, pages);
+ if (ret >= 0) {
+ int didpages;
+ if (was_short && (pos + ret < inode->i_size)) {
+ u64 tmp = min(this_len - ret,
+ inode->i_size - pos - ret);
+ dout(" zero gap %llu to %llu\n",
+ pos + ret, pos + ret + tmp);
+ ceph_zero_page_vector_range(page_align + read + ret,
+ tmp, pages);
+ ret += tmp;
}
+
+ didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
pos += ret;
read = pos - off;
left -= ret;
page_pos += didpages;
pages_left -= didpages;
- /* hit stripe? */
- if (left && hit_stripe)
+ /* hit stripe and need to continue */
+ if (left && hit_stripe && pos < inode->i_size)
goto more;
}
- if (was_short) {
- /* was original extent fully inside i_size? */
- if (pos + left <= inode->i_size) {
- dout("zero tail\n");
- ceph_zero_page_vector_range(page_off + read, len - read,
- pages);
- read = len;
- goto out;
- }
-
- /* check i_size */
- *checkeof = 1;
+ if (read > 0) {
+ ret = read;
+ /* did we bounce off eof? */
+ if (pos + left > inode->i_size)
+ *checkeof = 1;
}
-out:
- if (ret >= 0)
- ret = read;
dout("striped_read returns %d\n", ret);
return ret;
}
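For readers following the reworked short-read handling in striped_read() above: when the OSD returns fewer bytes than requested but the file is known to extend past that point, the gap is a hole, and it is zero-filled up to min(this_len - ret, i_size - pos - ret). A standalone sketch of just that length computation, with illustrative names rather than the kernel's (not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Bytes to zero after a short read at 'pos' that returned 'ret' bytes of a
 * 'this_len'-byte request, for a file of size 'i_size'. */
static uint64_t zero_fill_len(uint64_t pos, uint64_t this_len,
			      uint64_t ret, uint64_t i_size)
{
	if (ret >= this_len || pos + ret >= i_size)
		return 0;                     /* not short, or the read hit EOF */
	uint64_t want  = this_len - ret;      /* bytes the caller still expects */
	uint64_t avail = i_size - pos - ret;  /* bytes of file left past the hole */
	return want < avail ? want : avail;
}

int main(void)
{
	/* A 4096-byte request at offset 0 returned 1024 bytes, file is 8192
	 * bytes long: the remaining 3072 bytes are a hole and get zeroed. */
	printf("%llu\n", (unsigned long long)zero_fill_len(0, 4096, 1024, 8192));
	return 0;
}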
@@ -371,235 +410,361 @@ out:
*
* If the read spans object boundary, just do multiple reads.
*/
-static ssize_t ceph_sync_read(struct file *file, char __user *data,
- unsigned len, loff_t *poff, int *checkeof)
+static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
+ int *checkeof)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
struct page **pages;
- u64 off = *poff;
+ u64 off = iocb->ki_pos;
int num_pages, ret;
+ size_t len = iov_iter_count(i);
- dout("sync_read on file %p %llu~%u %s\n", file, off, len,
+ dout("sync_read on file %p %llu~%u %s\n", file, off,
+ (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
-
- if (file->f_flags & O_DIRECT) {
- num_pages = calc_pages_for((unsigned long)data, len);
- pages = ceph_get_direct_page_vector(data, num_pages, true);
- } else {
- num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
- }
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
/*
* flush any page cache pages in this range. this
* will make concurrent normal and sync io slow,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait(inode->i_mapping);
+ ret = filemap_write_and_wait_range(inode->i_mapping, off,
+ off + len);
if (ret < 0)
- goto done;
+ return ret;
- ret = striped_read(inode, off, len, pages, num_pages, checkeof,
- file->f_flags & O_DIRECT,
- (unsigned long)data & ~PAGE_MASK);
+ if (file->f_flags & O_DIRECT) {
+ while (iov_iter_count(i)) {
+ size_t start;
+ ssize_t n;
- if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
- ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
- if (ret >= 0)
- *poff = off + ret;
+ n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
+ if (n < 0)
+ return n;
-done:
- if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages, true);
- else
+ num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ ret = striped_read(inode, off, n,
+ pages, num_pages, checkeof,
+ 1, start);
+
+ ceph_put_page_vector(pages, num_pages, true);
+
+ if (ret <= 0)
+ break;
+ off += ret;
+ iov_iter_advance(i, ret);
+ if (ret < n)
+ break;
+ }
+ } else {
+ num_pages = calc_pages_for(off, len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ ret = striped_read(inode, off, len, pages,
+ num_pages, checkeof, 0, 0);
+ if (ret > 0) {
+ int l, k = 0;
+ size_t left = ret;
+
+ while (left) {
+ int copy = min_t(size_t, PAGE_SIZE, left);
+ l = copy_page_to_iter(pages[k++], 0, copy, i);
+ off += l;
+ left -= l;
+ if (l < copy)
+ break;
+ }
+ }
ceph_release_page_vector(pages, num_pages);
+ }
+
+ if (off > iocb->ki_pos) {
+ ret = off - iocb->ki_pos;
+ iocb->ki_pos = off;
+ }
+
dout("sync_read result %d\n", ret);
return ret;
}
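The O_DIRECT branch of ceph_sync_read() above pins the user pages behind the current iov_iter segment and derives the page count from the byte count n plus the offset 'start' of the data within the first page. A small sketch of that page-count rounding, with PAGE_SIZE fixed to 4096 purely for illustration (not part of the patch):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL

/* Pages spanned by 'n' bytes that begin 'start' bytes into the first page:
 * the same rounding applied after iov_iter_get_pages_alloc() above. */
static unsigned long pages_spanned(size_t n, size_t start)
{
	return (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
}

int main(void)
{
	printf("%lu\n", pages_spanned(4096, 0));  /* 1: exactly one page */
	printf("%lu\n", pages_spanned(4096, 1));  /* 2: spills into a second page */
	printf("%lu\n", pages_spanned(10, 4090)); /* 2: short buffer across a boundary */
	return 0;
}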
/*
- * Write commit callback, called if we requested both an ACK and
- * ONDISK commit reply from the OSD.
+ * Write commit request unsafe callback, called to tell us when a
+ * request is unsafe (that is, in flight--has been handed to the
+ * messenger to send to its target osd). It is called again when
+ * we've received a response message indicating the request is
+ * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
+ * is completed early (and unsuccessfully) due to a timeout or
+ * interrupt.
+ *
+ * This is used if we requested both an ACK and ONDISK commit reply
+ * from the OSD.
*/
-static void sync_write_commit(struct ceph_osd_request *req,
- struct ceph_msg *msg)
+static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
{
struct ceph_inode_info *ci = ceph_inode(req->r_inode);
- dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
- spin_lock(&ci->i_unsafe_lock);
- list_del_init(&req->r_unsafe_item);
- spin_unlock(&ci->i_unsafe_lock);
- ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+ dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
+ unsafe ? "un" : "");
+ if (unsafe) {
+ ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_item,
+ &ci->i_unsafe_writes);
+ spin_unlock(&ci->i_unsafe_lock);
+ } else {
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+ }
}
+
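The reworked unsafe callback above is invoked twice per request: once with unsafe=true when the write goes in flight (take a CEPH_CAP_FILE_WR reference and put the request on i_unsafe_writes) and once with unsafe=false when the OSD reports it on disk (undo both). A tiny sketch of that symmetric pairing around plain counters, purely illustrative and involving no Ceph structures:

#include <stdio.h>

static int wr_cap_refs;   /* stand-in for the FILE_WR cap reference count */
static int unsafe_writes; /* stand-in for the i_unsafe_writes list length */

static void write_unsafe_cb(int unsafe)
{
	if (unsafe) {          /* request handed to the messenger */
		wr_cap_refs++;
		unsafe_writes++;
	} else {               /* ONDISK reply, or early completion */
		unsafe_writes--;
		wr_cap_refs--;
	}
}

int main(void)
{
	write_unsafe_cb(1);
	write_unsafe_cb(0);
	printf("%d %d\n", wr_cap_refs, unsafe_writes); /* 0 0 */
	return 0;
}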
/*
- * Synchronous write, straight from __user pointer or user pages (if
- * O_DIRECT).
+ * Synchronous write, straight from __user pointer or user pages.
*
* If write spans object boundary, just do multiple writes. (For a
* correct atomic write, we should e.g. take write locks on all
* objects, rollback on failure, etc.)
*/
-static ssize_t ceph_sync_write(struct file *file, const char __user *data,
- size_t left, loff_t *offset)
+static ssize_t
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_snap_context *snapc;
+ struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
int num_pages;
- long long unsigned pos;
- u64 len;
int written = 0;
int flags;
- int do_sync = 0;
int check_caps = 0;
- int page_align, io_align;
- unsigned long buf_align;
int ret;
struct timespec mtime = CURRENT_TIME;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
- if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
+ if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_write on file %p %lld~%u %s\n", file, *offset,
- (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
-
- if (file->f_flags & O_APPEND)
- pos = i_size_read(inode);
- else
- pos = *offset;
+ dout("sync_direct_write on file %p %lld~%u\n", file, pos,
+ (unsigned)count);
- io_align = pos & ~PAGE_MASK;
- buf_align = (unsigned long)data & ~PAGE_MASK;
-
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
+ ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
if (ret < 0)
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_CACHE_SHIFT,
- (pos + left) >> PAGE_CACHE_SHIFT);
+ (pos + count) >> PAGE_CACHE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
- if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
- flags |= CEPH_OSD_FLAG_ACK;
- else
- do_sync = 1;
- /*
- * we may need to do multiple writes here if we span an object
- * boundary. this isn't atomic, unfortunately. :(
- */
-more:
- len = left;
- if (file->f_flags & O_DIRECT) {
- /* write from beginning of first page, regardless of
- io alignment */
- page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
- num_pages = calc_pages_for((unsigned long)data, len);
- } else {
- page_align = pos & ~PAGE_MASK;
- num_pages = calc_pages_for(pos, len);
- }
- req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- ceph_vino(inode), pos, &len,
- CEPH_OSD_OP_WRITE, flags,
- ci->i_snap_realm->cached_context,
- do_sync,
- ci->i_truncate_seq, ci->i_truncate_size,
- &mtime, false, 2, page_align);
- if (!req)
- return -ENOMEM;
+ while (iov_iter_count(from) > 0) {
+ u64 len = iov_iter_single_seg_count(from);
+ size_t start;
+ ssize_t n;
+
+ snapc = ci->i_snap_realm->cached_context;
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, pos, &len,
+ 2, /* include a 'startsync' command */
+ CEPH_OSD_OP_WRITE, flags, snapc,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
- if (file->f_flags & O_DIRECT) {
- pages = ceph_get_direct_page_vector(data, num_pages, false);
- if (IS_ERR(pages)) {
- ret = PTR_ERR(pages);
- goto out;
+ n = iov_iter_get_pages_alloc(from, &pages, len, &start);
+ if (unlikely(n < 0)) {
+ ret = n;
+ ceph_osdc_put_request(req);
+ break;
}
+ num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* throw out any page cache pages in this range. this
* may block.
*/
truncate_inode_pages_range(inode->i_mapping, pos,
- (pos+len) | (PAGE_CACHE_SIZE-1));
- } else {
+ (pos+n) | (PAGE_CACHE_SIZE-1));
+ osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
+ false, false);
+
+ /* BUG_ON(vino.snap != CEPH_NOSNAP); */
+ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
+
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+ ceph_put_page_vector(pages, num_pages, false);
+
+ ceph_osdc_put_request(req);
+ if (ret)
+ break;
+ pos += n;
+ written += n;
+ iov_iter_advance(from, n);
+
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+ }
+
+ if (ret != -EOLDSNAPC && written > 0) {
+ iocb->ki_pos = pos;
+ ret = written;
+ }
+ return ret;
+}
+
+
+/*
+ * Synchronous write, straight from __user pointer or user pages.
+ *
+ * If write spans object boundary, just do multiple writes. (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_snap_context *snapc;
+ struct ceph_vino vino;
+ struct ceph_osd_request *req;
+ struct page **pages;
+ u64 len;
+ int num_pages;
+ int written = 0;
+ int flags;
+ int check_caps = 0;
+ int ret;
+ struct timespec mtime = CURRENT_TIME;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
+
+ if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count);
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ if (ret < 0)
+ return ret;
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + count) >> PAGE_CACHE_SHIFT);
+ if (ret < 0)
+ dout("invalidate_inode_pages2_range returned %d\n", ret);
+
+ flags = CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ACK;
+
+ while ((len = iov_iter_count(from)) > 0) {
+ size_t left;
+ int n;
+
+ snapc = ci->i_snap_realm->cached_context;
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, pos, &len, 1,
+ CEPH_OSD_OP_WRITE, flags, snapc,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ /*
+ * write from beginning of first page,
+ * regardless of io alignment
+ */
+ num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
}
- ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
+
+ left = len;
+ for (n = 0; n < num_pages; n++) {
+ size_t plen = min_t(size_t, left, PAGE_SIZE);
+ ret = copy_page_from_iter(pages[n], 0, plen, from);
+ if (ret != plen) {
+ ret = -EFAULT;
+ break;
+ }
+ left -= ret;
+ }
+
if (ret < 0) {
ceph_release_page_vector(pages, num_pages);
goto out;
}
- if ((file->f_flags & O_SYNC) == 0) {
- /* get a second commit callback */
- req->r_safe_callback = sync_write_commit;
- req->r_own_pages = 1;
- }
- }
- req->r_pages = pages;
- req->r_num_pages = num_pages;
- req->r_inode = inode;
+ /* get a second commit callback */
+ req->r_unsafe_callback = ceph_sync_write_unsafe;
+ req->r_inode = inode;
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret) {
- if (req->r_safe_callback) {
- /*
- * Add to inode unsafe list only after we
- * start_request so that a tid has been assigned.
- */
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_item,
- &ci->i_unsafe_writes);
- spin_unlock(&ci->i_unsafe_lock);
- ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
- }
-
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
- if (ret < 0 && req->r_safe_callback) {
- spin_lock(&ci->i_unsafe_lock);
- list_del_init(&req->r_unsafe_item);
- spin_unlock(&ci->i_unsafe_lock);
- ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
- }
- }
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+ false, true);
- if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages, false);
- else if (file->f_flags & O_SYNC)
- ceph_release_page_vector(pages, num_pages);
+ /* BUG_ON(vino.snap != CEPH_NOSNAP); */
+ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
+
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
out:
- ceph_osdc_put_request(req);
- if (ret == 0) {
- pos += len;
- written += len;
- left -= len;
- if (left)
- goto more;
+ ceph_osdc_put_request(req);
+ if (ret == 0) {
+ pos += len;
+ written += len;
+
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+ } else
+ break;
+ }
+ if (ret != -EOLDSNAPC && written > 0) {
ret = written;
- *offset = pos;
- if (pos > i_size_read(inode))
- check_caps = ceph_inode_set_size(inode, pos);
- if (check_caps)
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
- NULL);
+ iocb->ki_pos = pos;
}
return ret;
}
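The buffered ceph_sync_write() path above copies the iov_iter into a freshly allocated page vector one page at a time, each chunk clamped to PAGE_SIZE. A standalone sketch of that chunking loop, using a flat source buffer and memcpy in place of an iov_iter and copy_page_from_iter() (an assumption made only to keep the sketch self-contained):

#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Copy 'len' bytes from 'src' into page-sized buffers, mirroring the
 * min(left, PAGE_SIZE) chunking in the write path above. */
static void copy_into_pages(char pages[][PAGE_SIZE], size_t num_pages,
			    const char *src, size_t len)
{
	size_t left = len;
	for (size_t n = 0; n < num_pages && left; n++) {
		size_t plen = left < PAGE_SIZE ? left : PAGE_SIZE;
		memcpy(pages[n], src, plen);
		src += plen;
		left -= plen;
	}
}

int main(void)
{
	static char pages[2][PAGE_SIZE];
	char buf[5000] = "hello";
	copy_into_pages(pages, 2, buf, sizeof(buf)); /* 4096 + 904 bytes */
	printf("%s\n", pages[0]);                    /* "hello" landed in page 0 */
	return 0;
}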
@@ -611,61 +776,69 @@ out:
*
* Hmm, the sync read case isn't actually async... should it be?
*/
-static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *filp = iocb->ki_filp;
struct ceph_file_info *fi = filp->private_data;
- loff_t *ppos = &iocb->ki_pos;
- size_t len = iov->iov_len;
- struct inode *inode = filp->f_dentry->d_inode;
+ size_t len = iocb->ki_nbytes;
+ struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
- void __user *base = iov->iov_base;
ssize_t ret;
int want, got = 0;
int checkeof = 0, read = 0;
- dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
- inode, ceph_vinop(inode), pos, (unsigned)len, inode);
again:
- __ceph_do_pending_vmtruncate(inode);
+ dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
if (ret < 0)
- goto out;
- dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)len,
- ceph_cap_string(got));
+ return ret;
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS))
+ (fi->flags & CEPH_F_SYNC)) {
+
+ dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
+
/* hmm, this isn't really async... */
- ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
- else
- ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ ret = ceph_sync_read(iocb, to, &checkeof);
+ } else {
+ dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
-out:
+ ret = generic_file_read_iter(iocb, to);
+ }
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
ceph_put_cap_refs(ci, got);
if (checkeof && ret >= 0) {
- int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+ int statret = ceph_do_getattr(inode,
+ CEPH_STAT_CAP_SIZE);
/* hit EOF or hole? */
- if (statret == 0 && *ppos < inode->i_size) {
- dout("aio_read sync_read hit hole, reading more\n");
+ if (statret == 0 && iocb->ki_pos < inode->i_size &&
+ ret < len) {
+ dout("sync_read hit hole, ppos %lld < size %lld"
+ ", reading more\n", iocb->ki_pos,
+ inode->i_size);
+
+ iov_iter_advance(to, ret);
read += ret;
- base += ret;
len -= ret;
checkeof = 0;
goto again;
}
}
+
if (ret >= 0)
ret += read;
@@ -682,95 +855,150 @@ out:
*
* If we are near ENOSPC, write synchronously.
*/
-static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct ceph_file_info *fi = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
- loff_t endoff = pos + iov->iov_len;
- int want, got = 0;
- int ret, err;
+ ssize_t count = iov_iter_count(from), written = 0;
+ int err, want, got;
+ loff_t pos = iocb->ki_pos;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
+ mutex_lock(&inode->i_mutex);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = file->f_mapping->backing_dev_info;
+
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out;
+
+ if (count == 0)
+ goto out;
+ iov_iter_truncate(from, count);
+
+ err = file_remove_suid(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
retry_snap:
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
- return -ENOSPC;
- __ceph_do_pending_vmtruncate(inode);
- dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- inode->i_size);
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, count, inode->i_size);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_BUFFER;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
- if (ret < 0)
+ got = 0;
+ err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count);
+ if (err < 0)
goto out;
- dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
+ dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
- ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
- &iocb->ki_pos);
- } else {
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
- if ((ret >= 0 || ret == -EIOCBQUEUED) &&
- ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
- || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
- err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
- if (err < 0)
- ret = err;
+ (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ struct iov_iter data;
+ mutex_unlock(&inode->i_mutex);
+ /* we might need to revert to that point */
+ data = *from;
+ if (file->f_flags & O_DIRECT)
+ written = ceph_sync_direct_write(iocb, &data);
+ else
+ written = ceph_sync_write(iocb, &data);
+ if (written == -EOLDSNAPC) {
+ dout("aio_write %p %llx.%llx %llu~%u"
+ "got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode),
+ pos, (unsigned)count);
+ mutex_lock(&inode->i_mutex);
+ goto retry_snap;
}
+ if (written > 0)
+ iov_iter_advance(from, written);
+ } else {
+ loff_t old_size = inode->i_size;
+ /*
+ * No need to acquire the i_truncate_mutex, because
+ * the MDS revokes Fwb caps before sending a truncate
+ * message to us. We can't get the Fwb cap while there
+ * is a pending vmtruncate, so write and vmtruncate
+ * cannot run at the same time.
+ */
+ written = generic_perform_write(file, from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ if (inode->i_size > old_size)
+ ceph_fscache_update_objectsize(inode);
+ mutex_unlock(&inode->i_mutex);
}
- if (ret >= 0) {
- spin_lock(&inode->i_lock);
- __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&inode->i_lock);
+
+ if (written >= 0) {
+ int dirty;
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
}
-out:
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ inode, ceph_vinop(inode), pos, (unsigned)count,
ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
- if (ret == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
- goto retry_snap;
+ if (written >= 0 &&
+ ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
+ err = vfs_fsync_range(file, pos, pos + written - 1, 1);
+ if (err < 0)
+ written = err;
}
- return ret;
+ goto out_unlocked;
+
+out:
+ mutex_unlock(&inode->i_mutex);
+out_unlocked:
+ current->backing_dev_info = NULL;
+ return written ? written : err;
}
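The split at the top of ceph_write_iter() above sends a write down the synchronous path whenever buffered-write caps are missing, the file is O_DIRECT, or sync I/O was requested on the file; otherwise it goes through generic_perform_write(). A hedged sketch of that predicate, with made-up bit values standing in for the kernel's cap and flag constants (not part of the patch):

#include <stdio.h>

#define CAP_FILE_BUFFER 0x1  /* illustrative bit values, not the kernel's */
#define CAP_FILE_LAZYIO 0x2
#define FL_O_DIRECT     0x4
#define FL_CEPH_F_SYNC  0x8

static int use_sync_write(unsigned got, unsigned file_flags, unsigned fi_flags)
{
	return (got & (CAP_FILE_BUFFER | CAP_FILE_LAZYIO)) == 0 ||
	       (file_flags & FL_O_DIRECT) ||
	       (fi_flags & FL_CEPH_F_SYNC);
}

int main(void)
{
	printf("%d\n", use_sync_write(CAP_FILE_BUFFER, 0, 0));          /* 0: buffered */
	printf("%d\n", use_sync_write(CAP_FILE_BUFFER, FL_O_DIRECT, 0)); /* 1: sync */
	printf("%d\n", use_sync_write(0, 0, 0));                         /* 1: no caps */
	return 0;
}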
/*
* llseek. be sure to verify file size on SEEK_END.
*/
-static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
+static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int ret;
mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode);
- switch (origin) {
- case SEEK_END:
+
+ if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
if (ret < 0) {
offset = ret;
goto out;
}
+ }
+
+ switch (whence) {
+ case SEEK_END:
offset += inode->i_size;
break;
case SEEK_CUR:
@@ -786,39 +1014,239 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
}
offset += file->f_pos;
break;
+ case SEEK_DATA:
+ if (offset >= inode->i_size) {
+ ret = -ENXIO;
+ goto out;
+ }
+ break;
+ case SEEK_HOLE:
+ if (offset >= inode->i_size) {
+ ret = -ENXIO;
+ goto out;
+ }
+ offset = inode->i_size;
+ break;
}
- if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
- offset = -EINVAL;
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
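The new SEEK_DATA/SEEK_HOLE cases in ceph_llseek() above treat the file as one solid data extent: an offset at or past i_size yields -ENXIO, any offset below it is data, and the only hole starts at i_size. A small standalone sketch of that decision (names and the data/hole flag convention are local to the sketch, not part of the patch):

#include <errno.h>
#include <stdio.h>

/* data_or_hole == 0: SEEK_DATA semantics, == 1: SEEK_HOLE semantics. */
static long long seek_data_hole(long long offset, long long i_size, int data_or_hole)
{
	if (offset >= i_size)
		return -ENXIO;            /* nothing past EOF */
	return data_or_hole ? i_size      /* the only hole starts at EOF */
			    : offset;     /* everything below EOF counts as data */
}

int main(void)
{
	printf("%lld\n", seek_data_hole(100, 4096, 0));  /* 100 */
	printf("%lld\n", seek_data_hole(100, 4096, 1));  /* 4096 */
	printf("%lld\n", seek_data_hole(5000, 4096, 0)); /* -ENXIO */
	return 0;
}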
+static inline void ceph_zero_partial_page(
+ struct inode *inode, loff_t offset, unsigned size)
+{
+ struct page *page;
+ pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+
+ page = find_lock_page(inode->i_mapping, index);
+ if (page) {
+ wait_on_page_writeback(page);
+ zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+}
+
+static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+ loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+ if (offset < nearly) {
+ loff_t size = nearly - offset;
+ if (length < size)
+ size = length;
+ ceph_zero_partial_page(inode, offset, size);
+ offset += size;
+ length -= size;
+ }
+ if (length >= PAGE_CACHE_SIZE) {
+ loff_t size = round_down(length, PAGE_CACHE_SIZE);
+ truncate_pagecache_range(inode, offset, offset + size - 1);
+ offset += size;
+ length -= size;
+ }
+ if (length)
+ ceph_zero_partial_page(inode, offset, length);
+}
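ceph_zero_pagecache_range() above splits an arbitrary byte range into an unaligned head (zeroed in place within its page), a run of whole pages (dropped from the page cache), and an unaligned tail. A standalone sketch of the same three-way split that only prints the sub-ranges rather than touching a page cache (not part of the patch):

#include <stdio.h>

#define PAGE_SIZE 4096LL

static long long round_up_ll(long long x, long long a)   { return ((x + a - 1) / a) * a; }
static long long round_down_ll(long long x, long long a) { return (x / a) * a; }

static void split_range(long long offset, long long length)
{
	long long nearly = round_up_ll(offset, PAGE_SIZE);
	if (offset < nearly) {                        /* unaligned head */
		long long size = nearly - offset;
		if (length < size)
			size = length;
		printf("zero partial page: %lld+%lld\n", offset, size);
		offset += size;
		length -= size;
	}
	if (length >= PAGE_SIZE) {                    /* whole pages */
		long long size = round_down_ll(length, PAGE_SIZE);
		printf("truncate pagecache: %lld..%lld\n", offset, offset + size - 1);
		offset += size;
		length -= size;
	}
	if (length)                                   /* unaligned tail */
		printf("zero partial page: %lld+%lld\n", offset, length);
}

int main(void)
{
	/* head 1000+3096, whole pages 4096..8191, tail 8192+2808 */
	split_range(1000, 10000);
	return 0;
}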
+
+static int ceph_zero_partial_object(struct inode *inode,
+ loff_t offset, loff_t *length)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_request *req;
+ int ret = 0;
+ loff_t zero = 0;
+ int op;
+
+ if (!length) {
+ op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
+ length = &zero;
+ } else {
+ op = CEPH_OSD_OP_ZERO;
+ }
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode),
+ offset, length,
+ 1, op,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ NULL, 0, 0, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
goto out;
}
- /* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
+ ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
+ &inode->i_mtime);
+
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret) {
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ if (ret == -ENOENT)
+ ret = 0;
}
+ ceph_osdc_put_request(req);
out:
+ return ret;
+}
+
+static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
+{
+ int ret = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
+ s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+ u64 object_set_size = object_size * stripe_count;
+ u64 nearly, t;
+
+ /* round offset up to next period boundary */
+ nearly = offset + object_set_size - 1;
+ t = nearly;
+ nearly -= do_div(t, object_set_size);
+
+ while (length && offset < nearly) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ while (length >= object_set_size) {
+ int i;
+ loff_t pos = offset;
+ for (i = 0; i < stripe_count; ++i) {
+ ret = ceph_zero_partial_object(inode, pos, NULL);
+ if (ret < 0)
+ return ret;
+ pos += stripe_unit;
+ }
+ offset += object_set_size;
+ length -= object_set_size;
+ }
+ while (length) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ return ret;
+}
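ceph_zero_objects() above works in units of an "object set": object_size * stripe_count bytes, the span after which the striping pattern repeats. The offset is first rounded up to the next object-set boundary, whole object sets are then punched one object at a time, and any remainder is handled per object. A hedged sketch of just the boundary arithmetic, with made-up layout values (not part of the patch):

#include <stdio.h>
#include <stdint.h>

/* Round 'offset' up to the next multiple of 'object_set_size', as done
 * before the whole-object-set loop above. */
static uint64_t next_period_boundary(uint64_t offset, uint64_t object_set_size)
{
	uint64_t nearly = offset + object_set_size - 1;
	return nearly - (nearly % object_set_size);
}

int main(void)
{
	uint64_t object_size = 4 << 20;             /* 4 MiB objects (example layout) */
	uint32_t stripe_count = 2;
	uint64_t set = object_size * stripe_count;  /* 8 MiB per period */

	printf("%llu\n", (unsigned long long)next_period_boundary(0, set));   /* 0 */
	printf("%llu\n", (unsigned long long)next_period_boundary(1, set));   /* 8388608 */
	printf("%llu\n", (unsigned long long)next_period_boundary(set, set)); /* 8388608 */
	return 0;
}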
+
+static long ceph_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t length)
+{
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc =
+ &ceph_inode_to_client(inode)->client->osdc;
+ int want, got = 0;
+ int dirty;
+ int ret = 0;
+ loff_t endoff = 0;
+ loff_t size;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto unlock;
+ }
+
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) &&
+ !(mode & FALLOC_FL_PUNCH_HOLE)) {
+ ret = -ENOSPC;
+ goto unlock;
+ }
+
+ size = i_size_read(inode);
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ endoff = offset + length;
+
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
+ if (ret < 0)
+ goto unlock;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (offset < size)
+ ceph_zero_pagecache_range(inode, offset, length);
+ ret = ceph_zero_objects(inode, offset, length);
+ } else if (endoff > size) {
+ truncate_pagecache_range(inode, size, -1);
+ if (ceph_inode_set_size(inode, endoff))
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY, NULL);
+ }
+
+ if (!ret) {
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ ceph_put_cap_refs(ci, got);
+unlock:
mutex_unlock(&inode->i_mutex);
- return offset;
+ return ret;
}
const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
.llseek = ceph_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = ceph_aio_read,
- .aio_write = ceph_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = ceph_read_iter,
+ .write_iter = ceph_write_iter,
.mmap = ceph_mmap,
.fsync = ceph_fsync,
.lock = ceph_lock,
.flock = ceph_flock,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
+ .fallocate = ceph_fallocate,
};
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b54c97da1c4..04c89c266ce 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -9,10 +9,12 @@
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/vmalloc.h>
-#include <linux/pagevec.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/decode.h>
/*
@@ -95,6 +97,8 @@ const struct inode_operations ceph_file_iops = {
.getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
.removexattr = ceph_removexattr,
+ .get_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
};
@@ -176,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
* specified, copy the frag delegation info to the caller if
* it is present.
*/
-u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
- struct ceph_inode_frag *pfrag,
- int *found)
+static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
{
u32 t = ceph_frag_make(0, 0);
struct ceph_inode_frag *frag;
@@ -188,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
if (found)
*found = 0;
- mutex_lock(&ci->i_fragtree_mutex);
while (1) {
WARN_ON(!ceph_frag_contains_value(t, v));
frag = __ceph_find_frag(ci, t);
@@ -217,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
}
dout("choose_frag(%x) = %x\n", v, t);
- mutex_unlock(&ci->i_fragtree_mutex);
return t;
}
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
+{
+ u32 ret;
+ mutex_lock(&ci->i_fragtree_mutex);
+ ret = __ceph_choose_frag(ci, v, pfrag, found);
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return ret;
+}
+
/*
* Process dirfrag (delegation) info from the mds. Include leaf
* fragment in tree ONLY if ndist > 0. Otherwise, only
@@ -234,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode,
u32 id = le32_to_cpu(dirinfo->frag);
int mds = le32_to_cpu(dirinfo->auth);
int ndist = le32_to_cpu(dirinfo->ndist);
+ int diri_auth = -1;
int i;
int err = 0;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ diri_auth = ci->i_auth_cap->mds;
+ spin_unlock(&ci->i_ceph_lock);
+
mutex_lock(&ci->i_fragtree_mutex);
- if (ndist == 0) {
+ if (ndist == 0 && mds == diri_auth) {
/* no delegation info needed. */
frag = __ceph_find_frag(ci, id);
if (!frag)
@@ -283,6 +300,75 @@ out:
return err;
}
+static int ceph_fill_fragtree(struct inode *inode,
+ struct ceph_frag_tree_head *fragtree,
+ struct ceph_mds_reply_dirfrag *dirinfo)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ struct rb_node *rb_node;
+ int i;
+ u32 id, nsplits;
+ bool update = false;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ nsplits = le32_to_cpu(fragtree->nsplits);
+ if (nsplits) {
+ i = prandom_u32() % nsplits;
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ if (!__ceph_find_frag(ci, id))
+ update = true;
+ } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
+ rb_node = rb_first(&ci->i_fragtree);
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
+ update = true;
+ }
+ if (!update && dirinfo) {
+ id = le32_to_cpu(dirinfo->frag);
+ if (id != __ceph_choose_frag(ci, id, NULL, NULL))
+ update = true;
+ }
+ if (!update)
+ goto out_unlock;
+
+ dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+ rb_node = rb_first(&ci->i_fragtree);
+ for (i = 0; i < nsplits; i++) {
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ frag = NULL;
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (ceph_frag_compare(frag->frag, id) >= 0) {
+ if (frag->frag != id)
+ frag = NULL;
+ else
+ rb_node = rb_next(rb_node);
+ break;
+ }
+ rb_node = rb_next(rb_node);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ frag = NULL;
+ }
+ if (!frag) {
+ frag = __get_or_create_frag(ci, id);
+ if (IS_ERR(frag))
+ continue;
+ }
+ frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+ dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ }
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ rb_node = rb_next(rb_node);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ }
+out_unlock:
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return 0;
+}
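The new ceph_fill_fragtree() above merges the MDS-provided split list into the in-memory fragment tree with one forward walk: existing entries below the next incoming id are erased, matching entries are updated in place, missing ids are created, and anything left after the list is exhausted is erased. A simplified sketch of the same merge over sorted integer arrays, assuming both inputs are sorted ascending as the rbtree walk and split list are in practice (not part of the patch):

#include <stdio.h>

/* One forward pass merging the new split list into the existing set. */
static void merge_frags(const int *tree, int ntree, const int *incoming, int nin)
{
	int t = 0;
	for (int i = 0; i < nin; i++) {
		while (t < ntree && tree[t] < incoming[i])
			printf("drop %d\n", tree[t++]);    /* stale entry */
		if (t < ntree && tree[t] == incoming[i])
			printf("update %d\n", tree[t++]);  /* reuse existing entry */
		else
			printf("create %d\n", incoming[i]);
	}
	while (t < ntree)
		printf("drop %d\n", tree[t++]);            /* leftovers are dropped */
}

int main(void)
{
	int tree[]     = { 1, 3, 5, 9 };     /* existing fragments, sorted */
	int incoming[] = { 3, 4, 9 };        /* new split list, sorted */
	merge_frags(tree, 4, incoming, 3);   /* drop 1, update 3, create 4, drop 5, update 9 */
	return 0;
}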
/*
* initialize a newly allocated inode.
@@ -298,10 +384,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
dout("alloc_inode %p\n", &ci->vfs_inode);
+ spin_lock_init(&ci->i_ceph_lock);
+
ci->i_version = 0;
ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0;
- ci->i_release_count = 0;
+ atomic_set(&ci->i_release_count, 1);
+ atomic_set(&ci->i_complete_count, 0);
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
@@ -332,9 +421,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
- ci->i_cap_exporting_mds = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_issued = 0;
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
@@ -342,6 +428,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
ci->i_nr_by_mode[i] = 0;
+ mutex_init(&ci->i_truncate_mutex);
ci->i_truncate_seq = 0;
ci->i_truncate_size = 0;
ci->i_truncate_pending = 0;
@@ -355,6 +442,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_rd_ref = 0;
ci->i_rdcache_ref = 0;
ci->i_wr_ref = 0;
+ ci->i_wb_ref = 0;
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
ci->i_shared_gen = 0;
@@ -374,6 +462,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+ ceph_fscache_inode_init(ci);
+
return &ci->vfs_inode;
}
@@ -382,7 +472,6 @@ static void ceph_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct ceph_inode_info *ci = ceph_inode(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(ceph_inode_cachep, ci);
}
@@ -394,11 +483,13 @@ void ceph_destroy_inode(struct inode *inode)
dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ ceph_fscache_unregister_inode_cookie(ci);
+
ceph_queue_caps_release(inode);
/*
* we may still have a snap_realm reference if there are stray
- * caps in i_cap_exporting_issued or i_snap_caps.
+ * caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
@@ -428,6 +519,15 @@ void ceph_destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, ceph_i_callback);
}
+int ceph_drop_inode(struct inode *inode)
+{
+ /*
+ * A positive dentry and its corresponding inode always come together
+ * in the MDS reply, so there is no need to keep the inode in the
+ * cache after dropping all its aliases.
+ */
+ return 1;
+}
/*
* Helpers to fill in size, ctime, mtime, and atime. We have to be
@@ -453,16 +553,20 @@ int ceph_fill_file_size(struct inode *inode, int issued,
dout("truncate_seq %u -> %u\n",
ci->i_truncate_seq, truncate_seq);
ci->i_truncate_seq = truncate_seq;
+
+ /* the MDS should have revoked these caps */
+ WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_WR |
+ CEPH_CAP_FILE_LAZYIO));
/*
* If we hold relevant caps, or in the case where we're
* not the only client referencing this file and we
* don't hold those caps, then we need to check whether
* the file is either opened or mmaped
*/
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
- CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
- CEPH_CAP_FILE_EXCL|
- CEPH_CAP_FILE_LAZYIO)) ||
+ if ((issued & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_BUFFER)) ||
mapping_mapped(inode->i_mapping) ||
__ceph_caps_file_wanted(ci)) {
ci->i_truncate_pending++;
@@ -476,6 +580,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
truncate_size);
ci->i_truncate_size = truncate_size;
}
+
+ if (queue_trunc)
+ ceph_fscache_invalidate(inode);
+
return queue_trunc;
}
@@ -556,20 +664,26 @@ static int fill_inode(struct inode *inode,
unsigned long ttl_from, int cap_fmode,
struct ceph_cap_reservation *caps_reservation)
{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
- int i;
- int issued, implemented;
+ int issued = 0, implemented, new_issued;
struct timespec mtime, atime, ctime;
- u32 nsplits;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_cap *new_cap = NULL;
int err = 0;
- int queue_trunc = 0;
+ bool wake = false;
+ bool queue_trunc = false;
+ bool new_version = false;
dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
+ /* prealloc new cap struct */
+ if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP)
+ new_cap = ceph_get_cap(mdsc, caps_reservation);
+
/*
* prealloc xattr data, if it looks like we'll need it. only
* if len > 4 (meaning there are actually xattrs; the first 4
@@ -582,7 +696,7 @@ static int fill_inode(struct inode *inode,
iinfo->xattr_len);
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
/*
* provided version will be odd if inode value is projected,
@@ -595,51 +709,60 @@ static int fill_inode(struct inode *inode,
* 3 2 skip
* 3 3 update
*/
- if (le64_to_cpu(info->version) > 0 &&
- (ci->i_version & ~1) >= le64_to_cpu(info->version))
- goto no_change;
+ if (ci->i_version == 0 ||
+ ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ le64_to_cpu(info->version) > (ci->i_version & ~1)))
+ new_version = true;
issued = __ceph_caps_issued(ci, &implemented);
issued |= implemented | __ceph_caps_dirty(ci);
+ new_issued = ~issued & le32_to_cpu(info->cap.caps);
/* update inode */
ci->i_version = le64_to_cpu(info->version);
inode->i_version++;
inode->i_rdev = le32_to_cpu(info->rdev);
+ inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(info->mode);
- inode->i_uid = le32_to_cpu(info->uid);
- inode->i_gid = le32_to_cpu(info->gid);
+ inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
+ inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- inode->i_uid, inode->i_gid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
- inode->i_nlink = le32_to_cpu(info->nlink);
-
- /* be careful with mtime, atime, size */
- ceph_decode_timespec(&atime, &info->atime);
- ceph_decode_timespec(&mtime, &info->mtime);
- ceph_decode_timespec(&ctime, &info->ctime);
- queue_trunc = ceph_fill_file_size(inode, issued,
- le32_to_cpu(info->truncate_seq),
- le64_to_cpu(info->truncate_size),
- le64_to_cpu(info->size));
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(info->time_warp_seq),
- &ctime, &mtime, &atime);
-
- /* only update max_size on auth cap */
- if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
- ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
- ci->i_max_size = le64_to_cpu(info->max_size);
+ if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0)
+ set_nlink(inode, le32_to_cpu(info->nlink));
+
+ if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
+ /* be careful with mtime, atime, size */
+ ceph_decode_timespec(&atime, &info->atime);
+ ceph_decode_timespec(&mtime, &info->mtime);
+ ceph_decode_timespec(&ctime, &info->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(info->time_warp_seq),
+ &ctime, &mtime, &atime);
}
- ci->i_layout = info->layout;
- inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ if (new_version ||
+ (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ ci->i_layout = info->layout;
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(info->truncate_seq),
+ le64_to_cpu(info->truncate_size),
+ le64_to_cpu(info->size));
+ /* only update max_size on auth cap */
+ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ ci->i_max_size != le64_to_cpu(info->max_size)) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size,
+ le64_to_cpu(info->max_size));
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ }
+ }
/* xattrs */
/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
@@ -652,6 +775,7 @@ static int fill_inode(struct inode *inode,
memcpy(ci->i_xattrs.blob->vec.iov_base,
iinfo->xattr_data, iinfo->xattr_len);
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+ ceph_forget_all_cached_acls(inode);
xattr_blob = NULL;
}
@@ -674,20 +798,21 @@ static int fill_inode(struct inode *inode,
case S_IFLNK:
inode->i_op = &ceph_symlink_iops;
if (!ci->i_symlink) {
- int symlen = iinfo->symlink_len;
+ u32 symlen = iinfo->symlink_len;
char *sym;
- BUG_ON(symlen != inode->i_size);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
+
+ err = -EINVAL;
+ if (WARN_ON(symlen != inode->i_size))
+ goto out;
err = -ENOMEM;
- sym = kmalloc(symlen+1, GFP_NOFS);
+ sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
if (!sym)
goto out;
- memcpy(sym, iinfo->symlink, symlen);
- sym[symlen] = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (!ci->i_symlink)
ci->i_symlink = sym;
else
@@ -706,44 +831,22 @@ static int fill_inode(struct inode *inode,
ci->i_rfiles = le64_to_cpu(info->rfiles);
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
ceph_decode_timespec(&ci->i_rctime, &info->rctime);
-
- /* set dir completion flag? */
- if (ci->i_files == 0 && ci->i_subdirs == 0 &&
- ceph_snap(inode) == CEPH_NOSNAP &&
- (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
- (issued & CEPH_CAP_FILE_EXCL) == 0 &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
- dout(" marking %p complete (empty)\n", inode);
- /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
- ci->i_max_offset = 2;
- }
break;
default:
pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
ceph_vinop(inode), inode->i_mode);
}
-no_change:
- spin_unlock(&inode->i_lock);
-
- /* queue truncate if we saw i_size decrease */
- if (queue_trunc)
- ceph_queue_vmtruncate(inode);
-
- /* populate frag tree */
- /* FIXME: move me up, if/when version reflects fragtree changes */
- nsplits = le32_to_cpu(info->fragtree.nsplits);
- mutex_lock(&ci->i_fragtree_mutex);
- for (i = 0; i < nsplits; i++) {
- u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
- struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
-
- if (IS_ERR(frag))
- continue;
- frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
- dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ /* set dir completion flag? */
+ if (S_ISDIR(inode->i_mode) &&
+ ci->i_files == 0 && ci->i_subdirs == 0 &&
+ ceph_snap(inode) == CEPH_NOSNAP &&
+ (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+ (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+ !__ceph_dir_is_complete(ci)) {
+ dout(" marking %p complete (empty)\n", inode);
+ __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
}
- mutex_unlock(&ci->i_fragtree_mutex);
/* were we issued a capability? */
if (info->cap.caps) {
@@ -756,30 +859,41 @@ no_change:
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
le64_to_cpu(info->cap.realm),
- info->cap.flags,
- caps_reservation);
+ info->cap.flags, &new_cap);
+ wake = true;
} else {
- spin_lock(&inode->i_lock);
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(le32_to_cpu(info->cap.caps)));
ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
if (cap_fmode >= 0)
__ceph_get_fmode(ci, cap_fmode);
- spin_unlock(&inode->i_lock);
}
} else if (cap_fmode >= 0) {
- pr_warning("mds issued no caps on %llx.%llx\n",
+ pr_warn("mds issued no caps on %llx.%llx\n",
ceph_vinop(inode));
__ceph_get_fmode(ci, cap_fmode);
}
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (wake)
+ wake_up_all(&ci->i_cap_wq);
+
+ /* queue truncate if we saw i_size decrease */
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+
+ /* populate frag tree */
+ if (S_ISDIR(inode->i_mode))
+ ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
/* update delegation info? */
if (dirinfo)
ceph_fill_dirfrag(inode, dirinfo);
err = 0;
-
out:
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
if (xattr_blob)
ceph_buffer_put(xattr_blob);
return err;
@@ -804,14 +918,14 @@ static void update_dentry_lease(struct dentry *dentry,
return;
spin_lock(&dentry->d_lock);
- dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
- dentry, le16_to_cpu(lease->mask), duration, ttl);
+ dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
+ dentry, duration, ttl);
/* make lease_rdcache_gen match directory */
dir = dentry->d_parent->d_inode;
di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
- if (lease->mask == 0)
+ if (duration == 0)
goto out_unlock;
if (di->lease_gen == session->s_cap_gen &&
@@ -836,37 +950,6 @@ out_unlock:
}
/*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
- struct dentry *dir = dn->d_parent;
- struct inode *inode = dn->d_parent->d_inode;
- struct ceph_dentry_info *di;
-
- BUG_ON(!inode);
-
- di = ceph_dentry(dn);
-
- spin_lock(&inode->i_lock);
- if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
- spin_unlock(&inode->i_lock);
- return;
- }
- di->offset = ceph_inode(inode)->i_max_offset++;
- spin_unlock(&inode->i_lock);
-
- spin_lock(&dir->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_u.d_child, &dir->d_subdirs);
- dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
- dn->d_u.d_child.prev, dn->d_u.d_child.next);
- spin_unlock(&dn->d_lock);
- spin_unlock(&dir->d_lock);
-}
-
-/*
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
*
@@ -875,7 +958,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
* the caller) if we fail.
*/
static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
- bool *prehash, bool set_offset)
+ bool *prehash)
{
struct dentry *realdn;
@@ -895,8 +978,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
"inode %p ino %llx.%llx\n",
- dn, dn->d_count,
- realdn, realdn->d_count,
+ dn, d_count(dn),
+ realdn, d_count(realdn),
realdn->d_inode, ceph_vinop(realdn->d_inode));
dput(dn);
dn = realdn;
@@ -907,8 +990,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
}
if ((!prehash || *prehash) && d_unhashed(dn))
d_rehash(dn);
- if (set_offset)
- ceph_set_dentry_offset(dn);
out:
return dn;
}
@@ -929,10 +1010,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
{
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL;
- struct ceph_mds_reply_inode *ininfo;
struct ceph_vino vino;
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
- int i = 0;
int err = 0;
dout("fill_trace %p is_dentry %d is_target %d\n", req,
@@ -982,11 +1061,87 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
if (rinfo->head->is_dentry) {
struct inode *dir = req->r_locked_dir;
- err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
- session, req->r_request_started, -1,
- &req->r_caps_reservation);
- if (err < 0)
- return err;
+ if (dir) {
+ err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+ session, req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (err < 0)
+ goto done;
+ } else {
+ WARN_ON_ONCE(1);
+ }
+
+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+ struct qstr dname;
+ struct dentry *dn, *parent;
+
+ BUG_ON(!rinfo->head->is_target);
+ BUG_ON(req->r_dentry);
+
+ parent = d_find_any_alias(dir);
+ BUG_ON(!parent);
+
+ dname.name = rinfo->dname;
+ dname.len = rinfo->dname_len;
+ dname.hash = full_name_hash(dname.name, dname.len);
+ vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+retry_lookup:
+ dn = d_lookup(parent, &dname);
+ dout("d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
+
+ if (!dn) {
+ dn = d_alloc(parent, &dname);
+ dout("d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (dn == NULL) {
+ dput(parent);
+ err = -ENOMEM;
+ goto done;
+ }
+ err = ceph_init_dentry(dn);
+ if (err < 0) {
+ dput(dn);
+ dput(parent);
+ goto done;
+ }
+ } else if (dn->d_inode &&
+ (ceph_ino(dn->d_inode) != vino.ino ||
+ ceph_snap(dn->d_inode) != vino.snap)) {
+ dout(" dn %p points to wrong inode %p\n",
+ dn, dn->d_inode);
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ }
+
+ req->r_dentry = dn;
+ dput(parent);
+ }
+ }
+
+ if (rinfo->head->is_target) {
+ vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+ in = ceph_get_inode(sb, vino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ goto done;
+ }
+ req->r_target_inode = in;
+
+ err = fill_inode(in, &rinfo->targeti, NULL,
+ session, req->r_request_started,
+ (!req->r_aborted && rinfo->head->result == 0) ?
+ req->r_fmode : -1,
+ &req->r_caps_reservation);
+ if (err < 0) {
+ pr_err("fill_inode badness %p %llx.%llx\n",
+ in, ceph_vinop(in));
+ goto done;
+ }
}
/*
@@ -994,6 +1149,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
* will have trouble splicing in the virtual snapdir later
*/
if (rinfo->head->is_dentry && !req->r_aborted &&
+ req->r_locked_dir &&
(rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
fsc->mount_options->snapdir_name,
req->r_dentry->d_name.len))) {
@@ -1021,14 +1177,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
/* do we have a dn lease? */
have_lease = have_dir_cap ||
- (le16_to_cpu(rinfo->dlease->mask) &
- CEPH_LOCK_DN);
-
+ le32_to_cpu(rinfo->dlease->duration_ms);
if (!have_lease)
dout("fill_trace no dentry lease or dir cap\n");
/* rename? */
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+ struct inode *olddir = req->r_old_dentry_dir;
+ BUG_ON(!olddir);
+
dout(" src %p '%.*s' dst %p '%.*s'\n",
req->r_old_dentry,
req->r_old_dentry->d_name.len,
@@ -1048,18 +1205,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- /*
- * d_move() puts the renamed dentry at the end of
- * d_subdirs. We need to assign it an appropriate
- * directory offset so we can behave when holding
- * I_COMPLETE.
- */
- ceph_set_dentry_offset(req->r_old_dentry);
- dout("dn %p gets new offset %lld\n", req->r_old_dentry,
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(dir);
+ ceph_dir_clear_complete(olddir);
+
+ dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
dn = req->r_old_dentry; /* use old_dentry */
- in = dn->d_inode;
}
/* null dentry? */
@@ -1081,107 +1234,87 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
}
/* attach proper inode */
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = dn->d_inode;
- if (!in) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_trace bad get_inode "
- "%llx.%llx\n", vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_delete(dn);
- goto done;
- }
- dn = splice_dentry(dn, in, &have_lease, true);
+ if (!dn->d_inode) {
+ ceph_dir_clear_complete(dir);
+ ihold(in);
+ dn = splice_dentry(dn, in, &have_lease);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
}
req->r_dentry = dn; /* may have spliced */
- igrab(in);
- } else if (ceph_ino(in) == vino.ino &&
- ceph_snap(in) == vino.snap) {
- igrab(in);
- } else {
+ } else if (dn->d_inode && dn->d_inode != in) {
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
- dn, in, ceph_ino(in), ceph_snap(in),
- vino.ino, vino.snap);
+ dn, dn->d_inode, ceph_vinop(dn->d_inode),
+ ceph_vinop(in));
have_lease = false;
- in = NULL;
}
if (have_lease)
update_dentry_lease(dn, rinfo->dlease, session,
req->r_request_started);
dout(" final dn %p\n", dn);
- i++;
- } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
- req->r_op == CEPH_MDS_OP_MKSNAP) {
+ } else if (!req->r_aborted &&
+ (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+ req->r_op == CEPH_MDS_OP_MKSNAP)) {
struct dentry *dn = req->r_dentry;
+ struct inode *dir = req->r_locked_dir;
/* fill out a snapdir LOOKUPSNAP dentry */
BUG_ON(!dn);
- BUG_ON(!req->r_locked_dir);
- BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_inode get_inode badness %llx.%llx\n",
- vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_delete(dn);
- goto done;
- }
+ BUG_ON(!dir);
+ BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
dout(" linking snapped dir %p to dn %p\n", in, dn);
- dn = splice_dentry(dn, in, NULL, true);
+ ceph_dir_clear_complete(dir);
+ ihold(in);
+ dn = splice_dentry(dn, in, NULL);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
}
req->r_dentry = dn; /* may have spliced */
- igrab(in);
- rinfo->head->is_dentry = 1; /* fool notrace handlers */
}
+done:
+ dout("fill_trace done err=%d\n", err);
+ return err;
+}
- if (rinfo->head->is_target) {
- vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+/*
+ * Prepopulate our cache with readdir results, leases, etc.
+ */
+static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ int i, err = 0;
- if (in == NULL || ceph_ino(in) != vino.ino ||
- ceph_snap(in) != vino.snap) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- err = PTR_ERR(in);
- goto done;
- }
- }
- req->r_target_inode = in;
+ for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_vino vino;
+ struct inode *in;
+ int rc;
- err = fill_inode(in,
- &rinfo->targeti, NULL,
- session, req->r_request_started,
- (le32_to_cpu(rinfo->head->result) == 0) ?
- req->r_fmode : -1,
- &req->r_caps_reservation);
- if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
- in, ceph_vinop(in));
- goto done;
+ vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
+ vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+
+ in = ceph_get_inode(req->r_dentry->d_sb, vino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ dout("new_inode badness got %d\n", err);
+ continue;
+ }
+ rc = fill_inode(in, &rinfo->dir_in[i], NULL, session,
+ req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (rc < 0) {
+ pr_err("fill_inode badness on %p got %d\n", in, rc);
+ err = rc;
+ continue;
}
}
-done:
- dout("fill_trace done err=%d\n", err);
return err;
}
-/*
- * Prepopulate our cache with readdir results, leases, etc.
- */
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
@@ -1190,11 +1323,26 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct qstr dname;
struct dentry *dn;
struct inode *in;
- int err = 0, i;
+ int err = 0, ret, i;
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
- u64 frag = le32_to_cpu(rhead->args.readdir.frag);
struct ceph_dentry_info *di;
+ u64 r_readdir_offset = req->r_readdir_offset;
+ u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+
+ if (rinfo->dir_dir &&
+ le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ dout("readdir_prepopulate got new frag %x -> %x\n",
+ frag, le32_to_cpu(rinfo->dir_dir->frag));
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (ceph_frag_is_leftmost(frag))
+ r_readdir_offset = 2;
+ else
+ r_readdir_offset = 0;
+ }
+
+ if (req->r_aborted)
+ return readdir_prepopulate_inodes_only(req, session);
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
snapdir = ceph_get_snapdir(parent->d_inode);
@@ -1208,6 +1356,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
}
+ /* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino;
@@ -1232,9 +1381,10 @@ retry_lookup:
err = -ENOMEM;
goto out;
}
- err = ceph_init_dentry(dn);
- if (err < 0) {
+ ret = ceph_init_dentry(dn);
+ if (ret < 0) {
dput(dn);
+ err = ret;
goto out;
}
} else if (dn->d_inode &&
@@ -1254,9 +1404,6 @@ retry_lookup:
spin_unlock(&parent->d_lock);
}
- di = dn->d_fsdata;
- di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
-
/* inode */
if (dn->d_inode) {
in = dn->d_inode;
@@ -1264,31 +1411,44 @@ retry_lookup:
in = ceph_get_inode(parent->d_sb, vino);
if (IS_ERR(in)) {
dout("new_inode badness\n");
- d_delete(dn);
+ d_drop(dn);
dput(dn);
err = PTR_ERR(in);
goto out;
}
- dn = splice_dentry(dn, in, NULL, false);
- if (IS_ERR(dn))
- dn = NULL;
}
if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation) < 0) {
pr_err("fill_inode badness on %p\n", in);
+ if (!dn->d_inode)
+ iput(in);
+ d_drop(dn);
goto next_item;
}
- if (dn)
- update_dentry_lease(dn, rinfo->dir_dlease[i],
- req->r_session,
- req->r_request_started);
+
+ if (!dn->d_inode) {
+ dn = splice_dentry(dn, in, NULL);
+ if (IS_ERR(dn)) {
+ err = PTR_ERR(dn);
+ dn = NULL;
+ goto next_item;
+ }
+ }
+
+ di = dn->d_fsdata;
+ di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+
+ update_dentry_lease(dn, rinfo->dir_dlease[i],
+ req->r_session,
+ req->r_request_started);
next_item:
if (dn)
dput(dn);
}
- req->r_did_prepopulate = true;
+ if (err == 0)
+ req->r_did_prepopulate = true;
out:
if (snapdir) {
@@ -1304,7 +1464,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
struct ceph_inode_info *ci = ceph_inode(inode);
int ret = 0;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
inode->i_size = size;
inode->i_blocks = (size + (1 << 9) - 1) >> 9;
@@ -1314,7 +1474,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
(ci->i_reported_size << 1) < ci->i_max_size)
ret = 1;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return ret;
}
@@ -1324,12 +1484,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
*/
void ceph_queue_writeback(struct inode *inode)
{
+ ihold(inode);
if (queue_work(ceph_inode_to_client(inode)->wb_wq,
&ceph_inode(inode)->i_wb_work)) {
dout("ceph_queue_writeback %p\n", inode);
- igrab(inode);
} else {
dout("ceph_queue_writeback %p failed\n", inode);
+ iput(inode);
}
}
@@ -1349,55 +1510,13 @@ static void ceph_writeback_work(struct work_struct *work)
*/
void ceph_queue_invalidate(struct inode *inode)
{
+ ihold(inode);
if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
&ceph_inode(inode)->i_pg_inv_work)) {
dout("ceph_queue_invalidate %p\n", inode);
- igrab(inode);
} else {
dout("ceph_queue_invalidate %p failed\n", inode);
- }
-}
-
-/*
- * invalidate any pages that are not dirty or under writeback. this
- * includes pages that are clean and mapped.
- */
-static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
-{
- struct pagevec pvec;
- pgoff_t next = 0;
- int i;
-
- pagevec_init(&pvec, 0);
- while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
- pgoff_t index;
- int skip_page =
- (PageDirty(page) || PageWriteback(page));
-
- if (!skip_page)
- skip_page = !trylock_page(page);
-
- /*
- * We really shouldn't be looking at the ->index of an
- * unlocked page. But we're not allowed to lock these
- * pages. So we rely upon nobody altering the ->index
- * of this (pinned-by-us) page.
- */
- index = page->index;
- if (index > next)
- next = index;
- next++;
-
- if (skip_page)
- continue;
-
- generic_error_remove_page(mapping, page);
- unlock_page(page);
- }
- pagevec_release(&pvec);
- cond_resched();
+ iput(inode);
}
}
@@ -1413,20 +1532,23 @@ static void ceph_invalidate_work(struct work_struct *work)
u32 orig_gen;
int check = 0;
- spin_lock(&inode->i_lock);
+ mutex_lock(&ci->i_truncate_mutex);
+ spin_lock(&ci->i_ceph_lock);
dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- /* nevermind! */
- spin_unlock(&inode->i_lock);
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
goto out;
}
orig_gen = ci->i_rdcache_gen;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
- ceph_invalidate_nondirty_pages(inode->i_mapping);
+ truncate_pagecache(inode, 0);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
orig_gen == ci->i_rdcache_revoking) {
dout("invalidate_pages %p gen %d successful\n", inode,
@@ -1437,18 +1559,20 @@ static void ceph_invalidate_work(struct work_struct *work)
dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
inode, orig_gen, ci->i_rdcache_gen,
ci->i_rdcache_revoking);
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
}
- spin_unlock(&inode->i_lock);
-
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
+out:
if (check)
ceph_check_caps(ci, 0, NULL);
-out:
iput(inode);
}
/*
- * called by trunc_wq; take i_mutex ourselves
+ * called by trunc_wq;
*
* We also truncate in a separate thread as well.
*/
@@ -1459,9 +1583,7 @@ static void ceph_vmtruncate_work(struct work_struct *work)
struct inode *inode = &ci->vfs_inode;
dout("vmtruncate_work %p\n", inode);
- mutex_lock(&inode->i_mutex);
__ceph_do_pending_vmtruncate(inode);
- mutex_unlock(&inode->i_mutex);
iput(inode);
}
@@ -1473,19 +1595,19 @@ void ceph_queue_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ ihold(inode);
+
if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
&ci->i_vmtruncate_work)) {
dout("ceph_queue_vmtruncate %p\n", inode);
- igrab(inode);
} else {
dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
inode, ci->i_truncate_pending);
+ iput(inode);
}
}
/*
- * called with i_mutex held.
- *
* Make sure any pending truncation is applied before doing anything
* that may depend on it.
*/
@@ -1493,13 +1615,15 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
- int wrbuffer_refs, wake = 0;
+ int wrbuffer_refs, finish = 0;
+ mutex_lock(&ci->i_truncate_mutex);
retry:
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) {
dout("__do_pending_vmtruncate %p none pending\n", inode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
return;
}
@@ -1510,32 +1634,39 @@ retry:
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
dout("__do_pending_vmtruncate %p flushing snaps first\n",
inode);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
}
+ /* there should be no reader or writer */
+ WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
+
to = ci->i_truncate_size;
wrbuffer_refs = ci->i_wrbuffer_ref;
dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
ci->i_truncate_pending, to);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
+
+ truncate_pagecache(inode, to);
- truncate_inode_pages(inode->i_mapping, to);
+ spin_lock(&ci->i_ceph_lock);
+ if (to == ci->i_truncate_size) {
+ ci->i_truncate_pending = 0;
+ finish = 1;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (!finish)
+ goto retry;
- spin_lock(&inode->i_lock);
- ci->i_truncate_pending--;
- if (ci->i_truncate_pending == 0)
- wake = 1;
- spin_unlock(&inode->i_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- if (wake)
- wake_up_all(&ci->i_cap_wq);
-}
+ wake_up_all(&ci->i_cap_wq);
+}
/*
* symlinks
@@ -1550,6 +1681,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
static const struct inode_operations ceph_symlink_iops = {
.readlink = generic_readlink,
.follow_link = ceph_sym_follow_link,
+ .setattr = ceph_setattr,
+ .getattr = ceph_getattr,
+ .setxattr = ceph_setxattr,
+ .getxattr = ceph_getxattr,
+ .listxattr = ceph_listxattr,
+ .removexattr = ceph_removexattr,
};
/*
@@ -1559,7 +1696,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode = dentry->d_parent->d_inode;
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1567,12 +1703,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
int release = 0, dirtied = 0;
int mask = 0;
int err = 0;
+ int inode_dirty_flags = 0;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- __ceph_do_pending_vmtruncate(inode);
-
err = inode_change_ok(inode, attr);
if (err != 0)
return err;
@@ -1582,32 +1717,36 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (IS_ERR(req))
return PTR_ERR(req);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ia_valid & ATTR_UID) {
dout("setattr %p uid %d -> %d\n", inode,
- inode->i_uid, attr->ia_uid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kuid(&init_user_ns, attr->ia_uid));
if (issued & CEPH_CAP_AUTH_EXCL) {
inode->i_uid = attr->ia_uid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- attr->ia_uid != inode->i_uid) {
- req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
+ !uid_eq(attr->ia_uid, inode->i_uid)) {
+ req->r_args.setattr.uid = cpu_to_le32(
+ from_kuid(&init_user_ns, attr->ia_uid));
mask |= CEPH_SETATTR_UID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_GID) {
dout("setattr %p gid %d -> %d\n", inode,
- inode->i_gid, attr->ia_gid);
+ from_kgid(&init_user_ns, inode->i_gid),
+ from_kgid(&init_user_ns, attr->ia_gid));
if (issued & CEPH_CAP_AUTH_EXCL) {
inode->i_gid = attr->ia_gid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- attr->ia_gid != inode->i_gid) {
- req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
+ !gid_eq(attr->ia_gid, inode->i_gid)) {
+ req->r_args.setattr.gid = cpu_to_le32(
+ from_kgid(&init_user_ns, attr->ia_gid));
mask |= CEPH_SETATTR_GID;
release |= CEPH_CAP_AUTH_SHARED;
}
@@ -1620,6 +1759,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
attr->ia_mode != inode->i_mode) {
+ inode->i_mode = attr->ia_mode;
req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
mask |= CEPH_SETATTR_MODE;
release |= CEPH_CAP_AUTH_SHARED;
@@ -1725,28 +1865,40 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
dout("setattr %p ATTR_FILE ... hrm!\n", inode);
if (dirtied) {
- __ceph_mark_dirty_caps(ci, dirtied);
+ inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
inode->i_ctime = CURRENT_TIME;
}
release &= issued;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (inode_dirty_flags)
+ __mark_inode_dirty(inode, inode_dirty_flags);
+
+ if (ia_valid & ATTR_MODE) {
+ err = posix_acl_chmod(inode, attr->ia_mode);
+ if (err)
+ goto out_put;
+ }
if (mask) {
- req->r_inode = igrab(inode);
+ req->r_inode = inode;
+ ihold(inode);
req->r_inode_drop = release;
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
}
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
- __ceph_do_pending_vmtruncate(inode);
+ if (mask & CEPH_SETATTR_SIZE)
+ __ceph_do_pending_vmtruncate(inode);
return err;
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
+out_put:
ceph_mdsc_put_request(req);
return err;
}
@@ -1774,7 +1926,8 @@ int ceph_do_getattr(struct inode *inode, int mask)
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
- req->r_inode = igrab(inode);
+ req->r_inode = inode;
+ ihold(inode);
req->r_num_caps = 1;
req->r_args.getattr.mask = cpu_to_le32(mask);
err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -1788,17 +1941,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
* Check inode permissions. We verify we have a valid value for
* the AUTH cap, then call the generic handler.
*/
-int ceph_permission(struct inode *inode, int mask, unsigned int flags)
+int ceph_permission(struct inode *inode, int mask)
{
int err;
- if (flags & IPERM_FLAG_RCU)
+ if (mask & MAY_NOT_BLOCK)
return -ECHILD;
err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
if (!err)
- err = generic_permission(inode, mask, flags, NULL);
+ err = generic_permission(inode, mask);
return err;
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8888c9ba68d..a822a6e5829 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,9 +1,8 @@
+#include <linux/ceph/ceph_debug.h>
#include <linux/in.h>
#include "super.h"
#include "mds_client.h"
-#include <linux/ceph/ceph_debug.h>
-
#include "ioctl.h"
@@ -16,18 +15,17 @@
*/
static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
{
- struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+ struct ceph_inode_info *ci = ceph_inode(file_inode(file));
struct ceph_ioctl_layout l;
int err;
- err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
if (!err) {
l.stripe_unit = ceph_file_layout_su(ci->i_layout);
l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
l.object_size = ceph_file_layout_object_size(ci->i_layout);
l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
- l.preferred_osd =
- (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
+ l.preferred_osd = (s32)-1;
if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT;
}
@@ -35,45 +33,84 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
return err;
}
+static long __validate_layout(struct ceph_mds_client *mdsc,
+ struct ceph_ioctl_layout *l)
+{
+ int i, err;
+
+ /* validate striping parameters */
+ if ((l->object_size & ~PAGE_MASK) ||
+ (l->stripe_unit & ~PAGE_MASK) ||
+ (l->stripe_unit != 0 &&
+ ((unsigned)l->object_size % (unsigned)l->stripe_unit)))
+ return -EINVAL;
+
+ /* make sure it's a valid data pool */
+ mutex_lock(&mdsc->mutex);
+ err = -EINVAL;
+ for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+ if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) {
+ err = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (err)
+ return err;
+
+ return 0;
+}
+
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
- struct inode *inode = file->f_dentry->d_inode;
- struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
- int err, i;
+ struct ceph_inode_info *ci = ceph_inode(file_inode(file));
+ struct ceph_ioctl_layout nl;
+ int err;
- /* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
return -EFAULT;
- if ((l.object_size & ~PAGE_MASK) ||
- (l.stripe_unit & ~PAGE_MASK) ||
- !l.stripe_unit ||
- (l.object_size &&
- (unsigned)l.object_size % (unsigned)l.stripe_unit))
- return -EINVAL;
-
- /* make sure it's a valid data pool */
- if (l.data_pool > 0) {
- mutex_lock(&mdsc->mutex);
- err = -EINVAL;
- for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
- if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
- err = 0;
- break;
- }
- mutex_unlock(&mdsc->mutex);
- if (err)
- return err;
- }
+ /* validate changed params against current layout */
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
+ if (err)
+ return err;
+
+ memset(&nl, 0, sizeof(nl));
+ if (l.stripe_count)
+ nl.stripe_count = l.stripe_count;
+ else
+ nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ if (l.stripe_unit)
+ nl.stripe_unit = l.stripe_unit;
+ else
+ nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ if (l.object_size)
+ nl.object_size = l.object_size;
+ else
+ nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ if (l.data_pool)
+ nl.data_pool = l.data_pool;
+ else
+ nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+
+ /* this is obsolete, and always -1 */
+ nl.preferred_osd = le64_to_cpu(-1);
+
+ err = __validate_layout(mdsc, &nl);
+ if (err)
+ return err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
- req->r_inode = igrab(inode);
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+
req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
req->r_args.setlayout.layout.fl_stripe_unit =
@@ -83,10 +120,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
req->r_args.setlayout.layout.fl_object_size =
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
- req->r_args.setlayout.layout.fl_pg_preferred =
- cpu_to_le32(l.preferred_osd);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
@@ -99,43 +134,28 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
*/
static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
- int err, i;
+ int err;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
/* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
return -EFAULT;
- if ((l.object_size & ~PAGE_MASK) ||
- (l.stripe_unit & ~PAGE_MASK) ||
- !l.stripe_unit ||
- (l.object_size &&
- (unsigned)l.object_size % (unsigned)l.stripe_unit))
- return -EINVAL;
-
- /* make sure it's a valid data pool */
- if (l.data_pool > 0) {
- mutex_lock(&mdsc->mutex);
- err = -EINVAL;
- for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
- if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
- err = 0;
- break;
- }
- mutex_unlock(&mdsc->mutex);
- if (err)
- return err;
- }
+ err = __validate_layout(mdsc, &l);
+ if (err)
+ return err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
- req->r_inode = igrab(inode);
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
req->r_args.setlayout.layout.fl_stripe_unit =
cpu_to_le32(l.stripe_unit);
@@ -145,8 +165,6 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool =
cpu_to_le32(l.data_pool);
- req->r_args.setlayout.layout.fl_pg_preferred =
- cpu_to_le32(l.preferred_osd);
err = ceph_mdsc_do_request(mdsc, inode, req);
ceph_mdsc_put_request(req);
@@ -160,22 +178,29 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
{
struct ceph_ioctl_dataloc dl;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
+ struct ceph_object_locator oloc;
+ struct ceph_object_id oid;
u64 len = 1, olen;
u64 tmp;
- struct ceph_object_layout ol;
struct ceph_pg pgid;
+ int r;
/* copy and validate */
if (copy_from_user(&dl, arg, sizeof(dl)))
return -EFAULT;
down_read(&osdc->map_sem);
- ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
- &dl.object_no, &dl.object_offset, &olen);
+ r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
+ &dl.object_no, &dl.object_offset,
+ &olen);
+ if (r < 0) {
+ up_read(&osdc->map_sem);
+ return -EIO;
+ }
dl.file_offset -= dl.object_offset;
dl.object_size = ceph_file_layout_object_size(ci->i_layout);
dl.block_size = ceph_file_layout_su(ci->i_layout);
@@ -186,10 +211,16 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
- osdc->osdmap);
- pgid = ol.ol_pgid;
+ oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
+ ceph_oid_set_name(&oid, dl.object_name);
+
+ r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
+ if (r < 0) {
+ up_read(&osdc->map_sem);
+ return r;
+ }
+
dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
if (dl.osd >= 0) {
struct ceph_entity_addr *a =
@@ -211,15 +242,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
static long ceph_ioctl_lazyio(struct file *file)
{
struct ceph_file_info *fi = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_nr_by_mode[fi->fmode]--;
fi->fmode |= CEPH_FILE_MODE_LAZY;
ci->i_nr_by_mode[fi->fmode]++;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
ceph_check_caps(ci, 0, NULL);
@@ -229,6 +260,14 @@ static long ceph_ioctl_lazyio(struct file *file)
return 0;
}
+static long ceph_ioctl_syncio(struct file *file)
+{
+ struct ceph_file_info *fi = file->private_data;
+
+ fi->flags |= CEPH_F_SYNC;
+ return 0;
+}
+
long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -247,6 +286,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case CEPH_IOC_LAZYIO:
return ceph_ioctl_lazyio(file);
+
+ case CEPH_IOC_SYNCIO:
+ return ceph_ioctl_syncio(file);
}
return -ENOTTY;
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 52e8fd74d45..c77028afb1e 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -6,10 +6,36 @@
#define CEPH_IOCTL_MAGIC 0x97
-/* just use u64 to align sanely on all archs */
+/*
+ * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
+ * CEPH_IOC_SET_LAYOUT - set file layout
+ * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
+ *
+ * The file layout specifies how file data is striped over objects in
+ * the distributed object store, which object pool they belong to (if
+ * it differs from the default), and an optional 'preferred osd' to
+ * store them on.
+ *
+ * Files get a new layout based on the policy set on the containing
+ * directory or one of its ancestors. The GET_LAYOUT ioctl will let
+ * you examine the layout for a file or the policy on a directory.
+ *
+ * SET_LAYOUT will let you set a layout on a newly created file. This
+ * only works immediately after the file is created and before any
+ * data is written to it.
+ *
+ * SET_LAYOUT_POLICY will let you set a layout policy (default layout)
+ * on a directory that will apply to any new files created in that
+ * directory (or any child directory that doesn't specify a layout of
+ * its own).
+ */
+
+/* use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
__u64 stripe_unit, stripe_count, object_size;
__u64 data_pool;
+
+ /* obsolete. new values ignored, always return -1 */
__s64 preferred_osd;
};
@@ -21,6 +47,8 @@ struct ceph_ioctl_layout {
struct ceph_ioctl_layout)
/*
+ * CEPH_IOC_GET_DATALOC - get location of file data in the cluster
+ *
* Extract identity, address of the OSD and object storing a given
* file offset.
*/
@@ -39,6 +67,34 @@ struct ceph_ioctl_dataloc {
#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
struct ceph_ioctl_dataloc)
+/*
+ * CEPH_IOC_LAZYIO - relax consistency
+ *
+ * Normally Ceph switches to synchronous IO when multiple clients have
+ * the file open (and one or more for write). Reads and writes bypass the
+ * page cache and go directly to the OSD. Setting this flag on a file
+ * descriptor will allow buffered IO for this file in cases where the
+ * application knows it won't interfere with other nodes (or doesn't
+ * care).
+ */
#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
+/*
+ * CEPH_IOC_SYNCIO - force synchronous IO
+ *
+ * This ioctl sets a file flag that forces the synchronous IO that
+ * bypasses the page cache, even if it is not necessary. This is
+ * essentially the opposite behavior of IOC_LAZYIO. This forces the
+ * same read/write path as a file opened by multiple clients when one
+ * or more of those clients is opened for write.
+ *
+ * Note that this type of sync IO takes a different path than a file
+ * opened with O_SYNC/D_SYNC (writes hit the page cache and are
+ * immediately flushed on page boundaries). It is very similar to
+ * O_DIRECT (writes bypass the page cache) except that O_DIRECT writes
+ * are not copied (user page must remain stable) and O_DIRECT writes
+ * have alignment restrictions (on the buffer and file offset).
+ */
+#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
+
#endif
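The comment blocks added above document the client-visible ioctl interface. As a hypothetical userspace sketch (not part of this patch; the include path and error handling are assumptions, and the fields printed are taken from the struct defined above), the layout query and the LAZYIO/SYNCIO toggles might be exercised like this:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* struct ceph_ioctl_layout, CEPH_IOC_* from the header above */

int main(int argc, char **argv)
{
	struct ceph_ioctl_layout l;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* read back the file's striping parameters */
	if (ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) == 0)
		printf("stripe_unit=%llu stripe_count=%llu object_size=%llu pool=%llu\n",
		       (unsigned long long)l.stripe_unit,
		       (unsigned long long)l.stripe_count,
		       (unsigned long long)l.object_size,
		       (unsigned long long)l.data_pool);

	/* relax consistency on this descriptor when no other node will interfere ... */
	ioctl(fd, CEPH_IOC_LAZYIO);
	/* ... or, conversely, force the shared-writer synchronous path */
	ioctl(fd, CEPH_IOC_SYNCIO);

	close(fd);
	return 0;
}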
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 476b329867d..fbc39c47bac 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -2,28 +2,50 @@
#include <linux/file.h>
#include <linux/namei.h>
+#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
#include <linux/ceph/pagelist.h>
+static u64 lock_secret;
+
+static inline u64 secure_addr(void *addr)
+{
+ u64 v = lock_secret ^ (u64)(unsigned long)addr;
+ /*
+ * Set the most significant bit, so that MDS knows the 'owner'
+ * is sufficient to identify the owner of lock. (old code uses
+ * both 'owner' and 'pid')
+ */
+ v |= (1ULL << 63);
+ return v;
+}
+
+void __init ceph_flock_init(void)
+{
+ get_random_bytes(&lock_secret, sizeof(lock_secret));
+}
+
/**
* Implement fcntl and flock locking functions.
*/
static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
int cmd, u8 wait, struct file_lock *fl)
{
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct inode *inode = file_inode(file);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
u64 length = 0;
+ u64 owner;
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
- req->r_inode = igrab(inode);
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
/* mds requires start and length rather than start and end */
if (LLONG_MAX == fl->fl_end)
@@ -31,26 +53,24 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
else
length = fl->fl_end - fl->fl_start + 1;
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type`: %d", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type);
+ owner = secure_addr(fl->fl_owner);
+ dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
+ "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
+ (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
+ wait, fl->fl_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
+ req->r_args.filelock_change.owner = cpu_to_le64(owner);
req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
- /* This should be adjusted, but I'm not sure if
- namespaces actually get id numbers*/
- req->r_args.filelock_change.pid_namespace =
- cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
err = ceph_mdsc_do_request(mdsc, inode, req);
- if ( operation == CEPH_MDS_OP_GETFILELOCK){
+ if (operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
fl->fl_type = F_RDLCK;
@@ -70,7 +90,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
}
ceph_mdsc_put_request(req);
dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
+ "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
(int)operation, (u64)fl->fl_pid, fl->fl_start,
length, wait, fl->fl_type, err);
return err;
@@ -87,14 +107,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
u8 wait = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_lock, fl_pid:%d", fl->fl_pid);
+ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ dout("ceph_lock, fl_owner: %p", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
- if (F_SETLKW == cmd)
- wait = 1;
- if (F_GETLK == cmd)
+ if (IS_GETLK(cmd))
op = CEPH_MDS_OP_GETFILELOCK;
+ else if (IS_SETLKW(cmd))
+ wait = 1;
if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
@@ -105,20 +130,24 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
if (!err) {
- if ( op != CEPH_MDS_OP_GETFILELOCK ){
+ if (op != CEPH_MDS_OP_GETFILELOCK) {
dout("mds locked, locking locally");
err = posix_lock_file(file, fl, NULL);
if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
- /* undo! This should only happen if the kernel detects
- * local deadlock. */
+ /* undo! This should only happen if
+ * the kernel detects local
+ * deadlock. */
ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on posix_lock_file, undid lock", err);
+ dout("got %d on posix_lock_file, undid lock",
+ err);
}
}
- } else {
- dout("mds returned error code %d", err);
+ } else if (err == -ERESTARTSYS) {
+ dout("undoing lock\n");
+ ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+ CEPH_LOCK_UNLOCK, 0, fl);
}
return err;
}
@@ -127,20 +156,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
u8 lock_cmd;
int err;
- u8 wait = 1;
-
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_flock, fl_pid:%d", fl->fl_pid);
-
- /* set wait bit, then clear it out of cmd*/
- if (cmd & LOCK_NB)
- wait = 0;
- cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
- /* set command sequence that Ceph wants to see:
- shared lock, exclusive lock, or unlock */
- if (LOCK_SH == cmd)
+ u8 wait = 0;
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ dout("ceph_flock, fl_file: %p", fl->fl_file);
+
+ if (IS_SETLKW(cmd))
+ wait = 1;
+
+ if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
- else if (LOCK_EX == cmd)
+ else if (F_WRLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
@@ -155,14 +186,17 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
file, CEPH_LOCK_UNLOCK, 0, fl);
dout("got %d on flock_lock_file_wait, undid lock", err);
}
- } else {
- dout("mds error code %d", err);
+ } else if (err == -ERESTARTSYS) {
+ dout("undoing lock\n");
+ ceph_lock_message(CEPH_LOCK_FLOCK,
+ CEPH_MDS_OP_SETFILELOCK,
+ file, CEPH_LOCK_UNLOCK, 0, fl);
}
return err;
}
/**
- * Must be called with BKL already held. Fills in the passed
+ * Must be called with lock_flocks() already held. Fills in the passed
* counter variables, so you can prepare pagelist metadata before calling
* ceph_encode_locks.
*/
@@ -184,27 +218,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
}
/**
- * Encode the flock and fcntl locks for the given inode into the pagelist.
- * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
- * sequential flock locks.
- * Must be called with lock_flocks() already held.
- * If we encounter more of a specific lock type than expected,
- * we return the value 1.
+ * Encode the flock and fcntl locks for the given inode into the ceph_filelock
+ * array. Must be called with inode->i_lock already held.
+ * If we encounter more of a specific lock type than expected, return -ENOSPC.
*/
-int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
- int num_fcntl_locks, int num_flock_locks)
+int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
- struct ceph_filelock cephlock;
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
+ int l = 0;
dout("encoding %d flock and %d fcntl locks", num_flock_locks,
num_fcntl_locks);
- err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
- if (err)
- goto fail;
+
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_POSIX) {
++seen_fcntl;
@@ -212,19 +242,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &cephlock);
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
+ ++l;
}
- if (err)
- goto fail;
}
-
- err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
- if (err)
- goto fail;
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_FLOCK) {
++seen_flock;
@@ -232,19 +255,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &cephlock);
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
if (err)
goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
+ ++l;
}
- if (err)
- goto fail;
}
fail:
return err;
}
+/**
+ * Copy the encoded flock and fcntl locks into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Returns zero on success.
+ */
+int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks)
+{
+ int err = 0;
+ __le32 nlocks;
+
+ nlocks = cpu_to_le32(num_fcntl_locks);
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ err = ceph_pagelist_append(pagelist, flocks,
+ num_fcntl_locks * sizeof(*flocks));
+ if (err)
+ goto out_fail;
+
+ nlocks = cpu_to_le32(num_flock_locks);
+ err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+ if (err)
+ goto out_fail;
+
+ err = ceph_pagelist_append(pagelist,
+ &flocks[num_fcntl_locks],
+ num_flock_locks * sizeof(*flocks));
+out_fail:
+ return err;
+}
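/*
 * A minimal sketch (an assumption, not part of this patch): the format
 * described above implies the total pagelist space consumed by
 * ceph_locks_to_pagelist() -- two __le32 counts plus one flat array of
 * ceph_filelock records, fcntl locks first, then flock locks.
 */
static size_t ceph_locks_pagelist_size(int num_fcntl_locks, int num_flock_locks)
{
	return 2 * sizeof(__le32) +
	       (num_fcntl_locks + num_flock_locks) * sizeof(struct ceph_filelock);
}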
+
/*
* Given a pointer to a lock, convert it to a ceph filelock
*/
@@ -252,13 +307,11 @@ int lock_to_ceph_filelock(struct file_lock *lock,
struct ceph_filelock *cephlock)
{
int err = 0;
-
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
- cephlock->pid = cpu_to_le64(lock->fl_pid);
- cephlock->pid_namespace =
- cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
+ cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
+ cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
switch (lock->fl_type) {
case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a1ee8fa3a8e..92a2548278f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
#include <linux/fs.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -10,6 +11,7 @@
#include "super.h"
#include "mds_client.h"
+#include <linux/ceph/ceph_features.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
#include <linux/ceph/pagelist.h>
@@ -42,6 +44,7 @@
*/
struct ceph_reconnect_state {
+ int nr_caps;
struct ceph_pagelist *pagelist;
bool flock;
};
@@ -61,7 +64,7 @@ static const struct ceph_connection_operations mds_con_ops;
*/
static int parse_reply_info_in(void **p, void *end,
struct ceph_mds_reply_info_in *info,
- int features)
+ u64 features)
{
int err = -EIO;
@@ -96,7 +99,7 @@ bad:
*/
static int parse_reply_info_trace(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
int err;
@@ -143,7 +146,7 @@ out_bad:
*/
static int parse_reply_info_dir(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
u32 num, i = 0;
int err;
@@ -163,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end,
if (num == 0)
goto done;
- /* alloc large array */
- info->dir_nr = num;
- info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
- sizeof(*info->dir_dname) +
- sizeof(*info->dir_dname_len) +
- sizeof(*info->dir_dlease),
- GFP_NOFS);
- if (info->dir_in == NULL) {
- err = -ENOMEM;
- goto out_bad;
- }
+ BUG_ON(!info->dir_in);
info->dir_dname = (void *)(info->dir_in + num);
info->dir_dname_len = (void *)(info->dir_dname + num);
info->dir_dlease = (void *)(info->dir_dname_len + num);
+ if ((unsigned long)(info->dir_dlease + num) >
+ (unsigned long)info->dir_in + info->dir_buf_size) {
+ pr_err("dir contents are larger than expected\n");
+ WARN_ON(1);
+ goto bad;
+ }
+ info->dir_nr = num;
while (num) {
/* dentry */
ceph_decode_need(p, end, sizeof(u32)*2, bad);
@@ -215,7 +215,7 @@ out_bad:
*/
static int parse_reply_info_filelock(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
if (*p + sizeof(*info->filelock_reply) > end)
goto bad;
@@ -232,16 +232,45 @@ bad:
}
/*
+ * parse create results
+ */
+static int parse_reply_info_create(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+ if (*p == end) {
+ info->has_create_ino = false;
+ } else {
+ info->has_create_ino = true;
+ info->ino = ceph_decode_64(p);
+ }
+ }
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ return -EIO;
+}
+
+/*
* parse extra results
*/
static int parse_reply_info_extra(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
- else
+ else if (info->head->op == CEPH_MDS_OP_READDIR ||
+ info->head->op == CEPH_MDS_OP_LSSNAP)
return parse_reply_info_dir(p, end, info, features);
+ else if (info->head->op == CEPH_MDS_OP_CREATE)
+ return parse_reply_info_create(p, end, info, features);
+ else
+ return -EIO;
}
/*
@@ -249,7 +278,7 @@ static int parse_reply_info_extra(void **p, void *end,
*/
static int parse_reply_info(struct ceph_msg *msg,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
void *p, *end;
u32 len;
@@ -262,6 +291,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* trace */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_trace(&p, p+len, info, features);
if (err < 0)
goto out_bad;
@@ -270,6 +300,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* extra */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_extra(&p, p+len, info, features);
if (err < 0)
goto out_bad;
@@ -294,7 +325,9 @@ out_bad:
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
- kfree(info->dir_in);
+ if (!info->dir_in)
+ return;
+ free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
}
@@ -332,10 +365,10 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
dout("mdsc put_session %p %d -> %d\n", s,
atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
if (atomic_dec_and_test(&s->s_ref)) {
- if (s->s_authorizer)
- s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
- s->s_mdsc->fsc->client->monc.auth,
- s->s_authorizer);
+ if (s->s_auth.authorizer)
+ ceph_auth_destroy_authorizer(
+ s->s_mdsc->fsc->client->monc.auth,
+ s->s_auth.authorizer);
kfree(s);
}
}
@@ -382,6 +415,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *s;
+ if (mds >= mdsc->mdsmap->m_max_mds)
+ return ERR_PTR(-EINVAL);
+
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s)
return ERR_PTR(-ENOMEM);
@@ -392,15 +428,13 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_seq = 0;
mutex_init(&s->s_mutex);
- ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
- s->s_con.private = s;
- s->s_con.ops = &mds_con_ops;
- s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
- s->s_con.peer_name.num = cpu_to_le64(mds);
+ ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
- spin_lock_init(&s->s_cap_lock);
+ spin_lock_init(&s->s_gen_ttl_lock);
s->s_cap_gen = 0;
- s->s_cap_ttl = 0;
+ s->s_cap_ttl = jiffies - 1;
+
+ spin_lock_init(&s->s_cap_lock);
s->s_renew_requested = 0;
s->s_renew_seq = 0;
INIT_LIST_HEAD(&s->s_caps);
@@ -410,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
s->s_num_cap_releases = 0;
+ s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_LIST_HEAD(&s->s_cap_releases_done);
@@ -436,7 +471,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
mdsc->sessions[mds] = s;
atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
- ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
+ ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
return s;
@@ -476,29 +512,33 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
+ destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
- if (req->r_reply) {
+ if (req->r_reply)
ceph_msg_put(req->r_reply);
- destroy_reply_info(&req->r_reply_info);
- }
if (req->r_inode) {
- ceph_put_cap_refs(ceph_inode(req->r_inode),
- CEPH_CAP_PIN);
+ ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
iput(req->r_inode);
}
if (req->r_locked_dir)
- ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
- CEPH_CAP_PIN);
+ ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
if (req->r_target_inode)
iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
- if (req->r_old_dentry) {
- ceph_put_cap_refs(
- ceph_inode(req->r_old_dentry->d_parent->d_inode),
- CEPH_CAP_PIN);
+ if (req->r_old_dentry)
dput(req->r_old_dentry);
+ if (req->r_old_dentry_dir) {
+ /*
+ * track (and drop pins for) r_old_dentry_dir
+ * separately, since r_old_dentry's d_parent may have
+ * changed between the dir mutex being dropped and
+ * this request being freed.
+ */
+ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
+ CEPH_CAP_PIN);
+ iput(req->r_old_dentry_dir);
}
kfree(req->r_path1);
kfree(req->r_path2);
@@ -578,6 +618,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
+ ihold(dir);
spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
@@ -598,8 +639,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
+
+ iput(req->r_unsafe_dir);
+ req->r_unsafe_dir = NULL;
}
+ complete_all(&req->r_safe_completion);
+
ceph_mdsc_put_request(req);
}
@@ -611,8 +657,14 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
*
* Called under mdsc->mutex.
*/
-struct dentry *get_nonsnap_parent(struct dentry *dentry)
+static struct dentry *get_nonsnap_parent(struct dentry *dentry)
{
+ /*
+ * we don't need to worry about protecting the d_parent access
+ * here because we never renaming inside the snapped namespace
+	 * here because we never rename inside the snapped namespace
+ * result is a valid result.
+ */
while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
dentry = dentry->d_parent;
return dentry;
@@ -648,7 +700,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (req->r_inode) {
inode = req->r_inode;
} else if (req->r_dentry) {
- struct inode *dir = req->r_dentry->d_parent->d_inode;
+ /* ignore race with rename; old or new d_parent is okay */
+ struct dentry *parent = req->r_dentry->d_parent;
+ struct inode *dir = parent->d_inode;
if (dir->i_sb != mdsc->fsc->sb) {
/* not this fs! */
@@ -656,18 +710,18 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
} else if (ceph_snap(dir) != CEPH_NOSNAP) {
/* direct snapped/virtual snapdir requests
* based on parent dir inode */
- struct dentry *dn =
- get_nonsnap_parent(req->r_dentry->d_parent);
+ struct dentry *dn = get_nonsnap_parent(parent);
inode = dn->d_inode;
dout("__choose_mds using nonsnap parent %p\n", inode);
- } else if (req->r_dentry->d_inode) {
+ } else {
/* dentry target */
inode = req->r_dentry->d_inode;
- } else {
- /* dir + name */
- inode = dir;
- hash = ceph_dentry_hash(req->r_dentry);
- is_hash = true;
+ if (!inode || mode == USE_AUTH_MDS) {
+ /* dir + name */
+ inode = dir;
+ hash = ceph_dentry_hash(dir, req->r_dentry);
+ is_hash = true;
+ }
}
}
@@ -717,21 +771,21 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
}
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap = NULL;
if (mode == USE_AUTH_MDS)
cap = ci->i_auth_cap;
if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
if (!cap) {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
goto random;
}
mds = cap->session->s_mds;
dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
inode, ceph_vinop(inode), mds,
cap == ci->i_auth_cap ? "auth " : "", cap);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return mds;
random:
@@ -749,7 +803,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
+ false);
if (!msg) {
pr_err("create_session_msg ENOMEM creating msg\n");
return NULL;
@@ -792,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc,
*
* called under mdsc->mutex
*/
+static struct ceph_mds_session *
+__open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+
+ session = __ceph_lookup_mds_session(mdsc, target);
+ if (!session) {
+ session = register_session(mdsc, target);
+ if (IS_ERR(session))
+ return session;
+ }
+ if (session->s_state == CEPH_MDS_SESSION_NEW ||
+ session->s_state == CEPH_MDS_SESSION_CLOSING)
+ __open_session(mdsc, session);
+
+ return session;
+}
+
+struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+
+ dout("open_export_target_session to mds%d\n", target);
+
+ mutex_lock(&mdsc->mutex);
+ session = __open_export_target_session(mdsc, target);
+ mutex_unlock(&mdsc->mutex);
+
+ return session;
+}
+
static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_mds_info *mi;
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
- int target;
if (mds >= mdsc->mdsmap->m_max_mds)
return;
+
mi = &mdsc->mdsmap->m_info[mds];
dout("open_export_target_sessions for mds%d (%d targets)\n",
session->s_mds, mi->num_export_targets);
for (i = 0; i < mi->num_export_targets; i++) {
- target = mi->export_targets[i];
- ts = __ceph_lookup_mds_session(mdsc, target);
- if (!ts) {
- ts = register_session(mdsc, target);
- if (IS_ERR(ts))
- return;
- }
- if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
- else
- dout(" mds%d target mds%d %p is %s\n", session->s_mds,
- i, ts, session_state_name(ts->s_state));
- ceph_put_mds_session(ts);
+ ts = __open_export_target_session(mdsc, mi->export_targets[i]);
+ if (!IS_ERR(ts))
+ ceph_put_mds_session(ts);
}
}
@@ -935,8 +1011,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
- spin_lock(&inode->i_lock);
- __ceph_remove_cap(cap);
+ spin_lock(&ci->i_ceph_lock);
+ __ceph_remove_cap(cap, false);
if (!__ceph_is_any_real_caps(ci)) {
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -968,7 +1044,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
spin_unlock(&mdsc->cap_dirty_lock);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
while (drop--)
iput(inode);
return 0;
@@ -981,6 +1057,37 @@ static void remove_session_caps(struct ceph_mds_session *session)
{
dout("remove_session_caps on %p\n", session);
iterate_session_caps(session, remove_session_caps_cb, NULL);
+
+ spin_lock(&session->s_cap_lock);
+ if (session->s_nr_caps > 0) {
+ struct super_block *sb = session->s_mdsc->fsc->sb;
+ struct inode *inode;
+ struct ceph_cap *cap, *prev = NULL;
+ struct ceph_vino vino;
+ /*
+ * iterate_session_caps() skips inodes that are being
+	 * deleted; we need to wait until deletions are complete.
+ * __wait_on_freeing_inode() is designed for the job,
+ * but it is not exported, so use lookup inode function
+ * to access it.
+ */
+ while (!list_empty(&session->s_caps)) {
+ cap = list_entry(session->s_caps.next,
+ struct ceph_cap, session_caps);
+ if (cap == prev)
+ break;
+ prev = cap;
+ vino = cap->ci->i_vino;
+ spin_unlock(&session->s_cap_lock);
+
+ inode = ceph_find_inode(sb, vino);
+ iput(inode);
+
+ spin_lock(&session->s_cap_lock);
+ }
+ }
+ spin_unlock(&session->s_cap_lock);
+
BUG_ON(session->s_nr_caps > 0);
BUG_ON(!list_empty(&session->s_cap_flushing));
cleanup_cap_releases(session);
@@ -999,10 +1106,10 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
wake_up_all(&ci->i_cap_wq);
if (arg) {
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
return 0;
}
@@ -1051,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
return 0;
}
+static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session, u64 seq)
+{
+ struct ceph_msg *msg;
+
+	dout("send_flushmsg_ack to mds%d (%s) seq %lld\n",
+ session->s_mds, session_state_name(session->s_state), seq);
+ msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+ if (!msg)
+ return -ENOMEM;
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+
/*
* Note new cap ttl, and any transition from stale -> not stale (fresh?).
*
@@ -1063,8 +1185,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
int wake = 0;
spin_lock(&session->s_cap_lock);
- was_stale = is_renew && (session->s_cap_ttl == 0 ||
- time_after_eq(jiffies, session->s_cap_ttl));
+ was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
session->s_cap_ttl = session->s_renew_requested +
mdsc->mdsmap->m_session_timeout*HZ;
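The simplified staleness test above works because s_cap_ttl is now initialized to jiffies - 1 in register_session(), so a fresh session always reads as already expired. A minimal userspace sketch of the wrap-safe comparison behind time_after_eq(), using a 32-bit counter as a stand-in for jiffies (illustrative only, not the kernel macro):

#include <stdint.h>
#include <stdio.h>

/* "a is at or after b", correct even when the counter has wrapped around */
static int time_after_eq32(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	uint32_t now = UINT32_MAX - 10;		/* just before the counter wraps */
	uint32_t ttl = 5;			/* expiry lands after the wrap */

	/* ttl is still in the future, even though now > ttl numerically */
	printf("stale=%d\n", time_after_eq32(now, ttl));	/* prints 0 */
	return 0;
}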
@@ -1130,31 +1251,36 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
{
struct ceph_mds_session *session = arg;
struct ceph_inode_info *ci = ceph_inode(inode);
- int used, oissued, mine;
+ int used, wanted, oissued, mine;
if (session->s_trim_caps <= 0)
return -1;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
mine = cap->issued | cap->implemented;
used = __ceph_caps_used(ci);
+ wanted = __ceph_caps_file_wanted(ci);
oissued = __ceph_caps_issued_other(ci, cap);
- dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
+ dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
- ceph_cap_string(used));
- if (ci->i_dirty_caps)
- goto out; /* dirty caps */
- if ((used & ~oissued) & mine)
+ ceph_cap_string(used), ceph_cap_string(wanted));
+ if (cap == ci->i_auth_cap) {
+ if (ci->i_dirty_caps | ci->i_flushing_caps)
+ goto out;
+ if ((used | wanted) & CEPH_CAP_ANY_WR)
+ goto out;
+ }
+ if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
session->s_trim_caps--;
if (oissued) {
/* we aren't the only cap.. just remove us */
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, true);
} else {
/* try to drop referring dentries */
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
d_prune_aliases(inode);
dout("trim_caps_cb %p cap %p pruned, count now %d\n",
inode, cap, atomic_read(&inode->i_count));
@@ -1162,7 +1288,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
}
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return 0;
}
@@ -1185,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
trim_caps - session->s_trim_caps);
session->s_trim_caps = 0;
}
+
+ ceph_add_cap_releases(mdsc, session);
+ ceph_send_cap_releases(mdsc, session);
return 0;
}
@@ -1225,7 +1354,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
while (session->s_num_cap_releases < session->s_nr_caps + extra) {
spin_unlock(&session->s_cap_lock);
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
- GFP_NOFS);
+ GFP_NOFS, false);
if (!msg)
goto out_unlocked;
dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1280,7 +1409,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
i_flushing_item);
struct inode *inode = &ci->vfs_inode;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_cap_flush_seq <= want_flush_seq) {
dout("check_cap_flush still flushing %p "
"seq %lld <= %lld to mds%d\n", inode,
@@ -1288,7 +1417,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
session->s_mds);
ret = 0;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
@@ -1334,16 +1463,19 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
unsigned num;
dout("discard_cap_releases mds%d\n", session->s_mds);
- spin_lock(&session->s_cap_lock);
- /* zero out the in-progress message */
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
- head->num = cpu_to_le32(0);
- session->s_num_cap_releases += num;
+ if (!list_empty(&session->s_cap_releases)) {
+ /* zero out the in-progress message */
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+ head = msg->front.iov_base;
+ num = le32_to_cpu(head->num);
+ dout("discard_cap_releases mds%d %p %u\n",
+ session->s_mds, msg, num);
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ session->s_num_cap_releases += num;
+ }
/* requeue completed messages */
while (!list_empty(&session->s_cap_releases_done)) {
@@ -1360,14 +1492,49 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
msg->front.iov_len = sizeof(*head);
list_add(&msg->list_head, &session->s_cap_releases);
}
-
- spin_unlock(&session->s_cap_lock);
}
/*
* requests
*/
+int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
+ size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
+ sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
+ int order, num_entries;
+
+ spin_lock(&ci->i_ceph_lock);
+ num_entries = ci->i_files + ci->i_subdirs;
+ spin_unlock(&ci->i_ceph_lock);
+ num_entries = max(num_entries, 1);
+ num_entries = min(num_entries, opt->max_readdir);
+
+ order = get_order(size * num_entries);
+ while (order >= 0) {
+ rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
+ order);
+ if (rinfo->dir_in)
+ break;
+ order--;
+ }
+ if (!rinfo->dir_in)
+ return -ENOMEM;
+
+ num_entries = (PAGE_SIZE << order) / size;
+ num_entries = min(num_entries, opt->max_readdir);
+
+ rinfo->dir_buf_size = PAGE_SIZE << order;
+ req->r_num_caps = num_entries + 1;
+ req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
+ req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
+ return 0;
+}
+
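ceph_alloc_readdir_reply_buffer() above sizes the reply buffer from the directory's i_files + i_subdirs hint and falls back to progressively smaller page orders when the allocation fails. A userspace sketch of the same fallback idea, with illustrative names and malloc() standing in for __get_free_pages() (not kernel code):

#include <stdio.h>
#include <stdlib.h>

/* shrink the request until the allocator can satisfy it */
static void *alloc_reply_buf(size_t entry_size, size_t want_entries,
			     size_t *got_bytes)
{
	size_t bytes = entry_size * want_entries;
	void *buf;

	while (bytes >= entry_size) {
		buf = malloc(bytes);
		if (buf) {
			*got_bytes = bytes;
			return buf;
		}
		bytes /= 2;	/* like dropping to the next lower page order */
	}
	return NULL;
}

int main(void)
{
	size_t got;
	void *buf = alloc_reply_buf(64, 4096, &got);

	if (buf) {
		printf("got %zu bytes (%zu entries)\n", got, got / 64);
		free(buf);
	}
	return 0;
}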
/*
* Create an mds request.
*/
@@ -1391,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
+ req->r_stamp = CURRENT_TIME;
+
req->r_op = op;
req->r_direct_mode = mode;
return req;
@@ -1434,12 +1603,15 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
struct dentry *temp;
char *path;
int len, pos;
+ unsigned seq;
if (dentry == NULL)
return ERR_PTR(-EINVAL);
retry:
len = 0;
+ seq = read_seqbegin(&rename_lock);
+ rcu_read_lock();
for (temp = dentry; !IS_ROOT(temp);) {
struct inode *inode = temp->d_inode;
if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
@@ -1450,11 +1622,8 @@ retry:
else
len += 1 + temp->d_name.len;
temp = temp->d_parent;
- if (temp == NULL) {
- pr_err("build_path corrupt dentry %p\n", dentry);
- return ERR_PTR(-EINVAL);
- }
}
+ rcu_read_unlock();
if (len)
len--; /* no leading '/' */
@@ -1463,32 +1632,35 @@ retry:
return ERR_PTR(-ENOMEM);
pos = len;
path[pos] = 0; /* trailing null */
+ rcu_read_lock();
for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
- struct inode *inode = temp->d_inode;
+ struct inode *inode;
+ spin_lock(&temp->d_lock);
+ inode = temp->d_inode;
if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
dout("build_path path+%d: %p SNAPDIR\n",
pos, temp);
} else if (stop_on_nosnap && inode &&
ceph_snap(inode) == CEPH_NOSNAP) {
+ spin_unlock(&temp->d_lock);
break;
} else {
pos -= temp->d_name.len;
- if (pos < 0)
+ if (pos < 0) {
+ spin_unlock(&temp->d_lock);
break;
+ }
strncpy(path + pos, temp->d_name.name,
temp->d_name.len);
}
+ spin_unlock(&temp->d_lock);
if (pos)
path[--pos] = '/';
temp = temp->d_parent;
- if (temp == NULL) {
- pr_err("build_path corrupt dentry\n");
- kfree(path);
- return ERR_PTR(-EINVAL);
- }
}
- if (pos != 0) {
+ rcu_read_unlock();
+ if (pos != 0 || read_seqretry(&rename_lock, seq)) {
pr_err("build_path did not end path lookup where "
"expected, namelen is %d, pos is %d\n", len, pos);
/* presumably this is only possible if racing with a
@@ -1502,7 +1674,7 @@ retry:
*base = ceph_ino(temp->d_inode);
*plen = len;
dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, dentry->d_count, *base, len, path);
+ dentry, d_count(dentry), *base, len, path);
return path;
}
@@ -1567,10 +1739,10 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
*ppath);
- } else if (rpath) {
+ } else if (rpath || rino) {
*ino = rino;
*ppath = rpath;
- *pathlen = strlen(rpath);
+ *pathlen = rpath ? strlen(rpath) : 0;
dout(" path %.*s\n", *pathlen, rpath);
}
@@ -1613,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
}
len = sizeof(*head) +
- pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
+ pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+ sizeof(struct timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -1624,12 +1797,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_old_dentry_drop)
len += req->r_old_dentry->d_name.len;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
if (!msg) {
msg = ERR_PTR(-ENOMEM);
goto out_free2;
}
+ msg->hdr.version = 2;
msg->hdr.tid = cpu_to_le64(req->r_tid);
head = msg->front.iov_base;
@@ -1638,8 +1812,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(req->r_uid);
- head->caller_gid = cpu_to_le32(req->r_gid);
+ head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
+ head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
@@ -1666,12 +1840,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
head->num_releases = cpu_to_le16(releases);
+ /* time stamp */
+ ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+
BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- msg->pages = req->r_pages;
- msg->nr_pages = req->r_num_pages;
+ if (req->r_data_len) {
+ /* outbound data set only by ceph_sync_setxattr() */
+ BUG_ON(!req->r_pages);
+ ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0);
+ }
+
msg->hdr.data_len = cpu_to_le32(req->r_data_len);
msg->hdr.data_off = cpu_to_le16(0);
@@ -1785,8 +1966,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
int mds = -1;
int err = -EAGAIN;
- if (req->r_err || req->r_got_result)
+ if (req->r_err || req->r_got_result) {
+ if (req->r_aborted)
+ __unregister_request(mdsc, req);
goto out;
+ }
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
@@ -1856,10 +2040,16 @@ finish:
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head)
{
- struct ceph_mds_request *req, *nreq;
+ struct ceph_mds_request *req;
+ LIST_HEAD(tmp_list);
- list_for_each_entry_safe(req, nreq, head, r_wait) {
+ list_splice_init(head, &tmp_list);
+
+ while (!list_empty(&tmp_list)) {
+ req = list_entry(tmp_list.next,
+ struct ceph_mds_request, r_wait);
list_del_init(&req->r_wait);
+ dout(" wake request %p tid %llu\n", req, req->r_tid);
__do_request(mdsc, req);
}
}
@@ -1913,10 +2103,9 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_locked_dir)
ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_old_dentry)
- ceph_get_cap_refs(
- ceph_inode(req->r_old_dentry->d_parent->d_inode),
- CEPH_CAP_PIN);
+ if (req->r_old_dentry_dir)
+ ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
+ CEPH_CAP_PIN);
/* issue */
mutex_lock(&mdsc->mutex);
@@ -1974,20 +2163,16 @@ out:
}
/*
- * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
+ * Invalidate dir's completeness, dentry lease state on an aborted MDS
* namespace request.
*/
void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{
struct inode *inode = req->r_locked_dir;
- struct ceph_inode_info *ci = ceph_inode(inode);
- dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
- spin_lock(&inode->i_lock);
- ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
- ci->i_release_count++;
- spin_unlock(&inode->i_lock);
+ dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
+ ceph_dir_clear_complete(inode);
if (req->r_dentry)
ceph_invalidate_dentry_lease(req->r_dentry);
if (req->r_old_dentry)
@@ -2040,13 +2225,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* dup? */
if ((req->r_got_unsafe && !head->safe) ||
(req->r_got_safe && head->safe)) {
- pr_warning("got a dup %s reply on %llu from mds%d\n",
+ pr_warn("got a dup %s reply on %llu from mds%d\n",
head->safe ? "safe" : "unsafe", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
if (req->r_got_safe && !head->safe) {
- pr_warning("got unsafe after safe on %llu from mds%d\n",
+ pr_warn("got unsafe after safe on %llu from mds%d\n",
tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
@@ -2063,26 +2248,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
*/
if (result == -ESTALE) {
dout("got ESTALE on request %llu", req->r_tid);
- if (!req->r_inode) {
- /* do nothing; not an authority problem */
- } else if (req->r_direct_mode != USE_AUTH_MDS) {
+ if (req->r_direct_mode != USE_AUTH_MDS) {
dout("not using auth, setting for that now");
req->r_direct_mode = USE_AUTH_MDS;
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
goto out;
} else {
- struct ceph_inode_info *ci = ceph_inode(req->r_inode);
- struct ceph_cap *cap = NULL;
-
- if (req->r_session)
- cap = ceph_get_cap_for_mds(ci,
- req->r_session->s_mds);
-
- dout("already using auth");
- if ((!cap || cap != ci->i_auth_cap) ||
- (cap->mseq != req->r_sent_on_mseq)) {
- dout("but cap changed, so resending");
+ int mds = __choose_mds(mdsc, req);
+ if (mds >= 0 && mds != req->r_session->s_mds) {
+ dout("but auth changed, so resending");
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
goto out;
@@ -2095,7 +2270,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (head->safe) {
req->r_got_safe = true;
__unregister_request(mdsc, req);
- complete_all(&req->r_safe_completion);
if (req->r_got_unsafe) {
/*
@@ -2146,8 +2320,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&req->r_fill_mutex);
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
- if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
- rinfo->dir_nr)
+ if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
+ req->r_op == CEPH_MDS_OP_LSSNAP))
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
@@ -2297,10 +2471,10 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_STALE:
pr_info("mds%d caps went stale, renewing\n",
session->s_mds);
- spin_lock(&session->s_cap_lock);
+ spin_lock(&session->s_gen_ttl_lock);
session->s_cap_gen++;
- session->s_cap_ttl = 0;
- spin_unlock(&session->s_cap_lock);
+ session->s_cap_ttl = jiffies - 1;
+ spin_unlock(&session->s_gen_ttl_lock);
send_renew_caps(mdsc, session);
break;
@@ -2308,6 +2482,10 @@ static void handle_session(struct ceph_mds_session *session,
trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
break;
+ case CEPH_SESSION_FLUSHMSG:
+ send_flushmsg_ack(mdsc, session, seq);
+ break;
+
default:
pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
WARN_ON(1);
@@ -2394,9 +2572,11 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (err)
goto out_free;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
+ cap->mseq = 0; /* and migrate_seq */
+ cap->cap_gen = cap->session->s_cap_gen;
if (recon_state->flock) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2417,43 +2597,50 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
rec.v1.pathbase = cpu_to_le64(pathbase);
reclen = sizeof(rec.v1);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
if (recon_state->flock) {
int num_fcntl_locks, num_flock_locks;
- struct ceph_pagelist_cursor trunc_point;
-
- ceph_pagelist_set_cursor(pagelist, &trunc_point);
- do {
- lock_flocks();
- ceph_count_locks(inode, &num_fcntl_locks,
- &num_flock_locks);
- rec.v2.flock_len = (2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
- unlock_flocks();
-
- /* pre-alloc pagelist */
- ceph_pagelist_truncate(pagelist, &trunc_point);
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_pagelist_reserve(pagelist,
- rec.v2.flock_len);
-
- /* encode locks */
- if (!err) {
- lock_flocks();
- err = ceph_encode_locks(inode,
- pagelist,
- num_fcntl_locks,
- num_flock_locks);
- unlock_flocks();
- }
- } while (err == -ENOSPC);
+ struct ceph_filelock *flocks;
+
+encode_again:
+ spin_lock(&inode->i_lock);
+ ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+ spin_unlock(&inode->i_lock);
+ flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock), GFP_NOFS);
+ if (!flocks) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ spin_lock(&inode->i_lock);
+ err = ceph_encode_locks_to_buffer(inode, flocks,
+ num_fcntl_locks,
+ num_flock_locks);
+ spin_unlock(&inode->i_lock);
+ if (err) {
+ kfree(flocks);
+ if (err == -ENOSPC)
+ goto encode_again;
+ goto out_free;
+ }
+ /*
+ * number of encoded locks is stable, so copy to pagelist
+ */
+ rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
+ (num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock));
+ err = ceph_pagelist_append(pagelist, &rec, reclen);
+ if (!err)
+ err = ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ kfree(flocks);
} else {
err = ceph_pagelist_append(pagelist, &rec, reclen);
}
+ recon_state->nr_caps++;
out_free:
kfree(path);
out_dput:
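The flock handling above replaces the pagelist-truncate retry loop with a simpler shape: count the locks under inode->i_lock, drop the lock to allocate a flat buffer, then re-encode under the lock and start over if the lock count grew in the meantime (-ENOSPC). A userspace sketch of that count / allocate / re-check / retry pattern, using a pthread mutex and an int array as stand-ins (illustrative only):

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *items;		/* may grow or shrink under 'lock' */
static size_t nr_items;

static int snapshot_items(int **out, size_t *out_n)
{
	int *buf = NULL;
	size_t n;

	for (;;) {
		pthread_mutex_lock(&lock);
		n = nr_items;				/* count */
		pthread_mutex_unlock(&lock);

		int *tmp = realloc(buf, n * sizeof(*tmp));	/* allocate unlocked */
		if (n && !tmp) {
			free(buf);
			return -ENOMEM;
		}
		buf = tmp;

		pthread_mutex_lock(&lock);
		if (nr_items <= n) {			/* still fits: copy out */
			n = nr_items;
			if (n)
				memcpy(buf, items, n * sizeof(*buf));
			pthread_mutex_unlock(&lock);
			break;
		}
		pthread_mutex_unlock(&lock);		/* grew: retry, like -ENOSPC */
	}
	*out = buf;
	*out_n = n;
	return 0;
}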
@@ -2481,6 +2668,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct rb_node *p;
int mds = session->s_mds;
int err = -ENOMEM;
+ int s_nr_caps;
struct ceph_pagelist *pagelist;
struct ceph_reconnect_state recon_state;
@@ -2491,7 +2679,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
goto fail_nopagelist;
ceph_pagelist_init(pagelist);
- reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
+ reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
if (!reply)
goto fail_nomsg;
@@ -2499,7 +2687,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
+ ceph_con_close(&session->s_con);
ceph_con_open(&session->s_con,
+ CEPH_ENTITY_TYPE_MDS, mds,
ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
/* replay unsafe requests */
@@ -2510,20 +2700,38 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
dout("session %p state %s\n", session,
session_state_name(session->s_state));
+ spin_lock(&session->s_gen_ttl_lock);
+ session->s_cap_gen++;
+ spin_unlock(&session->s_gen_ttl_lock);
+
+ spin_lock(&session->s_cap_lock);
+ /*
+ * notify __ceph_remove_cap() that we are composing cap reconnect.
+	 * If a cap gets released before being added to the cap reconnect,
+ * __ceph_remove_cap() should skip queuing cap release.
+ */
+ session->s_cap_reconnect = 1;
/* drop old cap expires; we're about to reestablish that state */
discard_cap_releases(mdsc, session);
+ spin_unlock(&session->s_cap_lock);
/* traverse this session's caps */
- err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
+ s_nr_caps = session->s_nr_caps;
+ err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
if (err)
goto fail;
+ recon_state.nr_caps = 0;
recon_state.pagelist = pagelist;
recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
if (err < 0)
goto fail;
+ spin_lock(&session->s_cap_lock);
+ session->s_cap_reconnect = 0;
+ spin_unlock(&session->s_cap_lock);
+
/*
* snaprealms. we provide mds with the ino, seq (version), and
* parent for all of our realms. If the mds has any newer info,
@@ -2544,11 +2752,20 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
goto fail;
}
- reply->pagelist = pagelist;
if (recon_state.flock)
reply->hdr.version = cpu_to_le16(2);
+
+ /* raced with cap release? */
+ if (s_nr_caps != recon_state.nr_caps) {
+ struct page *page = list_first_entry(&pagelist->head,
+ struct page, lru);
+ __le32 *addr = kmap_atomic(page);
+ *addr = cpu_to_le32(recon_state.nr_caps);
+ kunmap_atomic(addr);
+ }
+
reply->hdr.data_len = cpu_to_le32(pagelist->length);
- reply->nr_pages = calc_pages_for(0, pagelist->length);
+ ceph_msg_data_add_pagelist(reply, pagelist);
ceph_con_send(&session->s_con, reply);
mutex_unlock(&session->s_mutex);
@@ -2604,7 +2821,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
session_state_name(s->s_state));
- if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
+ if (i >= newmap->m_max_mds ||
+ memcmp(ceph_mdsmap_get_addr(oldmap, i),
ceph_mdsmap_get_addr(newmap, i),
sizeof(struct ceph_entity_addr))) {
if (s->s_state == CEPH_MDS_SESSION_OPENING) {
@@ -2691,14 +2909,12 @@ static void handle_lease(struct ceph_mds_client *mdsc,
{
struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
- struct ceph_inode_info *ci;
struct dentry *parent, *dentry;
struct ceph_dentry_info *di;
int mds = session->s_mds;
struct ceph_mds_lease *h = msg->front.iov_base;
u32 seq;
struct ceph_vino vino;
- int mask;
struct qstr dname;
int release = 0;
@@ -2709,7 +2925,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
goto bad;
vino.ino = le64_to_cpu(h->ino);
vino.snap = CEPH_NOSNAP;
- mask = le16_to_cpu(h->mask);
seq = le32_to_cpu(h->seq);
dname.name = (void *)h + sizeof(*h) + sizeof(u32);
dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
@@ -2721,14 +2936,13 @@ static void handle_lease(struct ceph_mds_client *mdsc,
/* lookup inode */
inode = ceph_find_inode(sb, vino);
- dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
- ceph_lease_op_name(h->action), mask, vino.ino, inode,
+ dout("handle_lease %s, ino %llx %p %.*s\n",
+ ceph_lease_op_name(h->action), vino.ino, inode,
dname.len, dname.name);
if (inode == NULL) {
dout("handle_lease no inode %llx\n", vino.ino);
goto release;
}
- ci = ceph_inode(inode);
/* dentry */
parent = d_find_alias(inode);
@@ -2747,7 +2961,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
di = ceph_dentry(dentry);
switch (h->action) {
case CEPH_MDS_LEASE_REVOKE:
- if (di && di->lease_session == session) {
+ if (di->lease_session == session) {
if (ceph_seq_cmp(di->lease_seq, seq) > 0)
h->seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
@@ -2756,7 +2970,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
break;
case CEPH_MDS_LEASE_RENEW:
- if (di && di->lease_session == session &&
+ if (di->lease_session == session &&
di->lease_gen == session->s_cap_gen &&
di->lease_renew_from &&
di->lease_renew_after == 0) {
@@ -2808,12 +3022,11 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
dnamelen = dentry->d_name.len;
len += dnamelen;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
if (!msg)
return;
lease = msg->front.iov_base;
lease->action = action;
- lease->mask = cpu_to_le16(1);
lease->ino = cpu_to_le64(ceph_vino(inode).ino);
lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
lease->seq = cpu_to_le32(seq);
@@ -2835,7 +3048,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
* Pass @inode always, @dentry is optional.
*/
void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
- struct dentry *dentry, int mask)
+ struct dentry *dentry)
{
struct ceph_dentry_info *di;
struct ceph_mds_session *session;
@@ -2843,7 +3056,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
BUG_ON(inode == NULL);
BUG_ON(dentry == NULL);
- BUG_ON(mask == 0);
/* is dentry lease valid? */
spin_lock(&dentry->d_lock);
@@ -2853,8 +3065,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
di->lease_gen != di->lease_session->s_cap_gen ||
!time_before(jiffies, dentry->d_time)) {
dout("lease_release inode %p dentry %p -- "
- "no lease on %d\n",
- inode, dentry, mask);
+ "no lease\n",
+ inode, dentry);
spin_unlock(&dentry->d_lock);
return;
}
@@ -2865,8 +3077,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
__ceph_mdsc_drop_dentry_lease(dentry);
spin_unlock(&dentry->d_lock);
- dout("lease_release inode %p dentry %p mask %d to mds%d\n",
- inode, dentry, mask, session->s_mds);
+ dout("lease_release inode %p dentry %p to mds%d\n",
+ inode, dentry, session->s_mds);
ceph_mdsc_lease_send_msg(session, inode, dentry,
CEPH_MDS_LEASE_RELEASE, seq);
ceph_put_mds_session(session);
@@ -2979,8 +3191,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
- if (mdsc->mdsmap == NULL)
+ if (mdsc->mdsmap == NULL) {
+ kfree(mdsc);
return -ENOMEM;
+ }
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
@@ -3002,6 +3216,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->cap_flush_seq = 0;
INIT_LIST_HEAD(&mdsc->cap_dirty);
+ INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
@@ -3131,7 +3346,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
/*
* true if all sessions are closed, or we force unmount
*/
-bool done_closing_sessions(struct ceph_mds_client *mdsc)
+static bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
int i, n = 0;
@@ -3215,9 +3430,15 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
{
struct ceph_mds_client *mdsc = fsc->mdsc;
+ dout("mdsc_destroy %p\n", mdsc);
ceph_mdsc_stop(mdsc);
+
+ /* flush out any connection work with references to us */
+ ceph_msgr_flush();
+
fsc->mdsc = NULL;
kfree(mdsc);
+ dout("mdsc_destroy %p done\n", mdsc);
}
@@ -3298,8 +3519,8 @@ static void con_put(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
+ dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
ceph_put_mds_session(s);
- dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
}
/*
@@ -3311,7 +3532,7 @@ static void peer_reset(struct ceph_connection *con)
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- pr_warning("mds%d closed our session\n", s->s_mds);
+ pr_warn("mds%d closed our session\n", s->s_mds);
send_mds_reconnect(mdsc, s);
}
@@ -3362,39 +3583,37 @@ out:
/*
* authentication
*/
-static int get_authorizer(struct ceph_connection *con,
- void **buf, int *len, int *proto,
- void **reply_buf, int *reply_len, int force_new)
+
+/*
+ * Note: returned pointer is the address of a structure that's
+ * managed separately. Caller must *not* attempt to free it.
+ */
+static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
+ int *proto, int force_new)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
- int ret = 0;
-
- if (force_new && s->s_authorizer) {
- ac->ops->destroy_authorizer(ac, s->s_authorizer);
- s->s_authorizer = NULL;
- }
- if (s->s_authorizer == NULL) {
- if (ac->ops->create_authorizer) {
- ret = ac->ops->create_authorizer(
- ac, CEPH_ENTITY_TYPE_MDS,
- &s->s_authorizer,
- &s->s_authorizer_buf,
- &s->s_authorizer_buf_len,
- &s->s_authorizer_reply_buf,
- &s->s_authorizer_reply_buf_len);
- if (ret)
- return ret;
- }
- }
+ struct ceph_auth_handshake *auth = &s->s_auth;
+ if (force_new && auth->authorizer) {
+ ceph_auth_destroy_authorizer(ac, auth->authorizer);
+ auth->authorizer = NULL;
+ }
+ if (!auth->authorizer) {
+ int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+ auth);
+ if (ret)
+ return ERR_PTR(ret);
+ } else {
+ int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+ auth);
+ if (ret)
+ return ERR_PTR(ret);
+ }
*proto = ac->protocol;
- *buf = s->s_authorizer_buf;
- *len = s->s_authorizer_buf_len;
- *reply_buf = s->s_authorizer_reply_buf;
- *reply_len = s->s_authorizer_reply_buf_len;
- return 0;
+
+ return auth;
}
@@ -3404,7 +3623,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
- return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
+ return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);
}
static int invalidate_authorizer(struct ceph_connection *con)
@@ -3413,12 +3632,32 @@ static int invalidate_authorizer(struct ceph_connection *con)
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
- if (ac->ops->invalidate_authorizer)
- ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
+ ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
}
+static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip)
+{
+ struct ceph_msg *msg;
+ int type = (int) le16_to_cpu(hdr->type);
+ int front_len = (int) le32_to_cpu(hdr->front_len);
+
+ if (con->in_msg)
+ return con->in_msg;
+
+ *skip = 0;
+ msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
+ if (!msg) {
+ pr_err("unable to allocate msg type %d len %d\n",
+ type, front_len);
+ return NULL;
+ }
+
+ return msg;
+}
+
static const struct ceph_connection_operations mds_con_ops = {
.get = con_get,
.put = con_put,
@@ -3427,6 +3666,7 @@ static const struct ceph_connection_operations mds_con_ops = {
.verify_authorizer_reply = verify_authorizer_reply,
.invalidate_authorizer = invalidate_authorizer,
.peer_reset = peer_reset,
+ .alloc_msg = mds_alloc_msg,
};
/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4e3a9cc0bba..e00737cf523 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -11,6 +11,7 @@
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/auth.h>
/*
* Some lock dependencies:
@@ -20,7 +21,7 @@
*
* mdsc->snap_rwsem
*
- * inode->i_lock
+ * ci->i_ceph_lock
* mdsc->snap_flush_lock
* mdsc->cap_delay_lock
*
@@ -66,6 +67,7 @@ struct ceph_mds_reply_info_parsed {
/* for readdir results */
struct {
struct ceph_mds_reply_dirfrag *dir_dir;
+ size_t dir_buf_size;
int dir_nr;
char **dir_dname;
u32 *dir_dname_len;
@@ -73,6 +75,12 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_info_in *dir_in;
u8 dir_complete, dir_end;
};
+
+ /* for create results */
+ struct {
+ bool has_create_ino;
+ u64 ino;
+ };
};
/* encoded blob describing snapshot contexts for certain
@@ -113,17 +121,19 @@ struct ceph_mds_session {
struct ceph_connection s_con;
- struct ceph_authorizer *s_authorizer;
- void *s_authorizer_buf, *s_authorizer_reply_buf;
- size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
+ struct ceph_auth_handshake s_auth;
- /* protected by s_cap_lock */
- spinlock_t s_cap_lock;
+ /* protected by s_gen_ttl_lock */
+ spinlock_t s_gen_ttl_lock;
u32 s_cap_gen; /* inc each time we get mds stale msg */
unsigned long s_cap_ttl; /* when session caps expire */
+
+ /* protected by s_cap_lock */
+ spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
+ int s_cap_reconnect;
struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator;
@@ -171,6 +181,7 @@ struct ceph_mds_request {
struct inode *r_inode; /* arg1 */
struct dentry *r_dentry; /* arg1 */
struct dentry *r_old_dentry; /* arg2: rename from or link from */
+ struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
char *r_path1, *r_path2;
struct ceph_vino r_ino1, r_ino2;
@@ -181,8 +192,9 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
- uid_t r_uid;
- gid_t r_gid;
+ kuid_t r_uid;
+ kgid_t r_gid;
+ struct timespec r_stamp;
/* for choosing which mds to send this request to */
int r_direct_mode;
@@ -278,6 +290,7 @@ struct ceph_mds_client {
u64 cap_flush_seq;
struct list_head cap_dirty; /* inodes with dirty caps */
+	struct list_head cap_dirty_migrating; /* ...that are migrating... */
int num_cap_flushing; /* # caps we are flushing */
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
@@ -332,10 +345,11 @@ extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
struct inode *inode,
- struct dentry *dn, int mask);
+ struct dentry *dn);
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
-
+extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir);
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
@@ -372,6 +386,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
struct ceph_msg *msg);
+extern struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 73b7d44e8a3..261531e55e9 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
{
int n = 0;
int i;
- char r;
+
+ /* special case for one mds */
+ if (1 == m->m_max_mds && m->m_info[0].state > 0)
+ return 0;
/* count */
for (i = 0; i < m->m_max_mds; i++)
@@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
return -1;
/* pick */
- get_random_bytes(&r, 1);
- n = r % n;
+ n = prandom_u32() % n;
i = 0;
for (i = 0; n > 0; i++, n--)
while (m->m_info[i].state <= 0)
@@ -59,6 +61,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
return ERR_PTR(-ENOMEM);
ceph_decode_16_safe(p, end, version, bad);
+ if (version > 3) {
+		pr_warn("got mdsmap version %d > 3, failing\n", version);
+ goto bad;
+ }
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
m->m_epoch = ceph_decode_32(p);
@@ -86,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u32 num_export_targets;
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
+ struct ceph_mds_info *info;
ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
global_id = ceph_decode_64(p);
@@ -120,37 +127,40 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
- if (mds >= 0 && mds < m->m_max_mds && state > 0) {
- m->m_info[mds].global_id = global_id;
- m->m_info[mds].state = state;
- m->m_info[mds].addr = addr;
- m->m_info[mds].laggy =
- (laggy_since.tv_sec != 0 ||
- laggy_since.tv_nsec != 0);
- m->m_info[mds].num_export_targets = num_export_targets;
- if (num_export_targets) {
- m->m_info[mds].export_targets =
- kcalloc(num_export_targets, sizeof(u32),
- GFP_NOFS);
- for (j = 0; j < num_export_targets; j++)
- m->m_info[mds].export_targets[j] =
- ceph_decode_32(&pexport_targets);
- } else {
- m->m_info[mds].export_targets = NULL;
- }
+
+ if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+ continue;
+
+ info = &m->m_info[mds];
+ info->global_id = global_id;
+ info->state = state;
+ info->addr = addr;
+ info->laggy = (laggy_since.tv_sec != 0 ||
+ laggy_since.tv_nsec != 0);
+ info->num_export_targets = num_export_targets;
+ if (num_export_targets) {
+ info->export_targets = kcalloc(num_export_targets,
+ sizeof(u32), GFP_NOFS);
+ if (info->export_targets == NULL)
+ goto badmem;
+ for (j = 0; j < num_export_targets; j++)
+ info->export_targets[j] =
+ ceph_decode_32(&pexport_targets);
+ } else {
+ info->export_targets = NULL;
}
}
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
m->m_num_data_pg_pools = n;
- m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+ m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
if (!m->m_data_pg_pools)
goto badmem;
- ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+ ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
for (i = 0; i < n; i++)
- m->m_data_pg_pools[i] = ceph_decode_32(p);
- m->m_cas_pg_pool = ceph_decode_32(p);
+ m->m_data_pg_pools[i] = ceph_decode_64(p);
+ m->m_cas_pg_pool = ceph_decode_64(p);
/* ok, we don't care about the rest. */
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
@@ -164,7 +174,7 @@ bad:
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
ceph_mdsmap_destroy(m);
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(err);
}
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
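The random-MDS selection in ceph_mdsmap_get_random_mds() above counts the ranks that are up, draws a random index with prandom_u32() % n, and then walks the map again skipping ranks that are down. A small userspace rewrite of that idea (illustrative; rand() stands in for prandom_u32(), and the skip loop is written out explicitly rather than copied from the kernel):

#include <stdio.h>
#include <stdlib.h>

/* return a random index i with state[i] > 0, or -1 if none are up */
static int pick_random_up(const int *state, int max)
{
	int up = 0, n, i;

	for (i = 0; i < max; i++)
		if (state[i] > 0)
			up++;
	if (!up)
		return -1;

	n = rand() % up;
	for (i = 0; i < max; i++) {
		if (state[i] <= 0)
			continue;	/* skip ranks that are down */
		if (n-- == 0)
			return i;
	}
	return -1;	/* unreachable */
}

int main(void)
{
	int state[] = { 0, 2, 0, 1 };	/* ranks 1 and 3 are up */

	printf("picked mds%d\n", pick_random_up(state, 4));
	return 0;
}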
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index f40b9139e43..f01645a2775 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -206,7 +206,7 @@ void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
up_write(&mdsc->snap_rwsem);
} else {
spin_lock(&mdsc->snap_empty_lock);
- list_add(&mdsc->snap_empty, &realm->empty_item);
+ list_add(&realm->empty_item, &mdsc->snap_empty);
spin_unlock(&mdsc->snap_empty_lock);
}
}
@@ -296,8 +296,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
int err = 0;
- int i;
- int num = realm->num_prior_parent_snaps + realm->num_snaps;
+ u32 num = realm->num_prior_parent_snaps + realm->num_snaps;
/*
* build parent context, if it hasn't been built.
@@ -321,28 +320,29 @@ static int build_snap_context(struct ceph_snap_realm *realm)
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
+ dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
" (unchanged)\n",
realm->ino, realm, realm->cached_context,
realm->cached_context->seq,
- realm->cached_context->num_snaps);
+ (unsigned int) realm->cached_context->num_snaps);
return 0;
}
/* alloc new snap context */
err = -ENOMEM;
- if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+ if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
goto fail;
- snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
+ snapc = ceph_create_snap_context(num, GFP_NOFS);
if (!snapc)
goto fail;
- atomic_set(&snapc->nref, 1);
/* build (reverse sorted) snap vector */
num = 0;
snapc->seq = realm->seq;
if (parent) {
- /* include any of parent's snaps occuring _after_ my
+ u32 i;
+
+ /* include any of parent's snaps occurring _after_ my
parent became my parent */
for (i = 0; i < parent->cached_context->num_snaps; i++)
if (parent->cached_context->snaps[i] >=
@@ -361,8 +361,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
- realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
+ dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
+ realm->ino, realm, snapc, snapc->seq,
+ (unsigned int) snapc->num_snaps);
if (realm->cached_context)
ceph_put_snap_context(realm->cached_context);
@@ -402,9 +403,9 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm)
* helper to allocate and decode an array of snapids. free prior
* instance, if any.
*/
-static int dup_array(u64 **dst, __le64 *src, int num)
+static int dup_array(u64 **dst, __le64 *src, u32 num)
{
- int i;
+ u32 i;
kfree(*dst);
if (num) {
@@ -446,9 +447,18 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
return;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
+
+ /*
+ * If there is a write in progress, treat that as a dirty Fw,
+ * even though it hasn't completed yet; by the time we finish
+ * up this capsnap it will be.
+ */
+ if (used & CEPH_CAP_FILE_WR)
+ dirty |= CEPH_CAP_FILE_WR;
+
if (__ceph_have_pending_cap_snap(ci)) {
/* there is no point in queuing multiple "pending" cap_snaps,
as no new writes are allowed to start when pending, so any
@@ -456,15 +466,21 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
cap_snap. lucky us. */
dout("queue_cap_snap %p already pending\n", inode);
kfree(capsnap);
- } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
- (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
- CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
+ } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+ CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
struct ceph_snap_context *snapc = ci->i_head_snapc;
- dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
- capsnap, snapc);
- igrab(inode);
-
+ /*
+ * if we are a sync write, we may need to go to the snaprealm
+ * to get the current snapc.
+ */
+ if (!snapc)
+ snapc = ci->i_snap_realm->cached_context;
+
+ dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
+ inode, capsnap, snapc, ceph_cap_string(dirty));
+ ihold(inode);
+
atomic_set(&capsnap->nref, 1);
capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item);
@@ -513,7 +529,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
kfree(capsnap);
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -522,7 +538,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
*
* If capsnap can now be flushed, add to snap_flush list, and return 1.
*
- * Caller must hold i_lock.
+ * Caller must hold i_ceph_lock.
*/
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
@@ -722,11 +738,11 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
ci = list_first_entry(&mdsc->snap_flush_list,
struct ceph_inode_info, i_snap_flush_item);
inode = &ci->vfs_inode;
- igrab(inode);
+ ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
__ceph_flush_snaps(ci, &session, 0);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
@@ -832,7 +848,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
continue;
ci = ceph_inode(inode);
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (!ci->i_snap_realm)
goto skip_inode;
/*
@@ -861,7 +877,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
oldrealm = ci->i_snap_realm;
ci->i_snap_realm = realm;
spin_unlock(&realm->inodes_with_caps_lock);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
ceph_get_snap_realm(mdsc, realm);
ceph_put_snap_realm(mdsc, oldrealm);
@@ -870,7 +886,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
continue;
skip_inode:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
iput(inode);
}
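build_snap_context() above assembles a realm's snapshot context by taking the parent's snap ids at or after the point this realm became its child, adding the realm's own ids, and reverse-sorting the result. A userspace sketch of that merge-and-sort step (illustrative names; the kernel version also folds in prior_parent_snaps and reference-counts the resulting context):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int cmp_u64_rev(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

	return x < y ? 1 : x > y ? -1 : 0;
}

/* dst must have room for parent_n + own_n entries */
static size_t build_snap_vector(uint64_t *dst,
				const uint64_t *parent, size_t parent_n,
				uint64_t parent_since,
				const uint64_t *own, size_t own_n)
{
	size_t num = 0, i;

	for (i = 0; i < parent_n; i++)
		if (parent[i] >= parent_since)	/* only snaps after we respliced */
			dst[num++] = parent[i];
	memcpy(dst + num, own, own_n * sizeof(*own));
	num += own_n;
	qsort(dst, num, sizeof(*dst), cmp_u64_rev);	/* newest first */
	return num;
}

int main(void)
{
	uint64_t parent[] = { 10, 20, 30 }, own[] = { 25, 35 }, out[5];
	size_t i, n = build_snap_vector(out, parent, 3, 20, own, 2);

	for (i = 0; i < n; i++)
		printf("%llu ", (unsigned long long)out[i]);
	printf("\n");	/* 35 30 25 20 */
	return 0;
}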
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index cd5097d7c80..51cc23e4811 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
case CEPH_MDS_STATE_BOOT: return "up:boot";
case CEPH_MDS_STATE_STANDBY: return "up:standby";
case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
+ case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
case CEPH_MDS_STATE_CREATING: return "up:creating";
case CEPH_MDS_STATE_STARTING: return "up:starting";
/* up and in */
@@ -40,6 +41,8 @@ const char *ceph_session_op_name(int op)
case CEPH_SESSION_RENEWCAPS: return "renewcaps";
case CEPH_SESSION_STALE: return "stale";
case CEPH_SESSION_RECALL_STATE: return "recall_state";
+ case CEPH_SESSION_FLUSHMSG: return "flushmsg";
+ case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
}
return "???";
}
@@ -50,10 +53,14 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUP: return "lookup";
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
+ case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
+ case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_SETLAYOUT: return "setlayout";
+ case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
case CEPH_MDS_OP_READDIR: return "readdir";
case CEPH_MDS_OP_MKNOD: return "mknod";
case CEPH_MDS_OP_LINK: return "link";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a9e78b4a258..06150fd745a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -17,7 +17,9 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
+#include <linux/ceph/ceph_features.h>
#include <linux/ceph/decode.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/auth.h>
@@ -70,17 +72,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
/*
* express utilization in terms of large blocks to avoid
* overflow on 32-bit machines.
+ *
+ * NOTE: for the time being, we make bsize == frsize to humor
+ * not-yet-ancient versions of glibc that are broken.
+ * Someday, we will probably want to report a real block
+ * size... whatever that may mean for a network file system!
*/
buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+ buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
- buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
- (CEPH_BLOCK_SHIFT-10);
+ buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
- buf->f_frsize = PAGE_CACHE_SIZE;
/* leave fsid little-endian, regardless of host endianness */
fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
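The statfs conversion above reports utilization in CEPH_BLOCK-sized units so the counts fit comfortably in 32 bits; with the client's 4 MiB block (a block shift of 22), a KiB count is simply shifted right by 12. A tiny standalone check of that arithmetic (the shift value here is an assumption matching the ceph client's definition):

#include <stdint.h>
#include <stdio.h>

#define CEPH_BLOCK_SHIFT 22	/* 4 MiB blocks, as assumed above */

int main(void)
{
	uint64_t kb = 10ULL * 1024 * 1024 * 1024;	/* 10 TiB expressed in KiB */
	uint64_t blocks = kb >> (CEPH_BLOCK_SHIFT - 10);

	/* 10 TiB / 4 MiB = 2621440 blocks; fits easily in 32 bits */
	printf("%llu blocks of %u KiB\n",
	       (unsigned long long)blocks, 1u << (CEPH_BLOCK_SHIFT - 10));
	return 0;
}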
@@ -115,6 +121,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
enum {
Opt_wsize,
Opt_rsize,
+ Opt_rasize,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
Opt_cap_release_safety,
@@ -130,13 +137,24 @@ enum {
Opt_nodirstat,
Opt_rbytes,
Opt_norbytes,
+ Opt_asyncreaddir,
Opt_noasyncreaddir,
+ Opt_dcache,
+ Opt_nodcache,
Opt_ino32,
+ Opt_noino32,
+ Opt_fscache,
+ Opt_nofscache,
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ Opt_acl,
+#endif
+ Opt_noacl
};
static match_table_t fsopt_tokens = {
{Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"},
+ {Opt_rasize, "rasize=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -150,8 +168,18 @@ static match_table_t fsopt_tokens = {
{Opt_nodirstat, "nodirstat"},
{Opt_rbytes, "rbytes"},
{Opt_norbytes, "norbytes"},
+ {Opt_asyncreaddir, "asyncreaddir"},
{Opt_noasyncreaddir, "noasyncreaddir"},
+ {Opt_dcache, "dcache"},
+ {Opt_nodcache, "nodcache"},
{Opt_ino32, "ino32"},
+ {Opt_noino32, "noino32"},
+ {Opt_fscache, "fsc"},
+ {Opt_nofscache, "nofsc"},
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ {Opt_acl, "acl"},
+#endif
+ {Opt_noacl, "noacl"},
{-1, NULL}
};
@@ -197,6 +225,9 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_rsize:
fsopt->rsize = intval;
break;
+ case Opt_rasize:
+ fsopt->rasize = intval;
+ break;
case Opt_caps_wanted_delay_min:
fsopt->caps_wanted_delay_min = intval;
break;
@@ -224,12 +255,38 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_norbytes:
fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
break;
+ case Opt_asyncreaddir:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
+ break;
case Opt_noasyncreaddir:
fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
break;
+ case Opt_dcache:
+ fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
+ break;
+ case Opt_nodcache:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
+ break;
case Opt_ino32:
fsopt->flags |= CEPH_MOUNT_OPT_INO32;
break;
+ case Opt_noino32:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
+ break;
+ case Opt_fscache:
+ fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+ break;
+ case Opt_nofscache:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
+ break;
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ case Opt_acl:
+ fsopt->sb_flags |= MS_POSIXACL;
+ break;
+#endif
+ case Opt_noacl:
+ fsopt->sb_flags &= ~MS_POSIXACL;
+ break;
default:
BUG_ON(token);
}
@@ -282,7 +339,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
{
struct ceph_mount_options *fsopt;
const char *dev_name_end;
- int err = -ENOMEM;
+ int err;
+
+ if (!dev_name || !*dev_name)
+ return -EINVAL;
fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
if (!fsopt)
@@ -290,39 +350,54 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
- fsopt->sb_flags = flags;
- fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+ fsopt->sb_flags = flags;
+ fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
- fsopt->rsize = CEPH_RSIZE_DEFAULT;
- fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ fsopt->rsize = CEPH_RSIZE_DEFAULT;
+ fsopt->rasize = CEPH_RASIZE_DEFAULT;
+ fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
- fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
- fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
- fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
- fsopt->congestion_kb = default_congestion_kb();
-
- /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
- err = -EINVAL;
- if (!dev_name)
- goto out;
- *path = strstr(dev_name, ":/");
- if (*path == NULL) {
- pr_err("device name is missing path (no :/ in %s)\n",
- dev_name);
- goto out;
- }
- dev_name_end = *path;
- dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
+ fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+ fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+ fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+ fsopt->congestion_kb = default_congestion_kb();
- /* path on server */
- *path += 2;
+ /*
+ * Distinguish the server list from the path in "dev_name".
+ * Internally we do not include the leading '/' in the path.
+ *
+ * "dev_name" will look like:
+ * <server_spec>[,<server_spec>...]:[<path>]
+ * where
+ * <server_spec> is <ip>[:<port>]
+ * <path> is optional, but if present must begin with '/'
+ */
+ dev_name_end = strchr(dev_name, '/');
+ if (dev_name_end) {
+ /* skip over leading '/' for path */
+ *path = dev_name_end + 1;
+ } else {
+ /* path is empty */
+ dev_name_end = dev_name + strlen(dev_name);
+ *path = dev_name_end;
+ }
+ err = -EINVAL;
+ dev_name_end--; /* back up to ':' separator */
+ if (dev_name_end < dev_name || *dev_name_end != ':') {
+ pr_err("device name is missing path (no : separator in %s)\n",
+ dev_name);
+ goto out;
+ }
+ dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
dout("server path '%s'\n", *path);
- err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+ *popt = ceph_parse_options(options, dev_name, dev_name_end,
parse_fsopt_token, (void *)fsopt);
- if (err)
+ if (IS_ERR(*popt)) {
+ err = PTR_ERR(*popt);
goto out;
+ }
/* success */
*pfsopt = fsopt;
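
The reworked dev_name handling above locates the first '/' to find the path, then requires the byte just before it to be the ':' separator, so a name such as "mon1:6789,mon2:6789:/backups" splits into a server list and a path with the leading '/' dropped. A minimal user-space sketch of the same split (split_dev_name is a hypothetical helper, not the kernel code):

#include <stdio.h>
#include <string.h>

/*
 * Mirror of the split done in parse_mount_options():
 *   <server_spec>[,<server_spec>...]:[/<path>]
 * The stored path omits the leading '/', and the character just
 * before the path (or the final character, if no path) must be ':'.
 */
static int split_dev_name(const char *dev_name, const char **path,
			  size_t *server_len)
{
	const char *end = strchr(dev_name, '/');

	if (end) {
		*path = end + 1;		/* skip the leading '/' */
	} else {
		end = dev_name + strlen(dev_name);
		*path = end;			/* empty path */
	}

	if (end == dev_name || end[-1] != ':')
		return -1;			/* missing ':' separator */

	*server_len = (size_t)(end - 1 - dev_name);
	return 0;
}

int main(void)
{
	const char *dev = "192.168.0.1:6789,192.168.0.2:6789:/backups";
	const char *path;
	size_t len;

	if (split_dev_name(dev, &path, &len))
		return 1;
	printf("servers='%.*s' path='%s'\n", (int)len, dev, path);
	return 0;
}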
@@ -336,11 +411,11 @@ out:
/**
* ceph_show_options - Show mount options in /proc/mounts
* @m: seq_file to write to
- * @mnt: mount descriptor
+ * @root: root of that (sub)tree
*/
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int ceph_show_options(struct seq_file *m, struct dentry *root)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
struct ceph_mount_options *fsopt = fsc->mount_options;
struct ceph_options *opt = fsc->client->options;
@@ -353,15 +428,13 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
if (opt->name)
seq_printf(m, ",name=%s", opt->name);
- if (opt->secret)
+ if (opt->key)
seq_puts(m, ",secret=<hidden>");
if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
- if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
- seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, ",osdkeepalivetimeout=%d",
opt->osd_keepalive_timeout);
@@ -372,11 +445,28 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, ",norbytes");
if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir");
+ if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
+ seq_puts(m, ",dcache");
+ else
+ seq_puts(m, ",nodcache");
+ if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
+ seq_puts(m, ",fsc");
+ else
+ seq_puts(m, ",nofsc");
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ if (fsopt->sb_flags & MS_POSIXACL)
+ seq_puts(m, ",acl");
+ else
+ seq_puts(m, ",noacl");
+#endif
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
seq_printf(m, ",rsize=%d", fsopt->rsize);
+ if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
+ seq_printf(m, ",rasize=%d", fsopt->rasize);
if (fsopt->congestion_kb != default_congestion_kb())
seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
@@ -419,24 +509,29 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
/*
* create a new fs client
*/
-struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
+ const u64 supported_features =
+ CEPH_FEATURE_FLOCK |
+ CEPH_FEATURE_DIRLAYOUTHASH;
+ const u64 required_features = 0;
+ int page_count;
+ size_t size;
int err = -ENOMEM;
fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
if (!fsc)
return ERR_PTR(-ENOMEM);
- fsc->client = ceph_create_client(opt, fsc);
+ fsc->client = ceph_create_client(opt, fsc, supported_features,
+ required_features);
if (IS_ERR(fsc->client)) {
err = PTR_ERR(fsc->client);
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
- CEPH_FEATURE_DIRLAYOUTHASH;
fsc->client->monc.want_mdsmap = 1;
fsc->mount_options = fsopt;
@@ -467,16 +562,24 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
/* set up mempools */
err = -ENOMEM;
- fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
- fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+ page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
+ size = sizeof (struct page *) * (page_count ? page_count : 1);
+ fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
if (!fsc->wb_pagevec_pool)
goto fail_trunc_wq;
+ /* setup fscache */
+ if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
+ (ceph_fscache_register_fs(fsc) != 0))
+ goto fail_fscache;
+
/* caps */
fsc->min_caps = fsopt->max_readdir;
return fsc;
+fail_fscache:
+ ceph_fscache_unregister_fs(fsc);
fail_trunc_wq:
destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq:
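
The wb_pagevec_pool change above matters because mempool_create_kmalloc_pool() takes a size in bytes, not a page count: each pool element is now sized to hold the array of page pointers needed for one wsize write. As a rough worked example (assuming 4 KiB pages and 8-byte pointers), a wsize of 16 MiB gives page_count = 4096 and a 32 KiB allocation per element, while a wsize of zero (or anything under a page) falls back to room for a single page pointer.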
@@ -492,10 +595,12 @@ fail:
return ERR_PTR(err);
}
-void destroy_fs_client(struct ceph_fs_client *fsc)
+static void destroy_fs_client(struct ceph_fs_client *fsc)
{
dout("destroy_fs_client %p\n", fsc);
+ ceph_fscache_unregister_fs(fsc);
+
destroy_workqueue(fsc->wb_wq);
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
@@ -530,6 +635,8 @@ static void ceph_inode_init_once(void *foo)
static int __init init_caches(void)
{
+ int error = -ENOMEM;
+
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
@@ -553,23 +660,33 @@ static int __init init_caches(void)
if (ceph_file_cachep == NULL)
goto bad_file;
- return 0;
+ if ((error = ceph_fscache_register()))
+ goto bad_file;
+ return 0;
bad_file:
kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
- return -ENOMEM;
+ return error;
}
static void destroy_caches(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
+
+ ceph_fscache_unregister();
}
@@ -592,6 +709,7 @@ static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
.destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode,
+ .drop_inode = ceph_drop_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.show_options = ceph_show_options,
@@ -626,17 +744,25 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err == 0) {
- dout("open_root_inode success\n");
- if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
- fsc->sb->s_root == NULL)
- root = d_alloc_root(req->r_target_inode);
- else
- root = d_obtain_alias(req->r_target_inode);
+ struct inode *inode = req->r_target_inode;
req->r_target_inode = NULL;
+ dout("open_root_inode success\n");
+ if (ceph_ino(inode) == CEPH_INO_ROOT &&
+ fsc->sb->s_root == NULL) {
+ root = d_make_root(inode);
+ if (!root) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ } else {
+ root = d_obtain_alias(inode);
+ }
+ ceph_init_dentry(root);
dout("open_root_inode success, root dentry is %p\n", root);
} else {
root = ERR_PTR(err);
}
+out:
ceph_mdsc_put_request(req);
return root;
}
@@ -717,6 +843,7 @@ static int ceph_set_super(struct super_block *s, void *data)
s->s_flags = fsc->mount_options->sb_flags;
s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
+ s->s_xattr = ceph_xattr_handlers;
s->s_fs_info = fsc;
fsc->sb = s;
@@ -775,12 +902,16 @@ static int ceph_register_bdi(struct super_block *sb,
{
int err;
- /* set ra_pages based on rsize mount option? */
- if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+ /* set ra_pages based on rasize mount option? */
+ if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
fsc->backing_dev_info.ra_pages =
- (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+ (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
- err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
+ else
+ fsc->backing_dev_info.ra_pages =
+ default_backing_dev_info.ra_pages;
+
+ err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
atomic_long_inc_return(&bdi_seq));
if (!err)
sb->s_bdi = &fsc->backing_dev_info;
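
With readahead now driven by rasize rather than rsize, the default rasize of 8192*1024 bytes translates (assuming 4 KiB pages) into ra_pages = (8388608 + 4095) >> 12 = 2048 pages of readahead, while a rasize smaller than one page simply inherits default_backing_dev_info.ra_pages.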
@@ -800,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
struct ceph_options *opt = NULL;
dout("ceph_mount\n");
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ flags |= MS_POSIXACL;
+#endif
err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
if (err < 0) {
res = ERR_PTR(err);
@@ -810,8 +945,8 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
fsc = create_fs_client(fsopt, opt);
if (IS_ERR(fsc)) {
res = ERR_CAST(fsc);
- kfree(fsopt);
- kfree(opt);
+ destroy_mount_options(fsopt);
+ ceph_destroy_options(opt);
goto out_final;
}
@@ -823,7 +958,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
if (ceph_test_opt(fsc->client, NOSHARE))
compare_super = NULL;
- sb = sget(fs_type, compare_super, ceph_set_super, fsc);
+ sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
if (IS_ERR(sb)) {
res = ERR_CAST(sb);
goto out;
@@ -880,6 +1015,7 @@ static struct file_system_type ceph_fs_type = {
.kill_sb = ceph_kill_sb,
.fs_flags = FS_RENAME_DOES_D_MOVE,
};
+MODULE_ALIAS_FS("ceph");
#define _STRINGIFY(x) #x
#define STRINGIFY(x) _STRINGIFY(x)
@@ -890,6 +1026,8 @@ static int __init init_ceph(void)
if (ret)
goto out;
+ ceph_flock_init();
+ ceph_xattr_init();
ret = register_filesystem(&ceph_fs_type);
if (ret)
goto out_icache;
@@ -899,6 +1037,7 @@ static int __init init_ceph(void)
return 0;
out_icache:
+ ceph_xattr_exit();
destroy_caches();
out:
return ret;
@@ -908,6 +1047,7 @@ static void __exit exit_ceph(void)
{
dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type);
+ ceph_xattr_exit();
destroy_caches();
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 619fe719968..12b20744e38 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -13,21 +13,28 @@
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/slab.h>
+#include <linux/posix_acl.h>
#include <linux/ceph/libceph.h>
+#ifdef CONFIG_CEPH_FSCACHE
+#include <linux/fscache.h>
+#endif
+
/* f_type in struct statfs */
#define CEPH_SUPER_MAGIC 0x00c36400
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
-#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
+#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
+#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
+#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
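
Raising CEPH_BLOCK_SHIFT from 20 to 22 quadruples the statfs accounting unit from 1 MiB to 4 MiB blocks. As a worked example (assuming a 32-bit f_blocks field on the 32-bit machines the comment mentions), 2^32 blocks of 2^22 bytes can describe up to 2^54 bytes, i.e. 16 PiB of capacity, versus 4 PiB with the old 1 MiB granularity.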
@@ -36,7 +43,8 @@
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */
+#define CEPH_RSIZE_DEFAULT 0 /* max read size */
+#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */
#define CEPH_MAX_READDIR_DEFAULT 1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
@@ -45,8 +53,9 @@ struct ceph_mount_options {
int flags;
int sb_flags;
- int wsize;
- int rsize; /* max readahead */
+ int wsize; /* max write size */
+ int rsize; /* max read size */
+ int rasize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
int cap_release_safety;
@@ -87,6 +96,11 @@ struct ceph_fs_client {
struct dentry *debugfs_bdi;
struct dentry *debugfs_mdsc, *debugfs_mdsmap;
#endif
+
+#ifdef CONFIG_CEPH_FSCACHE
+ struct fscache_cookie *fscache;
+ struct workqueue_struct *revalidate_wq;
+#endif
};
@@ -134,9 +148,9 @@ struct ceph_cap_snap {
int issued, dirty;
struct ceph_snap_context *context;
- mode_t mode;
- uid_t uid;
- gid_t gid;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
struct ceph_buffer *xattr_blob;
u64 xattr_version;
@@ -235,11 +249,14 @@ struct ceph_inode_xattrs_info {
struct ceph_inode_info {
struct ceph_vino i_vino; /* ceph ino + snap */
+ spinlock_t i_ceph_lock;
+
u64 i_version;
u32 i_time_warp_seq;
unsigned i_ceph_flags;
- unsigned long i_release_count;
+ atomic_t i_release_count;
+ atomic_t i_complete_count;
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
@@ -249,14 +266,13 @@ struct ceph_inode_info {
struct timespec i_rctime;
u64 i_rbytes, i_rfiles, i_rsubdirs;
u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
struct rb_root i_fragtree;
struct mutex i_fragtree_mutex;
struct ceph_inode_xattrs_info i_xattrs;
- /* capabilities. protected _both_ by i_lock and cap->session's
+ /* capabilities. protected _both_ by i_ceph_lock and cap->session's
* s_mutex. */
struct rb_root i_caps; /* cap list */
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
@@ -271,9 +287,6 @@ struct ceph_inode_info {
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
- int i_cap_exporting_mds; /* to handle cap migration between */
- unsigned i_cap_exporting_mseq; /* mds's. */
- unsigned i_cap_exporting_issued;
struct ceph_cap_reservation i_cap_migration_resv;
struct list_head i_cap_snaps; /* snapped state pending flush to mds */
struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
@@ -282,6 +295,7 @@ struct ceph_inode_info {
int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+ struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */
u64 i_truncate_size; /* and the size we last truncated down to */
int i_truncate_pending; /* still need to call vmtruncate */
@@ -293,7 +307,7 @@ struct ceph_inode_info {
/* held references to caps */
int i_pin_ref;
- int i_rd_ref, i_rdcache_ref, i_wr_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
u32 i_shared_gen; /* increment each time we get FILE_SHARED */
u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
@@ -313,6 +327,11 @@ struct ceph_inode_info {
struct work_struct i_vmtruncate_work;
+#ifdef CONFIG_CEPH_FSCACHE
+ struct fscache_cookie *fscache;
+ u32 i_fscache_gen; /* sequence, for delayed fscache validate */
+ struct work_struct i_revalidate_work;
+#endif
struct inode vfs_inode; /* at end */
};
@@ -344,11 +363,12 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
* x86_64+ino32 64 32
* x86_64 64 64
*/
-static inline u32 ceph_ino_to_ino32(ino_t ino)
+static inline u32 ceph_ino_to_ino32(__u64 vino)
{
- ino ^= ino >> (sizeof(ino) * 8 - 32);
+ u32 ino = vino & 0xffffffff;
+ ino ^= vino >> 32;
if (!ino)
- ino = 1;
+ ino = 2;
return ino;
}
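
The rewritten ceph_ino_to_ino32() folds the full 64-bit vino into 32 bits by xor-ing its two halves and never returns 0. A small self-contained sketch of the same folding (an illustrative re-implementation, not the kernel function):

#include <stdio.h>
#include <stdint.h>

/* Fold a 64-bit ceph ino into 32 bits the way ceph_ino_to_ino32()
 * does: xor the low and high halves, and avoid returning 0. */
static uint32_t fold_ino32(uint64_t vino)
{
	uint32_t ino = (uint32_t)(vino & 0xffffffff);

	ino ^= (uint32_t)(vino >> 32);
	if (!ino)
		ino = 2;
	return ino;
}

int main(void)
{
	/* distinct halves simply mix: 0x00000001 ^ 0x12345678 */
	printf("%#x\n", (unsigned)fold_ino32(0x1234567800000001ULL));
	/* identical halves would collapse to 0, so 2 is returned instead */
	printf("%#x\n", (unsigned)fold_ino32(0x0000000100000001ULL));
	return 0;
}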
@@ -357,11 +377,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino)
*/
static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
{
- ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
#if BITS_PER_LONG == 32
- ino = ceph_ino_to_ino32(ino);
+ return ceph_ino_to_ino32(vino.ino);
+#else
+ return (ino_t)vino.ino;
#endif
- return ino;
}
/*
@@ -413,38 +433,35 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
/*
* Ceph inode.
*/
-#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
#define CEPH_I_NODELAY 4 /* do not delay cap release */
#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
-static inline void ceph_i_clear(struct inode *inode, unsigned mask)
+static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
+ int release_count)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- spin_lock(&inode->i_lock);
- ci->i_ceph_flags &= ~mask;
- spin_unlock(&inode->i_lock);
+ atomic_set(&ci->i_complete_count, release_count);
}
-static inline void ceph_i_set(struct inode *inode, unsigned mask)
+static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
+ atomic_inc(&ci->i_release_count);
+}
- spin_lock(&inode->i_lock);
- ci->i_ceph_flags |= mask;
- spin_unlock(&inode->i_lock);
+static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
+{
+ return atomic_read(&ci->i_complete_count) ==
+ atomic_read(&ci->i_release_count);
}
-static inline bool ceph_i_test(struct inode *inode, unsigned mask)
+static inline void ceph_dir_clear_complete(struct inode *inode)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- bool r;
+ __ceph_dir_clear_complete(ceph_inode(inode));
+}
- spin_lock(&inode->i_lock);
- r = (ci->i_ceph_flags & mask) == mask;
- spin_unlock(&inode->i_lock);
- return r;
+static inline bool ceph_dir_is_complete(struct inode *inode)
+{
+ return __ceph_dir_is_complete(ceph_inode(inode));
}
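
The pair of atomics above replaces the old CEPH_I_COMPLETE flag: __ceph_dir_set_complete() records the release count observed when a full readdir finished, callers bump i_release_count whenever the cached contents may no longer be complete, and the directory counts as complete only while the two counters still match. A tiny user-space model of that protocol (illustrative only, not the kernel helpers):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of the i_release_count / i_complete_count pair: "complete"
 * holds only while nothing has invalidated the cached directory
 * contents since the readdir that populated them. */
struct dir_state {
	atomic_int release_count;
	atomic_int complete_count;
};

static void dir_set_complete(struct dir_state *d, int release_count_seen)
{
	atomic_store(&d->complete_count, release_count_seen);
}

static void dir_clear_complete(struct dir_state *d)
{
	atomic_fetch_add(&d->release_count, 1);
}

static bool dir_is_complete(struct dir_state *d)
{
	return atomic_load(&d->complete_count) ==
	       atomic_load(&d->release_count);
}

int main(void)
{
	/* start out "not complete" */
	struct dir_state d = { ATOMIC_VAR_INIT(1), ATOMIC_VAR_INIT(0) };

	/* a full readdir completes: remember the release count it saw */
	dir_set_complete(&d, atomic_load(&d.release_count));
	printf("complete after readdir: %d\n", dir_is_complete(&d));
	/* something changes the directory, invalidating the cached view */
	dir_clear_complete(&d);
	printf("complete after change:  %d\n", dir_is_complete(&d));
	return 0;
}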
@@ -486,9 +503,9 @@ extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
static inline int ceph_caps_issued(struct ceph_inode_info *ci)
{
int issued;
- spin_lock(&ci->vfs_inode.i_lock);
+ spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
- spin_unlock(&ci->vfs_inode.i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return issued;
}
@@ -496,9 +513,9 @@ static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
int touch)
{
int r;
- spin_lock(&ci->vfs_inode.i_lock);
+ spin_lock(&ci->i_ceph_lock);
r = __ceph_caps_issued_mask(ci, mask, touch);
- spin_unlock(&ci->vfs_inode.i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return r;
}
@@ -506,8 +523,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
{
return ci->i_dirty_caps | ci->i_flushing_caps;
}
-extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask);
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
@@ -530,7 +549,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
@@ -543,20 +562,23 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
/*
* we keep buffered readdir results attached to file->private_data
*/
+#define CEPH_F_SYNC 1
+#define CEPH_F_ATEND 2
+
struct ceph_file_info {
- int fmode; /* initialized on open */
+ short fmode; /* initialized on open */
+ short flags; /* CEPH_F_* */
/* readdir: position within the dir */
u32 frag;
struct ceph_mds_request *last_readdir;
- int at_end;
/* readdir: position within a frag */
unsigned offset; /* offset of last chunk, adjusted for . and .. */
- u64 next_offset; /* offset of next chunk (last_name's + 1) */
+ unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */
- unsigned long dir_release_count;
+ int dir_release_count;
/* used for -o dirstat read() on directory thing */
char *dir_info;
@@ -584,9 +606,9 @@ struct ceph_snap_realm {
u64 parent_since; /* snapid when our current parent became so */
u64 *prior_parent_snaps; /* snaps inherited from any parents we */
- int num_prior_parent_snaps; /* had prior to parent_since */
+ u32 num_prior_parent_snaps; /* had prior to parent_since */
u64 *snaps; /* snaps specific to this realm */
- int num_snaps;
+ u32 num_snaps;
struct ceph_snap_realm *parent;
struct list_head children; /* list of child realms */
@@ -667,6 +689,7 @@ extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode);
+extern int ceph_drop_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
@@ -692,7 +715,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
extern void ceph_queue_writeback(struct inode *inode);
extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
+extern int ceph_permission(struct inode *inode, int mask);
extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
@@ -700,35 +723,75 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
/* xattr.c */
extern int ceph_setxattr(struct dentry *, const char *, const void *,
size_t, int);
+int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
+ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
+int __ceph_removexattr(struct dentry *, const char *);
extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern int ceph_removexattr(struct dentry *, const char *);
extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+extern void __init ceph_xattr_init(void);
+extern void ceph_xattr_exit(void);
+
+/* acl.c */
+extern const struct xattr_handler *ceph_xattr_handlers[];
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+
+struct posix_acl *ceph_get_acl(struct inode *, int);
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ceph_init_acl(struct dentry *, struct inode *, struct inode *);
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+ forget_all_cached_acls(inode);
+}
+
+#else
+
+#define ceph_get_acl NULL
+#define ceph_set_acl NULL
+
+static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode,
+ struct inode *dir)
+{
+ return 0;
+}
+
+static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
+{
+ return 0;
+}
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+}
+
+#endif
/* caps.c */
extern const char *ceph_cap_string(int c);
extern void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg);
-extern int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned cap, unsigned seq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation);
-extern void __ceph_remove_cap(struct ceph_cap *cap);
-static inline void ceph_remove_cap(struct ceph_cap *cap)
-{
- struct inode *inode = &cap->ci->vfs_inode;
- spin_lock(&inode->i_lock);
- __ceph_remove_cap(cap);
- spin_unlock(&inode->i_lock);
-}
+extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx);
+extern void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned cap, unsigned seq, u64 realmino, int flags,
+ struct ceph_cap **new_cap);
+extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
+extern int ceph_is_any_caps(struct inode *inode);
+extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
+ u64 cap_id, u32 migrate_seq, u32 issue_seq);
extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
-extern int ceph_fsync(struct file *file, int datasync);
+extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
@@ -768,17 +831,11 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
/* file.c */
extern const struct file_operations ceph_file_fops;
extern const struct address_space_operations ceph_aops;
-extern int ceph_copy_to_page_vector(struct page **pages,
- const char *data,
- loff_t off, size_t len);
-extern int ceph_copy_from_page_vector(struct page **pages,
- char *data,
- loff_t off, size_t len);
-extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+
extern int ceph_open(struct inode *inode, struct file *file);
-extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd, int mode,
- int locked_dir);
+extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags, umode_t mode,
+ int *opened);
extern int ceph_release(struct inode *inode, struct file *filp);
/* dir.c */
@@ -788,6 +845,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
ceph_snapdir_dentry_ops;
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
+extern int ceph_handle_snapdir(struct ceph_mds_request *req,
+ struct dentry *dentry, int err);
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
@@ -795,7 +854,8 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
extern void ceph_dentry_lru_touch(struct dentry *dn);
extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
-extern unsigned ceph_dentry_hash(struct dentry *dn);
+extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
+extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
/*
* our d_ops vary depending on whether the inode is live,
@@ -811,21 +871,19 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
extern const struct export_operations ceph_export_ops;
/* locks.c */
+extern __init void ceph_flock_init(void);
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
-extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
- int p_locks, int f_locks);
+extern int ceph_encode_locks_to_buffer(struct inode *inode,
+ struct ceph_filelock *flocks,
+ int num_fcntl_locks,
+ int num_flock_locks);
+extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+ struct ceph_pagelist *pagelist,
+ int num_fcntl_locks, int num_flock_locks);
extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
-static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
-{
- if (dentry && dentry->d_parent)
- return dentry->d_parent->d_inode;
-
- return NULL;
-}
-
/* debugfs.c */
extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 8c9eba6ef9d..c9c2b887381 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -6,13 +6,33 @@
#include <linux/ceph/decode.h>
#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>
+#define XATTR_CEPH_PREFIX "ceph."
+#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+
+static int __remove_xattr(struct ceph_inode_info *ci,
+ struct ceph_inode_xattr *xattr);
+
+/*
+ * List of handlers for synthetic system.* attributes. Other
+ * attributes are handled directly.
+ */
+const struct xattr_handler *ceph_xattr_handlers[] = {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ NULL,
+};
+
static bool ceph_is_valid_xattr(const char *name)
{
- return !strncmp(name, "ceph.", 5) ||
+ return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
!strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) ||
+ !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
@@ -21,101 +41,235 @@ static bool ceph_is_valid_xattr(const char *name)
* These define virtual xattrs exposing the recursive directory
* statistics and layout metadata.
*/
-struct ceph_vxattr_cb {
- bool readonly;
+struct ceph_vxattr {
char *name;
+ size_t name_size; /* strlen(name) + 1 (for '\0') */
size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
size_t size);
+ bool readonly, hidden;
+ bool (*exists_cb)(struct ceph_inode_info *ci);
};
+/* layouts */
+
+static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
+{
+ size_t s;
+ char *p = (char *)&ci->i_layout;
+
+ for (s = 0; s < sizeof(ci->i_layout); s++, p++)
+ if (*p)
+ return true;
+ return false;
+}
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ int ret;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ const char *pool_name;
+ char buf[128];
+
+ dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+ down_read(&osdc->map_sem);
+ pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+ if (pool_name) {
+ size_t len = strlen(pool_name);
+ ret = snprintf(buf, sizeof(buf),
+ "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout),
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ if (!size) {
+ ret += len;
+ } else if (ret + len > size) {
+ ret = -ERANGE;
+ } else {
+ memcpy(val, buf, ret);
+ memcpy(val + ret, pool_name, len);
+ ret += len;
+ }
+ } else {
+ ret = snprintf(buf, sizeof(buf),
+ "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout),
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+ (unsigned long long)pool);
+ if (size) {
+ if (ret <= size)
+ memcpy(val, buf, ret);
+ else
+ ret = -ERANGE;
+ }
+ }
+ up_read(&osdc->map_sem);
+ return ret;
+}
+
+static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%lld",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%lld",
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%lld",
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ const char *pool_name;
+
+ down_read(&osdc->map_sem);
+ pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+ if (pool_name)
+ ret = snprintf(val, size, "%s", pool_name);
+ else
+ ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+ up_read(&osdc->map_sem);
+ return ret;
+}
+
/* directories */
-static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
}
-static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_files);
}
-static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_subdirs);
}
-static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
}
-static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_rfiles);
}
-static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_rsubdirs);
}
-static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
size_t size)
{
return snprintf(val, size, "%lld", ci->i_rbytes);
}
-static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
size_t size)
{
- return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
+ return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,
(long)ci->i_rctime.tv_nsec);
}
-static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
- { true, "ceph.dir.entries", ceph_vxattrcb_entries},
- { true, "ceph.dir.files", ceph_vxattrcb_files},
- { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
- { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
- { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
- { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
- { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
- { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
- { true, NULL, NULL }
-};
-/* files */
+#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
+#define CEPH_XATTR_NAME2(_type, _name, _name2) \
+ XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- int ret;
+#define XATTR_NAME_CEPH(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .readonly = true, \
+ .hidden = false, \
+ .exists_cb = NULL, \
+ }
+#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
+ { \
+ .name = CEPH_XATTR_NAME2(_type, _name, _field), \
+ .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
+ .readonly = false, \
+ .hidden = true, \
+ .exists_cb = ceph_vxattrcb_layout_exists, \
+ }
- ret = snprintf(val, size,
- "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
- if (ceph_file_layout_pg_preferred(ci->i_layout))
- ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
- (unsigned long long)ceph_file_layout_pg_preferred(
- ci->i_layout));
- return ret;
-}
+static struct ceph_vxattr ceph_dir_vxattrs[] = {
+ {
+ .name = "ceph.dir.layout",
+ .name_size = sizeof("ceph.dir.layout"),
+ .getxattr_cb = ceph_vxattrcb_layout,
+ .readonly = false,
+ .hidden = true,
+ .exists_cb = ceph_vxattrcb_layout_exists,
+ },
+ XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
+ XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
+ XATTR_LAYOUT_FIELD(dir, layout, object_size),
+ XATTR_LAYOUT_FIELD(dir, layout, pool),
+ XATTR_NAME_CEPH(dir, entries),
+ XATTR_NAME_CEPH(dir, files),
+ XATTR_NAME_CEPH(dir, subdirs),
+ XATTR_NAME_CEPH(dir, rentries),
+ XATTR_NAME_CEPH(dir, rfiles),
+ XATTR_NAME_CEPH(dir, rsubdirs),
+ XATTR_NAME_CEPH(dir, rbytes),
+ XATTR_NAME_CEPH(dir, rctime),
+ { .name = NULL, 0 } /* Required table terminator */
+};
+static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
+
+/* files */
-static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
- { true, "ceph.layout", ceph_vxattrcb_layout},
- { NULL, NULL }
+static struct ceph_vxattr ceph_file_vxattrs[] = {
+ {
+ .name = "ceph.file.layout",
+ .name_size = sizeof("ceph.file.layout"),
+ .getxattr_cb = ceph_vxattrcb_layout,
+ .readonly = false,
+ .hidden = true,
+ .exists_cb = ceph_vxattrcb_layout_exists,
+ },
+ XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
+ XATTR_LAYOUT_FIELD(file, layout, stripe_count),
+ XATTR_LAYOUT_FIELD(file, layout, object_size),
+ XATTR_LAYOUT_FIELD(file, layout, pool),
+ { .name = NULL, 0 } /* Required table terminator */
};
+static size_t ceph_file_vxattrs_name_size; /* total size of all names */
-static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
{
if (S_ISDIR(inode->i_mode))
return ceph_dir_vxattrs;
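
The macros above build each virtual xattr's name and its size at compile time via string pasting, so the per-table totals summed in ceph_xattr_init() below already include every visible name's trailing '\0' and need no run-time strlen. For illustration, XATTR_NAME_CEPH(dir, entries) produces an entry equivalent to:

	{
		.name        = "ceph.dir.entries",
		.name_size   = sizeof("ceph.dir.entries"),	/* 17, with the '\0' */
		.getxattr_cb = ceph_vxattrcb_dir_entries,
		.readonly    = true,
		.hidden      = false,
		.exists_cb   = NULL,
	}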
@@ -124,22 +278,67 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
return NULL;
}
-static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+ if (vxattrs == ceph_dir_vxattrs)
+ return ceph_dir_vxattrs_name_size;
+ if (vxattrs == ceph_file_vxattrs)
+ return ceph_file_vxattrs_name_size;
+ BUG();
+
+ return 0;
+}
+
+/*
+ * Compute the aggregate size (including terminating '\0') of all
+ * virtual extended attribute names in the given vxattr table.
+ */
+static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+ struct ceph_vxattr *vxattr;
+ size_t size = 0;
+
+ for (vxattr = vxattrs; vxattr->name; vxattr++)
+ if (!vxattr->hidden)
+ size += vxattr->name_size;
+
+ return size;
+}
+
+/* Routines called at initialization and exit time */
+
+void __init ceph_xattr_init(void)
+{
+ ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
+ ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
+}
+
+void ceph_xattr_exit(void)
+{
+ ceph_dir_vxattrs_name_size = 0;
+ ceph_file_vxattrs_name_size = 0;
+}
+
+static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
const char *name)
{
- do {
- if (strcmp(vxattr->name, name) == 0)
- return vxattr;
- vxattr++;
- } while (vxattr->name);
+ struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
+
+ if (vxattr) {
+ while (vxattr->name) {
+ if (!strcmp(vxattr->name, name))
+ return vxattr;
+ vxattr++;
+ }
+ }
+
return NULL;
}
static int __set_xattr(struct ceph_inode_info *ci,
const char *name, int name_len,
const char *val, int val_len,
- int dirty,
- int should_free_name, int should_free_val,
+ int flags, int update_xattr,
struct ceph_inode_xattr **newxattr)
{
struct rb_node **p;
@@ -168,12 +367,31 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr = NULL;
}
+ if (update_xattr) {
+ int err = 0;
+ if (xattr && (flags & XATTR_CREATE))
+ err = -EEXIST;
+ else if (!xattr && (flags & XATTR_REPLACE))
+ err = -ENODATA;
+ if (err) {
+ kfree(name);
+ kfree(val);
+ return err;
+ }
+ if (update_xattr < 0) {
+ if (xattr)
+ __remove_xattr(ci, xattr);
+ kfree(name);
+ return 0;
+ }
+ }
+
if (!xattr) {
new = 1;
xattr = *newxattr;
xattr->name = name;
xattr->name_len = name_len;
- xattr->should_free_name = should_free_name;
+ xattr->should_free_name = update_xattr;
ci->i_xattrs.count++;
dout("__set_xattr count=%d\n", ci->i_xattrs.count);
@@ -183,7 +401,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (xattr->should_free_val)
kfree((void *)xattr->val);
- if (should_free_name) {
+ if (update_xattr) {
kfree((void *)name);
name = xattr->name;
}
@@ -198,8 +416,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr->val = "";
xattr->val_len = val_len;
- xattr->dirty = dirty;
- xattr->should_free_val = (val && should_free_val);
+ xattr->dirty = update_xattr;
+ xattr->should_free_val = (val && update_xattr);
if (new) {
rb_link_node(&xattr->node, parent, p);
@@ -261,7 +479,7 @@ static int __remove_xattr(struct ceph_inode_info *ci,
struct ceph_inode_xattr *xattr)
{
if (!xattr)
- return -EOPNOTSUPP;
+ return -ENODATA;
rb_erase(&xattr->node, &ci->i_xattrs.index);
@@ -343,8 +561,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
}
static int __build_xattrs(struct inode *inode)
- __releases(inode->i_lock)
- __acquires(inode->i_lock)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
u32 namelen;
u32 numattr = 0;
@@ -372,7 +590,7 @@ start:
end = p + ci->i_xattrs.blob->vec.iov_len;
ceph_decode_32_safe(&p, end, numattr, bad);
xattr_version = ci->i_xattrs.version;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
GFP_NOFS);
@@ -387,12 +605,13 @@ start:
goto bad_lock;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.version != xattr_version) {
/* lost a race, retry */
for (i = 0; i < numattr; i++)
kfree(xattrs[i]);
kfree(xattrs);
+ xattrs = NULL;
goto start;
}
err = -EIO;
@@ -406,7 +625,7 @@ start:
p += len;
err = __set_xattr(ci, name, namelen, val, len,
- 0, 0, 0, &xattrs[numattr]);
+ 0, 0, &xattrs[numattr]);
if (err < 0)
goto bad;
@@ -418,7 +637,7 @@ start:
return err;
bad_lock:
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
bad:
if (xattrs) {
for (i = 0; i < numattr; i++)
@@ -495,24 +714,25 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
}
}
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
int err;
struct ceph_inode_xattr *xattr;
- struct ceph_vxattr_cb *vxattr = NULL;
+ struct ceph_vxattr *vxattr = NULL;
if (!ceph_is_valid_xattr(name))
return -ENODATA;
/* let's see if a virtual xattr was requested */
- if (vxattrs)
- vxattr = ceph_match_vxattr(vxattrs, name);
+ vxattr = ceph_match_vxattr(inode, name);
+ if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
+ err = vxattr->getxattr_cb(ci, value, size);
+ return err;
+ }
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
@@ -520,19 +740,14 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr;
} else {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
/* get xattrs from mds (if we don't already have them) */
err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
if (err)
return err;
}
- spin_lock(&inode->i_lock);
-
- if (vxattr && vxattr->readonly) {
- err = vxattr->getxattr_cb(ci, value, size);
- goto out;
- }
+ spin_lock(&ci->i_ceph_lock);
err = __build_xattrs(inode);
if (err < 0)
@@ -541,11 +756,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
get_xattr:
err = -ENODATA; /* == ENOATTR */
xattr = __get_xattr(ci, name);
- if (!xattr) {
- if (vxattr)
- err = vxattr->getxattr_cb(ci, value, size);
+ if (!xattr)
goto out;
- }
err = -ERANGE;
if (size && size < xattr->val_len)
@@ -558,22 +770,31 @@ get_xattr:
memcpy(value, xattr->val, xattr->val_len);
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return err;
}
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ size_t size)
+{
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, value, size);
+
+ return __ceph_getxattr(dentry->d_inode, name, value, size);
+}
+
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
u32 vir_namelen = 0;
u32 namelen;
int err;
u32 len;
int i;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
@@ -581,45 +802,53 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto list_xattr;
} else {
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
if (err)
return err;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
err = __build_xattrs(inode);
if (err < 0)
goto out;
list_xattr:
- vir_namelen = 0;
- /* include virtual dir xattrs */
- if (vxattrs)
- for (i = 0; vxattrs[i].name; i++)
- vir_namelen += strlen(vxattrs[i].name) + 1;
+ /*
+ * Start with virtual dir xattr names (if any) (including
+ * terminating '\0' characters for each).
+ */
+ vir_namelen = ceph_vxattrs_name_size(vxattrs);
+
/* adding 1 byte per each variable due to the null termination */
- namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+ namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
err = -ERANGE;
- if (size && namelen > size)
+ if (size && vir_namelen + namelen > size)
goto out;
- err = namelen;
+ err = namelen + vir_namelen;
if (size == 0)
goto out;
names = __copy_xattr_names(ci, names);
/* virtual xattr names, too */
- if (vxattrs)
+ err = namelen;
+ if (vxattrs) {
for (i = 0; vxattrs[i].name; i++) {
- len = sprintf(names, "%s", vxattrs[i].name);
- names += len + 1;
+ if (!vxattrs[i].hidden &&
+ !(vxattrs[i].exists_cb &&
+ !vxattrs[i].exists_cb(ci))) {
+ len = sprintf(names, "%s", vxattrs[i].name);
+ names += len + 1;
+ err += len + 1;
+ }
}
+ }
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
return err;
}
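
Both getxattr and listxattr above honor the usual size-probe convention: a zero size reports how large a buffer is needed, and a non-zero size that is too small returns -ERANGE, with the non-hidden virtual names counted into the listxattr total. A user-space sketch of the resulting probe-then-fetch pattern against one of the virtual attributes (error handling kept minimal):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	ssize_t probe, len;
	char *buf;

	/* size == 0 probes how much room the value needs */
	probe = getxattr(path, "ceph.dir.rbytes", NULL, 0);
	if (probe < 0) {
		perror("getxattr");
		return 1;
	}

	/* one spare byte: the virtual-attr callbacks are snprintf-based */
	buf = malloc(probe + 2);
	if (!buf)
		return 1;

	len = getxattr(path, "ceph.dir.rbytes", buf, probe + 1);
	if (len < 0 || len > probe) {	/* error, or value grew meanwhile */
		free(buf);
		return 1;
	}
	buf[len] = '\0';
	printf("recursive bytes under %s: %s\n", path, buf);
	free(buf);
	return 0;
}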
@@ -629,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode = dentry->d_parent->d_inode;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
int err;
@@ -658,6 +886,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
dout("setxattr value=%.*s\n", (int)size, value);
+ if (!value)
+ flags |= CEPH_XATTR_REMOVE;
+
/* do request */
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
USE_AUTH_MDS);
@@ -665,7 +896,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
err = PTR_ERR(req);
goto out;
}
- req->r_inode = igrab(inode);
+ req->r_inode = inode;
+ ihold(inode);
req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
req->r_num_caps = 1;
req->r_args.setxattr.flags = cpu_to_le32(flags);
@@ -676,7 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
req->r_data_len = size;
dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
@@ -689,33 +921,32 @@ out:
return err;
}
-int ceph_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int __ceph_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
+ struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ int issued;
int err;
+ int dirty = 0;
int name_len = strlen(name);
int val_len = size;
char *newname = NULL;
char *newval = NULL;
struct ceph_inode_xattr *xattr = NULL;
- int issued;
int required_blob_size;
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
- if (vxattrs) {
- struct ceph_vxattr_cb *vxattr =
- ceph_match_vxattr(vxattrs, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
- }
+ vxattr = ceph_match_vxattr(inode, name);
+ if (vxattr && vxattr->readonly)
+ return -EOPNOTSUPP;
+
+ /* pass any unhandled ceph.* xattrs through to the MDS */
+ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto do_sync_unlocked;
/* preallocate memory for xattr name, value, index node */
err = -ENOMEM;
@@ -724,20 +955,19 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
goto out;
if (val_len) {
- newval = kmalloc(val_len + 1, GFP_NOFS);
+ newval = kmemdup(value, val_len, GFP_NOFS);
if (!newval)
goto out;
- memcpy(newval, value, val_len);
- newval[val_len] = '\0';
}
xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
if (!xattr)
goto out;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
+ dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
if (!(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
__build_xattrs(inode);
@@ -746,32 +976,37 @@ retry:
if (!ci->i_xattrs.prealloc_blob ||
required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
- struct ceph_buffer *blob = NULL;
+ struct ceph_buffer *blob;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
goto out;
- spin_lock(&inode->i_lock);
+ spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
ci->i_xattrs.prealloc_blob = blob;
goto retry;
}
- dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
- err = __set_xattr(ci, newname, name_len, newval,
- val_len, 1, 1, 1, &xattr);
- __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
- ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
- spin_unlock(&inode->i_lock);
+ err = __set_xattr(ci, newname, name_len, newval, val_len,
+ flags, value ? 1 : -1, &xattr);
+ if (!err) {
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+ ci->i_xattrs.dirty = true;
+ inode->i_ctime = CURRENT_TIME;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
return err;
do_sync:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
err = ceph_sync_setxattr(dentry, name, value, size, flags);
out:
kfree(newname);
@@ -780,12 +1015,23 @@ out:
return err;
}
+int ceph_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, size, flags);
+
+ return __ceph_setxattr(dentry, name, value, size, flags);
+}
+
static int ceph_send_removexattr(struct dentry *dentry, const char *name)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = dentry->d_inode;
- struct inode *parent_inode = dentry->d_parent->d_inode;
struct ceph_mds_request *req;
int err;
@@ -793,56 +1039,90 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
- req->r_inode = igrab(inode);
+ req->r_inode = inode;
+ ihold(inode);
req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
req->r_num_caps = 1;
req->r_path2 = kstrdup(name, GFP_NOFS);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
-int ceph_removexattr(struct dentry *dentry, const char *name)
+int __ceph_removexattr(struct dentry *dentry, const char *name)
{
struct inode *inode = dentry->d_inode;
+ struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
int issued;
int err;
-
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
+ int required_blob_size;
+ int dirty;
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
- if (vxattrs) {
- struct ceph_vxattr_cb *vxattr =
- ceph_match_vxattr(vxattrs, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
- }
+ vxattr = ceph_match_vxattr(inode, name);
+ if (vxattr && vxattr->readonly)
+ return -EOPNOTSUPP;
- spin_lock(&inode->i_lock);
- __build_xattrs(inode);
+ /* pass any unhandled ceph.* xattrs through to the MDS */
+ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto do_sync_unlocked;
+
+ err = -ENOMEM;
+ spin_lock(&ci->i_ceph_lock);
+retry:
issued = __ceph_caps_issued(ci, NULL);
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
if (!(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
+ __build_xattrs(inode);
+
+ required_blob_size = __get_required_blob_size(ci, 0, 0);
+
+ if (!ci->i_xattrs.prealloc_blob ||
+ required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+ struct ceph_buffer *blob;
+
+ spin_unlock(&ci->i_ceph_lock);
+ dout(" preaallocating new blob size=%d\n", required_blob_size);
+ blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+ if (!blob)
+ goto out;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_xattrs.prealloc_blob)
+ ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ci->i_xattrs.prealloc_blob = blob;
+ goto retry;
+ }
err = __remove_xattr_by_name(ceph_inode(inode), name);
- __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME;
-
- spin_unlock(&inode->i_lock);
-
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
return err;
do_sync:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
err = ceph_send_removexattr(dentry, name);
+out:
return err;
}
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+ if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
+ return __ceph_removexattr(dentry, name);
+}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index dca9e5e0f73..f77f7702fab 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -272,7 +272,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
cd = __register_chrdev_region(major, baseminor, count, name);
if (IS_ERR(cd))
return PTR_ERR(cd);
-
+
cdev = cdev_alloc();
if (!cdev)
goto out2;
@@ -280,7 +280,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
cdev->owner = fops->owner;
cdev->ops = fops;
kobject_set_name(&cdev->kobj, "%s", name);
-
+
err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
if (err)
goto out;
@@ -368,6 +368,7 @@ void cdev_put(struct cdev *p)
*/
static int chrdev_open(struct inode *inode, struct file *filp)
{
+ const struct file_operations *fops;
struct cdev *p;
struct cdev *new = NULL;
int ret = 0;
@@ -400,12 +401,13 @@ static int chrdev_open(struct inode *inode, struct file *filp)
return ret;
ret = -ENXIO;
- filp->f_op = fops_get(p->ops);
- if (!filp->f_op)
+ fops = fops_get(p->ops);
+ if (!fops)
goto out_cdev_put;
+ replace_fops(filp, fops);
if (filp->f_op->open) {
- ret = filp->f_op->open(inode,filp);
+ ret = filp->f_op->open(inode, filp);
if (ret)
goto out_cdev_put;
}
@@ -471,9 +473,19 @@ static int exact_lock(dev_t dev, void *data)
*/
int cdev_add(struct cdev *p, dev_t dev, unsigned count)
{
+ int error;
+
p->dev = dev;
p->count = count;
- return kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p);
+
+ error = kobj_map(cdev_map, dev, count, NULL,
+ exact_match, exact_lock, p);
+ if (error)
+ return error;
+
+ kobject_get(p->kobj.parent);
+
+ return 0;
}
static void cdev_unmap(dev_t dev, unsigned count)
@@ -498,14 +510,20 @@ void cdev_del(struct cdev *p)
static void cdev_default_release(struct kobject *kobj)
{
struct cdev *p = container_of(kobj, struct cdev, kobj);
+ struct kobject *parent = kobj->parent;
+
cdev_purge(p);
+ kobject_put(parent);
}
static void cdev_dynamic_release(struct kobject *kobj)
{
struct cdev *p = container_of(kobj, struct cdev, kobj);
+ struct kobject *parent = kobj->parent;
+
cdev_purge(p);
kfree(p);
+ kobject_put(parent);
}
static struct kobj_type ktype_cdev_default = {
@@ -558,7 +576,8 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
void __init chrdev_init(void)
{
cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
- bdi_init(&directly_mappable_cdev_bdi);
+ if (bdi_init(&directly_mappable_cdev_bdi))
+ panic("Failed to init directly mappable cdev bdi");
}
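For context on the cdev_add()/cdev_del() changes above: with this patch, cdev_add() also takes a reference on the cdev's parent kobject, which is dropped again in the release callbacks, so the structure owning the cdev cannot go away while the device node is still open. A driver's registration sequence does not change; the following is only a sketch of ordinary usage (the mydev_* names and MYDEV_COUNT are made up for illustration and are not part of this patch):

#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/module.h>

#define MYDEV_COUNT 1			/* illustrative */

static dev_t mydev_base;
static struct cdev mydev_cdev;

static const struct file_operations mydev_fops = {
	.owner = THIS_MODULE,
};

static int __init mydev_init(void)
{
	int err;

	err = alloc_chrdev_region(&mydev_base, 0, MYDEV_COUNT, "mydev");
	if (err)
		return err;

	cdev_init(&mydev_cdev, &mydev_fops);
	mydev_cdev.owner = THIS_MODULE;

	/* With the patch above, cdev_add() also pins the cdev's parent
	 * kobject; the reference is dropped in the cdev release callbacks. */
	err = cdev_add(&mydev_cdev, mydev_base, MYDEV_COUNT);
	if (err)
		unregister_chrdev_region(mydev_base, MYDEV_COUNT);
	return err;
}

static void __exit mydev_exit(void)
{
	cdev_del(&mydev_cdev);
	unregister_chrdev_region(mydev_base, MYDEV_COUNT);
}

module_init(mydev_init);
module_exit(mydev_exit);
MODULE_LICENSE("GPL");

The error handling mirrors what __register_chrdev does in the hunk above: if cdev_add() fails, the reserved region is released and the error is returned.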
diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS
deleted file mode 100644
index 7f7fa3c302a..00000000000
--- a/fs/cifs/AUTHORS
+++ /dev/null
@@ -1,55 +0,0 @@
-Original Author
-===============
-Steve French (sfrench@samba.org)
-
-The author wishes to express his appreciation and thanks to:
-Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS
-improvements. Thanks to IBM for allowing me time and test resources to pursue
-this project, to Jim McDonough from IBM (and the Samba Team) for his help, to
-the IBM Linux JFS team for explaining many esoteric Linux filesystem features.
-Jeremy Allison of the Samba team has done invaluable work in adding the server
-side of the original CIFS Unix extensions and reviewing and implementing
-portions of the newer CIFS POSIX extensions into the Samba 3 file server. Thank
-Dave Boutcher of IBM Rochester (author of the OS/400 smb/cifs filesystem client)
-for proving years ago that very good smb/cifs clients could be done on Unix-like
-operating systems. Volker Lendecke, Andrew Tridgell, Urban Widmark, John
-Newbigin and others for their work on the Linux smbfs module. Thanks to
-the other members of the Storage Network Industry Association CIFS Technical
-Workgroup for their work specifying this highly complex protocol and finally
-thanks to the Samba team for their technical advice and encouragement.
-
-Patch Contributors
-------------------
-Zwane Mwaikambo
-Andi Kleen
-Amrut Joshi
-Shobhit Dayal
-Sergey Vlasov
-Richard Hughes
-Yury Umanets
-Mark Hamzy (for some of the early cifs IPv6 work)
-Domen Puncer
-Jesper Juhl (in particular for lots of whitespace/formatting cleanup)
-Vince Negri and Dave Stahl (for finding an important caching bug)
-Adrian Bunk (kcalloc cleanups)
-Miklos Szeredi
-Kazeon team for various fixes especially for 2.4 version.
-Asser Ferno (Change Notify support)
-Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup
-Gunter Kukkukk (testing and suggestions for support of old servers)
-Igor Mammedov (DFS support)
-Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code)
-
-Test case and Bug Report contributors
--------------------------------------
-Thanks to those in the community who have submitted detailed bug reports
-and debug of problems they have found: Jochen Dolze, David Blaine,
-Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori,
-Lars Muller, Urban Widmark, Massimiliano Ferrero, Howard Owen,
-Olaf Kirch, Kieron Briggs, Nick Millington and others. Also special
-mention to the Stanford Checker (SWAT) which pointed out many minor
-bugs in error paths. Valuable suggestions also have come from Al Viro
-and Dave Miller.
-
-And thanks to the IBM LTC and Power test teams and SuSE testers for
-finding multiple bugs during excellent stress test runs.
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
deleted file mode 100644
index bc0025cdd1c..00000000000
--- a/fs/cifs/CHANGES
+++ /dev/null
@@ -1,1065 +0,0 @@
-Version 1.62
-------------
-Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
-to more strictly handle corrupt frames.
-
-Version 1.61
-------------
-Fix append problem to Samba servers (files opened with O_APPEND could
-have duplicated data). Fix oops in cifs_lookup. Workaround problem
-mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
-Disable use of server inode numbers when server only
-partially supports them (e.g. for one server querying inode numbers on
-FindFirst fails but QPathInfo queries works). Fix oops with dfs in
-cifs_put_smb_ses. Fix mmap to work on directio mounts (needed
-for OpenOffice when on forcedirectio mount e.g.)
-
-Version 1.60
--------------
-Fix memory leak in reconnect. Fix oops in DFS mount error path.
-Set s_maxbytes to smaller (the max that vfs can handle) so that
-sendfile will now work over cifs mounts again. Add noforcegid
-and noforceuid mount parameters. Fix small mem leak when using
-ntlmv2. Fix 2nd mount to same server but with different port to
-be allowed (rather than reusing the 1st port) - only when the
-user explicitly overrides the port on the 2nd mount.
-
-Version 1.59
-------------
-Client uses server inode numbers (which are persistent) rather than
-client generated ones by default (mount option "serverino" turned
-on by default if server supports it). Add forceuid and forcegid
-mount options (so that when negotiating unix extensions specifying
-which uid mounted does not immediately force the server's reported
-uids to be overridden). Add support for scope mount parm. Improve
-hard link detection to use same inode for both. Do not set
-read-only dos attribute on directories (for chmod) since Windows
-explorer special cases this attribute bit for directories for
-a different purpose.
-
-Version 1.58
-------------
-Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
-when the UTF-8 string is composed of unusually long (more than 4 byte) converted
-characters. Add support for mounting root of a share which redirects immediately
-to DFS target. Convert string conversion functions from Unicode to more
-accurately mark string length before allocating memory (which may help the
-rare cases where a UTF-8 string is much larger than the UCS2 string that
-we converted from). Fix endianness of the vcnum field used during
-session setup to distinguish multiple mounts to same server from different
-userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
-flag to be set to 2, and mount must enable krb5 to turn on extended security).
-Performance of file create to Samba improved (posix create on lookup
-removes 1 of 2 network requests sent on file create)
-
-Version 1.57
-------------
-Improve support for multiple security contexts to the same server. We
-used to use the same "vcnumber" for all connections which could cause
-the server to treat subsequent connections, especially those that
-are authenticated as guest, as reconnections, invalidating the earlier
-user's smb session. This fix allows cifs to mount multiple times to the
-same server with different userids without risking invalidating earlier
-established security contexts. fsync now sends SMB Flush operation
-to better ensure that we wait for server to write all of the data to
-server disk (not just write it over the network). Add new mount
-parameter to allow user to disable sending the (slow) SMB flush on
-fsync if desired (fsync still flushes all cached write data to the server).
-Posix file open support added (turned off after one attempt if server
-fails to support it properly, as with Samba server versions prior to 3.3.2)
-Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
-little memory for the "nativeFileSystem" field returned by the server
-during mount). Endian convert inode numbers if necessary (makes it easier
-to compare inode numbers on network files from big endian systems).
-
-Version 1.56
-------------
-Add "forcemandatorylock" mount option to allow user to use mandatory
-rather than posix (advisory) byte range locks, even though server would
-support posix byte range locks. Fix query of root inode when prefixpath
-specified and user does not have access to query information about the
-top of the share. Fix problem in 2.6.28 resolving DFS paths to
-Samba servers (worked to Windows). Fix rmdir so that pending search
-(readdir) requests do not get invalid results which include the now
-removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
-when using DFS. Add better file create support to servers which support
-the CIFS POSIX protocol extensions (this adds support for new flags
-on create, and improves semantics for write of locked ranges).
-
-Version 1.55
-------------
-Various fixes to make delete of open files behavior more predictable
-(when delete of an open file fails we mark the file as "delete-on-close"
-in a way that more servers accept, but only if we can first rename the
-file to a temporary name). Add experimental support for more safely
-handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp
-sends, and also let tcp autotune the socket send and receive buffers.
-This reduces the number of EAGAIN errors returned by TCP/IP in
-high stress workloads (and the number of retries on socket writes
-when sending large SMBWriteX requests). Fix case in which a portion of
-data can in some cases not get written to the file on the server before the
-file is closed. Fix DFS parsing to properly handle path consumed field,
-and to handle certain codepage conversions better. Fix mount and
-umount race that can cause oops in mount or umount or reconnect.
-
-Version 1.54
-------------
-Fix premature write failure on congested networks (we would give up
-on EAGAIN from the socket too quickly on large writes).
-Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
-Fix endian problems in acl (mode from/to cifs acl) on bigendian
-architectures. Fix problems with preserving timestamps on copying open
-files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit
-on parent directory when server supports Unix Extensions but not POSIX
-create. Update cifs.upcall version to handle new Kerberos sec flags
-(this requires update of cifs.upcall program from Samba). Fix memory leak
-on dns_upcall (resolving DFS referrals). Fix plain text password
-authentication (requires setting SecurityFlags to 0x30030 to enable
-lanman and plain text though). Fix writes to be at correct offset when
-file is open with O_APPEND and file is on a directio (forcedirectio) mount.
-Fix bug in rewinding readdir directory searches. Add nodfs mount option.
-
-Version 1.53
-------------
-DFS support added (Microsoft Distributed File System client support needed
-for referrals which enable a hierarchical name space among servers).
-Disable temporary caching of mode bits to servers which do not support
-storing of mode (e.g. Windows servers, when client mounts without cifsacl
-mount option) and add new "dynperm" mount option to enable temporary caching
-of mode (enable old behavior). Fix hang on mount caused when server crashes
-tcp session during negotiate protocol.
-
-Version 1.52
-------------
-Fix oops on second mount to server when null auth is used.
-Enable experimental Kerberos support. Return writebehind errors on flush
-and sync so that events like out of disk space get reported properly on
-cached files. Fix setxattr failure to certain Samba versions. Fix mount
-of second share to disconnected server session (autoreconnect on this).
-Add ability to modify cifs acls for handling chmod (when mounted with
-cifsacl flag). Fix prefixpath path separator so we can handle mounts
-with prefixpaths longer than one directory (one path component) when
-mounted to Windows servers. Fix slow file open when cifsacl
-enabled. Fix memory leak in FindNext when the SMB call returns -EBADF.
-
-
-Version 1.51
-------------
-Fix memory leak in statfs when mounted to very old servers (e.g.
-Windows 9x). Add new feature "POSIX open" which allows servers
-which support the current POSIX Extensions to provide better semantics
-(e.g. delete for open files opened with posix open). Take into
-account umask on posix mkdir not just older style mkdir. Add
-ability to mount to IPC$ share (which allows CIFS named pipes to be
-opened, read and written as if they were files). When 1st tree
-connect fails (e.g. due to signing negotiation failure) fix
-leak that causes cifsd not to stop and rmmod to fail to cleanup
-cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on
-bigendian architectures. Fix possible memory corruption when
-EAGAIN returned on kern_recvmsg. Return better error if server
-requires packet signing but client has disabled it. When mounted
-with cifsacl mount option - mode bits are approximated based
-on the contents of the ACL of the file or directory. When cifs
-mount helper is missing convert make sure that UNC name
-has backslash (not forward slash) between ip address of server
-and the share name.
-
-Version 1.50
-------------
-Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is
-done with "serverino" mount option). Add support for POSIX Unlink
-(helps with certain sharing violation cases when server such as
-Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix"
-mount option to allow disabling the CIFS Unix Extensions for just
-that mount. Fix hang on spinlock in find_writable_file (race when
-reopening file after session crash). Byte range unlock request to
-windows server could unlock more bytes (on server copy of file)
-than intended if start of unlock request is well before start of
-a previous byte range lock that we issued.
-
-Version 1.49
-------------
-IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6
-address after the "ip=" mount option, at least until mount.cifs is fixed to
-handle DNS host to ipv6 name translation). Accept override of uid or gid
-on mount even when Unix Extensions are negotiated (it used to be ignored
-when Unix Extensions were ignored). This allows users to override the
-default uid and gid for files when they are certain that the uids or
-gids on the server do not match those of the client. Make "sec=none"
-mount override username (so that null user connection is attempted)
-to match what documentation said. Support for very large reads, over 127K,
-available to some newer servers (such as Samba 3.0.26 and later but
-note that it also requires setting CIFSMaxBufSize at module install
-time to a larger value which may hurt performance in some cases).
-Make sign option force signing (or fail if server does not support it).
-
-Version 1.48
-------------
-Fix mtime bouncing around from local idea of last write times to remote time.
-Fix hang (in i_size_read) when simultaneous size update of same remote file
-on smp system corrupts sequence number. Do not reread unnecessarily partial page
-(which we are about to overwrite anyway) when writing out file opened rw.
-When DOS attribute of file on non-Unix server's file changes on the server side
-from read-only back to read-write, reflect this change in default file mode
-(we had been leaving a file's mode read-only until the inode were reloaded).
-Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
-when archive dos attribute not set and we are changing mode back to writeable
-on server which does not support the Unix Extensions). Remove read only dos
-attribute on chmod when adding any write permission (ie on any of
-user/group/other (not all of user/group/other ie 0222) when
-mounted to windows. Add support for POSIX MkDir (slight performance
-enhancement and eliminates the network race between the mkdir and set
-path info of the mode).
-
-
-Version 1.47
-------------
-Fix oops in list_del during mount caused by unaligned string.
-Fix file corruption which could occur on some large file
-copies caused by writepages page i/o completion bug.
-Seek to SEEK_END forces check for update of file size for non-cached
-files. Allow file size to be updated on remote extend of locally open,
-non-cached file. Fix reconnect to newer Samba servers (or other servers
-which support the CIFS Unix/POSIX extensions) so that we again tell the
-server the Unix/POSIX cifs capabilities which we support (SetFSInfo).
-Add experimental support for new POSIX Open/Mkdir (which returns
-stat information on the open, and allows setting the mode).
-
-Version 1.46
-------------
-Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps.
-Allow null user to be specified on mount ("username="). Do not return
-EINVAL on readdir when filldir fails due to overwritten blocksize
-(fixes FC problem). Return error in rename 2nd attempt retry (ie report
-if rename by handle also fails, after rename by path fails, we were
-not reporting whether the retry worked or not). Fix NTLMv2 to
-work to Windows servers (mount with option "sec=ntlmv2").
-
-Version 1.45
-------------
-Do not time out lockw calls when using posix extensions. Do not
-time out requests if server still responding reasonably fast
-on requests on other threads. Improve POSIX locking emulation,
-(lock cancel now works, and unlock of merged range works even
-to Windows servers now). Fix oops on mount to lanman servers
-(win9x, os/2 etc.) when null password. Do not send listxattr
-(SMB to query all EAs) if nouser_xattr specified. Fix SE Linux
-problem (instantiate inodes/dentries in right order for readdir).
-
-Version 1.44
-------------
-Rewritten sessionsetup support, including support for legacy SMB
-session setup needed for OS/2 and older servers such as Windows 95 and 98.
-Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst
-so we can do search (ls etc.) to OS/2. Do not send NTCreateX
-or recent levels of FindFirst unless server says it supports NT SMBs
-(instead use legacy equivalents from LANMAN dialect). Fix to allow
-NTLMv2 authentication support (now can use stronger password hashing
-on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004).
-Allow override of global cifs security flags on mount via "sec=" option(s).
-
-Version 1.43
-------------
-POSIX locking to servers which support CIFS POSIX Extensions
-(disabled by default controlled by proc/fs/cifs/Experimental).
-Handle conversion of long share names (especially Asian languages)
-to Unicode during mount. Fix memory leak in sess struct on reconnect.
-Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on
-cifs open which helps rare case when setpathinfo fails or server does
-not support it.
-
-Version 1.42
-------------
-Fix slow oplock break when mounted to different servers at the same time and
-the tids match and we try to find matching fid on wrong server. Fix read
-looping when signing required by server (2.6.16 kernel only). Fix readdir
-vs. rename race which could cause each to hang. Return . and .. even
-if server does not. Allow searches to skip first three entries and
-begin at any location. Fix oops in find_writeable_file.
-
-Version 1.41
-------------
-Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can
-configure stronger authentication. Fix sfu symlinks so they can
-be followed (not just recognized). Fix wraparound of bcc on
-read responses when buffer size over 64K and also fix wrap of
-max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in
-cifs_user_read and cifs_readpages (when EAGAIN on send of smb
-on socket is returned over and over). Add POSIX (advisory) byte range
-locking support (requires server with newest CIFS UNIX Extensions
-to the protocol implemented). Slow down negprot slightly in port 139
-RFC1001 case to give session_init time on buggy servers.
-
-Version 1.40
-------------
-Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance
-of readpages by eliminating one extra memcpy. Allow update of file size
-from remote server even if file is open for write as long as mount is
-directio. Recognize share mode security and send NTLM encrypted password
-on tree connect if share mode negotiated.
-
-Version 1.39
-------------
-Defer close of a file handle slightly if pending writes depend on that handle
-(this reduces the EBADF bad file handle errors that can be logged under heavy
-stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2
-Fix SFU style symlinks and mknod needed for servers which do not support the
-CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative
-dentries so files that the client sees as deleted but that later get created
-on the server will be recognized. Add client side permission check on setattr.
-Timeout stuck requests better (where server has never responded or sent corrupt
-responses)
-
-Version 1.38
-------------
-Fix tcp socket retransmission timeouts (e.g. on ENOSPACE from the socket)
-to be smaller at first (but increasing) so large write performance
-over GigE is better. Do not hang thread on illegal byte range lock response
-from Windows (Windows can send an RFC1001 size which does not match smb size) by
-allowing an SMBs TCP length to be up to a few bytes longer than it should be.
-wsize and rsize can now be larger than negotiated buffer size if server
-supports large readx/writex, even when directio mount flag not specified.
-Write size will in many cases now be 16K instead of 4K which greatly helps
-file copy performance on lightly loaded networks. Fix oops in dnotify
-when experimental config flag enabled. Make cifsFYI more granular.
-
-Version 1.37
-------------
-Fix readdir caching when unlink removes file in current search buffer,
-and this is followed by a rewind search to just before the deleted entry.
-Do not attempt to set ctime unless atime and/or mtime change requested
-(most servers throw it away anyway). Fix length check of received smbs
-to be more accurate. Fix big endian problem with mapchars mount option,
-and with a field returned by statfs.
-
-Version 1.36
-------------
-Add support for mounting to older pre-CIFS servers such as Windows9x and ME.
-For these older servers, add option for passing netbios name of server in
-on mount (servernetbiosname). Add suspend support for power management, to
-avoid cifsd thread preventing software suspend from working.
-Add mount option for disabling the default behavior of sending byte range lock
-requests to the server (necessary for certain applications which break with
-mandatory lock behavior such as Evolution), and also mount option for
-requesting case insensitive matching for path based requests (requesting
-case sensitive is the default).
-
-Version 1.35
-------------
-Add writepage performance improvements. Fix path name conversions
-for long filenames on mounts which were done with "mapchars" mount option
-specified. Ensure multiplex ids do not collide. Fix case in which
-rmmod can oops if done soon after last unmount. Fix truncated
-search (readdir) output when resume filename was a long filename.
-Fix filename conversion when mapchars mount option was specified and
-filename was a long filename.
-
-Version 1.34
-------------
-Fix error mapping of the TOO_MANY_LINKS (hardlinks) case.
-Do not oops if root user kills cifs oplock kernel thread or
-kills the cifsd thread (NB: killing the cifs kernel threads is not
-recommended, unmount and rmmod cifs will kill them when they are
-no longer needed). Fix readdir to ASCII servers (ie older servers
-which do not support Unicode) and also require asterisk.
-Fix out of memory case in which data could be written one page
-off in the page cache.
-
-Version 1.33
-------------
-Fix caching problem, in which readdir of directory containing a file
-which was cached could cause the file's time stamp to be updated
-without invalidating the readahead data (so we could get stale
-file data on the client for that file even as the server copy changed).
-Cleanup response processing so cifsd can not loop when abnormally
-terminated.
-
-
-Version 1.32
-------------
-Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one
-transact response for an SMB request and search entry split across two frames.
-Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server)
-as new protocol extensions. Do not send Get/Set calls for POSIX ACLs
-unless server explicitly claims to support them in CIFS Unix extensions
-POSIX ACL capability bit. Fix packet signing when multiuser mounting with
-different users from the same client to the same server. Fix oops in
-cifs_close. Add mount option for remapping reserved characters in
-filenames (also allow recognizing files with created by SFU which have any
-of these seven reserved characters, except backslash, to be recognized).
-Fix invalid transact2 message (we were sometimes trying to interpret
-oplock breaks as SMB responses). Add ioctl for checking that the
-current uid matches the uid of the mounter (needed by umount.cifs).
-Reduce the number of large buffer allocations in cifs response processing
-(significantly reduces memory pressure under heavy stress with multiple
-processes accessing the same server at the same time).
-
-Version 1.31
-------------
-Fix updates of DOS attributes and time fields so that files on NT4 servers
-do not get marked delete on close. Display sizes of cifs buffer pools in
-cifs stats. Fix oops in unmount when cifsd thread being killed by
-shutdown. Add generic readv/writev and aio support. Report inode numbers
-consistently in readdir and lookup (when serverino mount option is
-specified use the inode number that the server reports - for both lookup
-and readdir, otherwise by default the locally generated inode number is used
-for inodes created in either path since servers are not always able to
-provide unique inode numbers when exporting multiple volumes from under one
-sharename).
-
-Version 1.30
-------------
-Allow new nouser_xattr mount parm to disable xattr support for user namespace.
-Do not flag user_xattr mount parm in dmesg. Retry failures setting file time
-(mostly affects NT4 servers) by retry with handle based network operation.
-Add new POSIX Query FS Info for returning statfs info more accurately.
-Handle passwords with multiple commas in them.
-
-Version 1.29
-------------
-Fix default mode in sysfs of cifs module parms. Remove old readdir routine.
-Fix capabilities flags for large readx so as to allow reads larger than 64K.
-
-Version 1.28
-------------
-Add module init parm for large SMB buffer size (to allow it to be changed
-from its default of 16K) which is especially useful for large file copy
-when mounting with the directio mount option. Fix oops after
-returning from mount when experimental ExtendedSecurity enabled and
-SpnegoNegotiated returning invalid error. Fix case to retry better when
-peek returns from 1 to 3 bytes on socket which should have more data.
-Fixed path based calls (such as cifs lookup) to handle path names
-longer than 530 (now can handle PATH_MAX). Fix pass through authentication
-from Samba server to DC (Samba required dummy LM password).
-
-Version 1.27
-------------
-Turn off DNOTIFY (directory change notification support) by default
-(unless built with the experimental flag) to fix hang with KDE
-file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event
-waiting on an SMB response) in SendReceive when session dies but
-reconnects quickly from another task. Add module init parms for
-minimum number of large and small network buffers in the buffer pools,
-and for the maximum number of simultaneous requests.
-
-Version 1.26
-------------
-Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later
-and other POSIX CIFS compliant servers. Fix error mapping for getfacl
-to EOPNOTSUPP when server does not support posix acls on the wire. Fix
-improperly zeroed buffer in CIFS Unix extensions set times call.
-
-Version 1.25
-------------
-Fix internationalization problem in cifs readdir with filenames that map to
-longer UTF-8 strings than the string on the wire was in Unicode. Add workaround
-for readdir to netapp servers. Fix search rewind (seek into readdir to return
-non-consecutive entries). Do not do readdir when server negotiates
-buffer size to small to fit filename. Add support for reading POSIX ACLs from
-the server (add also acl and noacl mount options).
-
-Version 1.24
-------------
-Optionally allow using server side inode numbers, rather than client generated
-ones by specifying mount option "serverino" - this is required for some apps
-to work which double check hardlinked files and have persistent inode numbers.
-
-Version 1.23
-------------
-Multiple bigendian fixes. On little endian systems (for reconnect after
-network failure) fix tcp session reconnect code so we do not try first
-to reconnect on reverse of port 445. Treat reparse points (NTFS junctions)
-as directories rather than symlinks because we can do follow link on them.
-
-Version 1.22
-------------
-Add config option to enable XATTR (extended attribute) support, mapping
-xattr names in the "user." namespace space to SMB/CIFS EAs. Lots of
-minor fixes pointed out by the Stanford SWAT checker (mostly missing
-or out of order NULL pointer checks in little used error paths).
-
-Version 1.21
-------------
-Add new mount parm to control whether mode check (generic_permission) is done
-on the client. If Unix extensions are enabled and the uids on the client
-and server do not match, client permission checks are meaningless on
-server uids that do not exist on the client (this does not affect the
-normal ACL check which occurs on the server). Fix default uid
-on mknod to match create and mkdir. Add optional mount parm to allow
-override of the default uid behavior (in which the server sets the uid
-and gid of newly created files). Normally for network filesystem mounts
-users want the server to set the uid/gid on newly created files (rather than
-using uid of the client processes you would in a local filesystem).
-
-Version 1.20
-------------
-Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps
-info into /proc/fs/cifs/DebugData. Fix oops in rare oops in readdir
-(in build_wildcard_path_from_dentry). Fix mknod to pass type field
-(block/char/fifo) properly. Remove spurious mount warning log entry when
-credentials passed as mount argument. Set major/minor device number in
-inode for block and char devices when unix extensions enabled.
-
-Version 1.19
-------------
-Fix /proc/fs/cifs/Stats and DebugData display to handle larger
-amounts of return data. Properly limit requests to MAX_REQ (50
-is the usual maximum active multiplex SMB/CIFS requests per server).
-Do not kill cifsd (and thus hurt the other SMB session) when more than one
-session to the same server (but with different userids) exists and one
-of the two user's smb sessions is being removed while leaving the other.
-Do not loop reconnecting in cifsd demultiplex thread when admin
-kills the thread without going through unmount.
-
-Version 1.18
-------------
-Do not rename hardlinked files (since that should be a noop). Flush
-cached write behind data when reopening a file after session abend,
-except when already in write. Grab per socket sem during reconnect
-to avoid oops in sendmsg if overlapping with reconnect. Do not
-reset cached inode file size on readdir for files open for write on
-client.
-
-
-Version 1.17
-------------
-Update number of blocks in file so du command is happier (in Linux a fake
-blocksize of 512 is required for calculating number of blocks in inode).
-Fix prepare write of partial pages to read in data from server if possible.
-Fix race on tcpStatus field between unmount and reconnection code, causing
-cifsd process sometimes to hang around forever. Improve out of memory
-checks in cifs_filldir
-
-Version 1.16
-------------
-Fix incorrect file size in file handle based setattr on big endian hardware.
-Fix oops in build_path_from_dentry when out of memory. Add checks for invalid
-and closing file structs in writepage/partialpagewrite. Add statistics
-for each mounted share (new menuconfig option). Fix endianness problem in
-volume information displayed in /proc/fs/cifs/DebugData (only affects
-big endian architectures). Prevent renames while constructing
-path names for open, mkdir and rmdir.
-
-Version 1.15
-------------
-Change to mempools for alloc smb request buffers and multiplex structs
-to better handle low memory problems (and potential deadlocks).
-
-Version 1.14
-------------
-Fix incomplete listings of large directories on Samba servers when Unix
-extensions enabled. Fix oops when smb_buffer can not be allocated. Fix
-rename deadlock when writing out dirty pages at same time.
-
-Version 1.13
-------------
-Fix open of files in which O_CREATE can cause the mode to change in
-some cases. Fix case in which retry of write overlaps file close.
-Fix PPC64 build error. Reduce excessive stack usage in smb password
-hashing. Fix overwrite of Linux user's view of file mode to Windows servers.
-
-Version 1.12
-------------
-Fixes for large file copy, signal handling, socket retry, buffer
-allocation and low memory situations.
-
-Version 1.11
-------------
-Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize)
-also now allowing support for specifying client netbiosname. NT4 support added.
-
-Version 1.10
-------------
-Fix reconnection (and certain failed mounts) to properly wake up the
-blocked users thread so it does not seem hung (in some cases was blocked
-until the cifs receive timeout expired). Fix spurious error logging
-to kernel log when application with open network files killed.
-
-Version 1.09
-------------
-Fix /proc/fs module unload warning message (that could be logged
-to the kernel log). Fix intermittent failure in connectathon
-test7 (hardlink count not immediately refreshed in case in which
-inode metadata can be incorrectly kept cached when time near zero)
-
-Version 1.08
-------------
-Allow file_mode and dir_mode (specified at mount time) to be enforced
-locally (the server already enforced its own ACLs too) for servers
-that do not report the correct mode (do not support the
-CIFS Unix Extensions).
-
-Version 1.07
-------------
-Fix some small memory leaks in some unmount error paths. Fix major leak
-of cache pages in readpages causing multiple read oriented stress
-testcases (including fsx, and even large file copy) to fail over time.
-
-Version 1.06
-------------
-Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server.
-This allows files that differ only in case and improves performance of file
-creation and file open to such servers. Fix semaphore conflict which causes
-slow delete of open file to Samba (which unfortunately can cause an oplock
-break to self while vfs_unlink held i_sem) which can hang for 20 seconds.
-
-Version 1.05
-------------
-fixes to cifs_readpages for fsx test case
-
-Version 1.04
-------------
-Fix caching data integrity bug when extending file size especially when no
-oplock on file. Fix spurious logging of valid already parsed mount options
-that are parsed outside of the cifs vfs such as nosuid.
-
-
-Version 1.03
-------------
-Connect to server when port number override not specified, and tcp port
-uninitialized. Reset search to restart at correct file when kernel routine
-filldir returns error during large directory searches (readdir).
-
-Version 1.02
-------------
-Fix caching problem when files opened by multiple clients in which
-page cache could contain stale data, and write through did
-not occur often enough while file was still open when read ahead
-(read oplock) not allowed. Treat "sep=" when first mount option
-as an override of comma as the default separator between mount
-options.
-
-Version 1.01
-------------
-Allow passwords longer than 16 bytes. Allow null password string.
-
-Version 1.00
-------------
-Gracefully clean up failed mounts when attempting to mount to servers such as
-Windows 98 that terminate tcp sessions during protocol negotiation. Handle
-embedded commas in mount parsing of passwords.
-
-Version 0.99
-------------
-Invalidate local inode cached pages on oplock break and when last file
-instance is closed so that the client does not continue using stale local
-copy rather than later modified server copy of file. Do not reconnect
-when server drops the tcp session prematurely before negotiate
-protocol response. Fix oops in reopen_file when dentry freed. Allow
-the support for CIFS Unix Extensions to be disabled via proc interface.
-
-Version 0.98
-------------
-Fix hang in commit_write during reconnection of open files under heavy load.
-Fix unload_nls oops in a mount failure path. Serialize writes to same socket
-which also fixes any possible races when cifs signatures are enabled in SMBs
-being sent out of signature sequence number order.
-
-Version 0.97
-------------
-Fix byte range locking bug (endian problem) causing bad offset and
-length.
-
-Version 0.96
-------------
-Fix oops (in send_sig) caused by CIFS unmount code trying to
-wake up the demultiplex thread after it had exited. Do not log
-error on harmless oplock release of closed handle.
-
-Version 0.95
-------------
-Fix unsafe global variable usage and password hash failure on gcc 3.3.1
-Fix problem reconnecting secondary mounts to same server after session
-failure. Fix invalid dentry - race in mkdir when directory gets created
-by another client between the lookup and mkdir.
-
-Version 0.94
-------------
-Fix to list processing in reopen_files. Fix reconnection when server hung
-but tcpip session still alive. Set proper timeout on socket read.
-
-Version 0.93
-------------
-Add missing mount options including iocharset. SMP fixes in write and open.
-Fix errors in reconnecting after TCP session failure. Fix module unloading
-of default nls codepage
-
-Version 0.92
-------------
-Active smb transactions should never go negative (fix double FreeXid). Fix
-list processing in file routines. Check return code on kmalloc in open.
-Fix spinlock usage for SMP.
-
-Version 0.91
-------------
-Fix oops in reopen_files when invalid dentry. drop dentry on server rename
-and on revalidate errors. Fix cases where pid is now tgid. Fix return code
-on create hard link when server does not support them.
-
-Version 0.90
-------------
-Fix scheduling while atomic error in getting inode info on newly created file.
-Fix truncate of existing files opened with O_CREAT but not O_TRUNC set.
-
-Version 0.89
-------------
-Fix oops on write to dead tcp session. Remove error log write for case when file open
-O_CREAT but not O_EXCL
-
-Version 0.88
-------------
-Fix non-POSIX behavior on rename of open file and delete of open file by taking
-advantage of trans2 SetFileInfo rename facility if available on target server.
-Retry on ENOSPC and EAGAIN socket errors.
-
-Version 0.87
-------------
-Fix oops on big endian readdir. Set blksize to be even power of two (2**blkbits) to fix
-allocation size miscalculation. After oplock token lost do not read through
-cache.
-
-Version 0.86
-------------
-Fix oops on empty file readahead. Fix for file size handling for locally cached files.
-
-Version 0.85
-------------
-Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files
-during auto reconnection to server after server recovered from failure.
-
-Version 0.84
-------------
-Finish support for Linux 2.5 open/create changes, which removes the
-redundant NTCreate/QPathInfo/close that was sent during file create.
-Enable oplock by default. Enable packet signing by default (needed to
-access many recent Windows servers)
-
-Version 0.83
-------------
-Fix oops when mounting to long server names caused by inverted parms to kmalloc.
-Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled
-we will choose a cifs user session (smb uid) that better matches the local
-uid if a) the mount uid does not match the current uid and b) we have another
-session to the same server (ip address) for a different mount which
-matches the current local uid.
-
-Version 0.82
-------------
-Add support for mknod of block or character devices. Fix oplock
-code (distributed caching) to properly send response to oplock
-break from server.
-
-Version 0.81
-------------
-Finish up CIFS packet digital signing for the default
-NTLM security case. This should help Windows 2003
-network interoperability since it is common for
-packet signing to be required now. Fix statfs (stat -f)
-which recently started returning errors due to
-invalid value (-1 instead of 0) being set in the
-struct kstatfs f_ffiles field.
-
-Version 0.80
------------
-Fix oops on stopping oplock thread when removing cifs when
-built as module.
-
-Version 0.79
-------------
-Fix mount options for ro (readonly), uid, gid and file and directory mode.
-
-Version 0.78
-------------
-Fix errors displayed on failed mounts to be more understandable.
-Fixed various incorrect or misleading smb to posix error code mappings.
-
-Version 0.77
-------------
-Fix display of NTFS DFS junctions to display as symlinks.
-They are the network equivalent. Fix oops in
-cifs_partialpagewrite caused by missing spinlock protection
-of openfile linked list. Allow writebehind caching errors to
-be returned to the application at file close.
-
-Version 0.76
-------------
-Clean up options displayed in /proc/mounts by show_options to
-be more consistent with other filesystems.
-
-Version 0.75
-------------
-Fix delete of readonly file to Windows servers. Reflect
-presence or absence of read only dos attribute in mode
-bits for servers that do not support CIFS Unix extensions.
-Fix shortened results on readdir of large directories to
-servers supporting CIFS Unix extensions (caused by
-incorrect resume key).
-
-Version 0.74
-------------
-Fix truncate bug (set file size) that could cause hangs e.g. running fsx
-
-Version 0.73
-------------
-unload nls if mount fails.
-
-Version 0.72
-------------
-Add resume key support to search (readdir) code to workaround
-Windows bug. Add /proc/fs/cifs/LookupCacheEnable which
-allows disabling caching of attribute information for
-lookups.
-
-Version 0.71
-------------
-Add more oplock handling (distributed caching code). Remove
-dead code. Remove excessive stack space utilization from
-symlink routines.
-
-Version 0.70
-------------
-Fix oops in get dfs referral (triggered when null path sent in to
-mount). Add support for overriding rsize at mount time.
-
-Version 0.69
-------------
-Fix buffer overrun in readdir which caused intermittent kernel oopses.
-Fix writepage code to release kmap on write data. Allow "-ip=" new
-mount option to be passed in on parameter distinct from the first part
-(server name portion of) the UNC name. Allow override of the
-tcp port of the target server via new mount option "-port="
-
-Version 0.68
-------------
-Fix search handle leak on rewind. Fix setuid and gid so that they are
-reflected in the local inode immediately. Cleanup of whitespace
-to make 2.4 and 2.5 versions more consistent.
-
-
-Version 0.67
-------------
-Fix signal sending so that captive thread (cifsd) exits on umount
-(which was causing the warning in kmem_cache_free of the request buffers
-at rmmod time). This had broken as a sideeffect of the recent global
-kernel change to daemonize. Fix memory leak in readdir code which
-showed up in "ls -R" (and applications that did search rewinding).
-
-Version 0.66
-------------
-Reconnect tids and fids after session reconnection (still do not
-reconnect byte range locks though). Fix problem caching
-lookup information for directory inodes, improving performance,
-especially in deep directory trees. Fix various build warnings.
-
-Version 0.65
-------------
-Finish fixes to commit write for caching/readahead consistency. fsx
-now works to Samba servers. Fix oops caused when readahead
-was interrupted by a signal.
-
-Version 0.64
-------------
-Fix data corruption (in partial page after truncate) that caused fsx to
-fail to Windows servers. Cleaned up some extraneous error logging in
-common error paths. Add generic sendfile support.
-
-Version 0.63
-------------
-Fix memory leak in AllocMidQEntry.
-Finish reconnection logic, so connection with server can be dropped
-(or server rebooted) and the cifs client will reconnect.
-
-Version 0.62
-------------
-Fix temporary socket leak when bad userid or password specified
-(or other SMBSessSetup failure). Increase maximum buffer size to slightly
-over 16K to allow negotiation of up to Samba and Windows server default read
-sizes. Add support for readpages
-
-Version 0.61
-------------
-Fix oops when username not passed in on mount. Extensive fixes and improvements
-to error logging (strip redundant newlines, change debug macros to ensure newline
-passed in and to be more consistent). Fix writepage wrong file handle problem,
-a readonly file handle could be incorrectly used to attempt to write out
-file updates through the page cache to multiply open files. This could cause
-the iozone benchmark to fail on the fwrite test. Fix bug mounting two different
-shares to the same Windows server when using different usernames
-(doing this to Samba servers worked but Windows was rejecting it) - now it is
-possible to use different userids when connecting to the same server from a
-Linux client. Fix oops when treeDisconnect called during unmount on
-previously freed socket.
-
-Version 0.60
-------------
-Fix oops in readpages caused by not setting address space operations in inode in
-rare code path.
-
-Version 0.59
-------------
-Includes support for deleting of open files and renaming over existing files (per POSIX
-requirement). Add readlink support for Windows junction points (directory symlinks).
-
-Version 0.58
-------------
-Changed read and write to go through pagecache. Added additional address space operations.
-Memory mapped operations now working.
-
-Version 0.57
-------------
-Added writepage code for additional memory mapping support. Fixed leak in xids causing
-the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on
-every stat call. Additional formatting cleanup.
-
-Version 0.56
-------------
-Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup.
-
-Version 0.55
-------------
-Fixes from Zwane Mwaikambo for adding missing return code checking in a few places.
-Also included a modified version of his fix to protect global list manipulation of
-the smb session and tree connection and mid related global variables.
-
-Version 0.54
-------------
-Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre
-changes to superblock layout. Remove wasteful allocation of smb buffers (now the send
-buffer is reused for responses). Add more oplock handling. Additional minor cleanup.
-
-Version 0.53
-------------
-More stylistic updates to better match kernel style. Add additional statistics
-for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2
-and CIFS Packet Signing enablement.
-
-Version 0.52
-------------
-Replace call to sleep_on with safer wait_on_event.
-Make stylistic changes to better match kernel style recommendations.
-Remove most typedef usage (except for the PDUs themselves).
-
-Version 0.51
-------------
-Update mount so the -unc mount option is no longer required (the ip address can be specified
-in a UNC style device name. Implementation of readpage/writepage started.
-
-Version 0.50
-------------
-Fix intermittent problem with incorrect smb header checking on badly
-fragmented tcp responses
-
-Version 0.49
-------------
-Fixes to setting of allocation size and file size.
-
-Version 0.48
-------------
-Various 2.5.38 fixes. Now works on 2.5.38
-
-Version 0.47
-------------
-Prepare for 2.5 kernel merge. Remove ifdefs.
-
-Version 0.46
-------------
-Socket buffer management fixes. Fix dual free.
-
-Version 0.45
-------------
-Various big endian fixes for hardlinks and symlinks and also for dfs.
-
-Version 0.44
-------------
-Various big endian fixes for servers with Unix extensions such as Samba
-
-Version 0.43
-------------
-Various FindNext fixes for incorrect filenames on large directory searches on big endian
-clients. basic posix file i/o tests now work on big endian machines, not just le
-
-Version 0.42
-------------
-SessionSetup and NegotiateProtocol now work from Big Endian machines.
-Various Big Endian fixes found during testing on the Linux on 390. Various fixes for compatibility with older
-versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7).
-
-Version 0.41
-------------
-Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked
-files now return the correct number of links on fstat as they are repeatedly linked and unlinked.
-
-Version 0.40
-------------
-Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate
-session advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP.
-Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs
-(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos).
-
-Version 0.38
-------------
-Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable
-it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU).
-
-Version 0.37
-------------
-Rewrote much of connection and mount/unmount logic to handle bugs with
-multiple uses to same share, multiple users to same server etc.
-
-Version 0.36
-------------
-Fixed major problem with dentry corruption (missing call to dput)
-
-Version 0.35
-------------
-Rewrite of readdir code to fix bug. Various fixes for bigendian machines.
-Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs
-although corresponding function not fully implemented in the vfs yet
-
-Version 0.34
-------------
-Fixed dentry caching bug, misc. cleanup
-
-Version 0.33
-------------
-Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes. Now can build
-on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels.
-Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added.
-
-Version 0.32
-------------
-Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented
-and tested against Samba 2.2.5
-
-
-Version 0.31
-------------
-1) Fixed lockrange to be correct (it was one byte too short)
-
-2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly
-show range as locked when there is a conflict with an existing lock.
-
-3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories
-in most cases. Eventually will offer optional ability to query server for the correct perms.
-
-3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded
-but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb
-session)
-
-4) Fixed error logging of valid mount options
-
-5) Removed logging of password field.
-
-6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c
-and cleaned them up and made them more consistent with other cifs functions.
-
-7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways
-(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed,
-nor is the symlink support using the Unix extensions
-
-8) Started adding the readlink and follow_link code
-
-Version 0.3
------------
-Initial drop
-
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 7cb0f7f847e..603f18a65c1 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -7,13 +7,17 @@ config CIFS
select CRYPTO_MD5
select CRYPTO_HMAC
select CRYPTO_ARC4
+ select CRYPTO_ECB
+ select CRYPTO_DES
+ select CRYPTO_SHA256
+ select CRYPTO_CMAC
help
This is the client VFS module for the Common Internet File System
(CIFS) protocol which is the successor to the Server Message Block
(SMB) protocol, the native file sharing mechanism for most early
PC operating systems. The CIFS protocol is fully supported by
- file servers such as Windows 2000 (including Windows 2003, NT 4
- and Windows XP) as well by Samba (which provides excellent CIFS
+ file servers such as Windows 2000 (including Windows 2003, Windows 2008,
+ NT 4 and Windows XP) as well by Samba (which provides excellent CIFS
server support for Linux and many other operating systems). Limited
support for OS/2 and Windows ME and similar servers is provided as
well.
@@ -112,9 +116,24 @@ config CIFS_POSIX
(such as Samba 3.10 and later) which can negotiate
CIFS POSIX ACL support. If unsure, say N.
+config CIFS_ACL
+ bool "Provide CIFS ACL support"
+ depends on CIFS_XATTR && KEYS
+ help
+ Allows fetching CIFS/NTFS ACL from the server. The DACL blob
+ is handed over to the application/caller.
+
+config CIFS_DEBUG
+ bool "Enable CIFS debugging routines"
+ default y
+ depends on CIFS
+ help
+ Enabling this option adds helpful debugging messages to
+ the cifs code which increases the size of the cifs module.
+ If unsure, say Y.
config CIFS_DEBUG2
bool "Enable additional CIFS debugging routines"
- depends on CIFS
+ depends on CIFS_DEBUG
help
Enabling this option adds a few more debugging routines
to the cifs code which slightly increases the size of
@@ -136,32 +155,37 @@ config CIFS_DFS_UPCALL
IP addresses) which is needed for implicit mounts of DFS junction
points. If unsure, say N.
+config CIFS_NFSD_EXPORT
+ bool "Allow nfsd to export CIFS file system"
+ depends on CIFS && BROKEN
+ help
+ Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+
+config CIFS_SMB2
+ bool "SMB2 network file system support"
+ depends on CIFS && INET
+ select NLS
+ select KEYS
+ select FSCACHE
+ select DNS_RESOLVER
+
+ help
+ This enables experimental support for the SMB2 (Server Message Block
+ version 2) protocol. The SMB2 protocol is the successor to the
+ popular CIFS and SMB network file sharing protocols. SMB2 is the
+ native file sharing mechanism for recent versions of Windows
+ operating systems (since Vista). SMB2 enablement will eventually
+ allow users better performance, security and features, than would be
+ possible with cifs. Note that smb2 mount options also are simpler
+ (compared to cifs) due to protocol improvements.
+
+ Unless you are a developer or tester, say N.
+
config CIFS_FSCACHE
- bool "Provide CIFS client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Provide CIFS client caching support"
depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
help
Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
to be cached locally on disk through the general filesystem cache
manager. If unsure, say N.
-config CIFS_ACL
- bool "Provide CIFS ACL support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && CIFS_XATTR
- help
- Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
- is handed over to the application/caller.
-
-config CIFS_EXPERIMENTAL
- bool "CIFS Experimental Features (EXPERIMENTAL)"
- depends on CIFS && EXPERIMENTAL
- help
- Enables cifs features under testing. These features are
- experimental and currently include DFS support and directory
- change notification ie fcntl(F_DNOTIFY), as well as the upcall
- mechanism which will be used for Kerberos session negotiation
- and uid remapping. Some of these features also may depend on
- setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
- (which is disabled by default). See the file fs/cifs/README
- for more details. If unsure, say N.
-
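Regarding the new crypto selects at the top of the Kconfig hunk: CRYPTO_ECB and CRYPTO_DES cover the DES-based LANMAN/NTLM hashing that used to live in smbdes.c (dropped from the Makefile below), while CRYPTO_SHA256 and CRYPTO_CMAC are needed for SMB2/SMB3 packet signing. As a rough, hedged sketch of the primitive that signing builds on, here is how an HMAC-SHA256 can be computed through the kernel crypto shash API (this helper is illustrative only and is not taken from the cifs sources):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Hypothetical helper: HMAC-SHA256 over a buffer; 'out' must hold 32 bytes. */
static int example_hmac_sha256(const u8 *key, unsigned int keylen,
			       const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	/* The descriptor carries per-request state; size depends on the tfm. */
	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_NOFS);
	if (!desc) {
		err = -ENOMEM;
		goto out_free_tfm;
	}
	desc->tfm = tfm;

	err = crypto_shash_digest(desc, data, len, out);
	kfree(desc);
out_free_tfm:
	crypto_free_shash(tfm);
	return err;
}

Without the added selects, crypto_alloc_shash() for these algorithms would fail at runtime on kernels where the corresponding ciphers were not otherwise enabled, which is why they are now pulled in whenever CIFS is built.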
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index d87558448e3..1964d212ab0 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -4,9 +4,9 @@
obj-$(CONFIG_CIFS) += cifs.o
cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
- link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
+ link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
- readdir.o ioctl.o sess.o export.o
+ readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o
cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
@@ -15,3 +15,6 @@ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
+
+cifs-$(CONFIG_CIFS_SMB2) += smb2ops.o smb2maperror.o smb2transport.o \
+ smb2misc.o smb2pdu.o smb2inode.o smb2file.o
diff --git a/fs/cifs/README b/fs/cifs/README
deleted file mode 100644
index fe168359082..00000000000
--- a/fs/cifs/README
+++ /dev/null
@@ -1,773 +0,0 @@
-The CIFS VFS support for Linux supports many advanced network filesystem
-features such as hierarchical dfs like namespace, hardlinks, locking and more.
-It was designed to comply with the SNIA CIFS Technical Reference (which
-supersedes the 1992 X/Open SMB Standard) as well as to provide practical,
-best-practice interoperability with Windows 2000, Windows XP, Samba and equivalent
-servers. This code was developed in participation with the Protocol Freedom
-Information Foundation.
-
-Please see
- http://protocolfreedom.org/ and
- http://samba.org/samba/PFIF/
-for more details.
-
-
-For questions or bug reports please contact:
- sfrench@samba.org (sfrench@us.ibm.com)
-
-Build instructions:
-==================
-For Linux 2.4:
-1) Get the kernel source (e.g. from http://www.kernel.org)
-and download the cifs vfs source (see the project page
-at http://us1.samba.org/samba/Linux_CIFS_client.html)
-and change directory into the top of the kernel directory
-then patch the kernel (e.g. "patch -p1 < cifs_24.patch")
-to add the cifs vfs to your kernel configure options if
-it has not already been added (e.g. current SuSE and UL
-users do not need to apply the cifs_24.patch since the cifs vfs is
-already in the kernel configure menu) and then
-mkdir linux/fs/cifs and then copy the current cifs vfs files from
-the cifs download to your kernel build directory e.g.
-
- cp <cifs_download_dir>/fs/cifs/* to <kernel_download_dir>/fs/cifs
-
-2) make menuconfig (or make xconfig)
-3) select cifs from within the network filesystem choices
-4) save and exit
-5) make dep
-6) make modules (or "make" if CIFS VFS not to be built as a module)
-
-For Linux 2.6:
-1) Download the kernel (e.g. from http://www.kernel.org)
-and change directory into the top of the kernel directory tree
-(e.g. /usr/src/linux-2.5.73)
-2) make menuconfig (or make xconfig)
-3) select cifs from within the network filesystem choices
-4) save and exit
-5) make
-
-
-Installation instructions:
-=========================
-If you have built the CIFS vfs as module (successfully) simply
-type "make modules_install" (or if you prefer, manually copy the file to
-the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.o).
-
-If you have built the CIFS vfs into the kernel itself, follow the instructions
-for your distribution on how to install a new kernel (usually you
-would simply type "make install").
-
-If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on
-the CIFS VFS web site) copy it to the same directory in which mount.smbfs and
-similar files reside (usually /sbin). Although the helper software is not
-required, mount.cifs is recommended. Eventually the Samba 3.0 utility program
-"net" may also be helpful since it may someday provide easier mount syntax for
-users who are used to Windows e.g.
- net use <mount point> <UNC name or cifs URL>
-Note that running the Winbind pam/nss module (logon service) on all of your
-Linux clients is useful in mapping Uids and Gids consistently across the
-domain to the proper network user. The mount.cifs mount helper can be
-trivially built from Samba 3.0 or later source e.g. by executing:
-
- gcc samba/source/client/mount.cifs.c -o mount.cifs
-
-If cifs is built as a module, then the size and number of network buffers
-and maximum number of simultaneous requests to one server can be configured.
-Changing these from their defaults is not recommended. By executing
- modinfo kernel/fs/cifs/cifs.ko
-the list of configuration changes that can be made at module initialization
-time (by running insmod cifs.ko) can be seen.
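-
-For example, after listing the available parameters with modinfo, one of them
-(such as CIFSMaxBufSize, described further under rsize below) could be changed
-at module load time (illustrative value):
-
- insmod cifs.ko CIFSMaxBufSize=32768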
-
-Allowing User Mounts
-====================
-To permit users to mount and unmount over directories they own is possible
-with the cifs vfs. A way to enable such mounting is to mark the mount.cifs
-utility as suid (e.g. "chmod +s /sbin/mount.cifs"). Enabling users to
-umount shares they have mounted requires:
-1) mount.cifs version 1.4 or later
-2) an entry for the share in /etc/fstab indicating that a user may
-unmount it e.g.
-//server/usersharename /mnt/username cifs user 0 0
-
-Note that when the mount.cifs utility is run suid (allowing user mounts),
-in order to reduce risks, the "nosuid" mount flag is passed in on mount to
-disallow execution of an suid program mounted on the remote target.
-When mount is executed as root, nosuid is not passed in by default,
-and execution of suid programs on the remote target would be enabled
-by default. This can be changed, as with nfs and other filesystems,
-by simply specifying "nosuid" among the mount options. For user mounts
-though to be able to pass the suid flag to mount requires rebuilding
-mount.cifs with the following flag:
-
- gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs
-
-There is a corresponding manual page for cifs mounting in the Samba 3.0 and
-later source tree in docs/manpages/mount.cifs.8
-
-Allowing User Unmounts
-======================
-To permit users to unmount directories that they have user mounted (see above),
-the utility umount.cifs may be used. It may be invoked directly, or if
-umount.cifs is placed in /sbin, umount can invoke the cifs umount helper
-(at least for most versions of the umount utility) for umount of cifs
-mounts, unless umount is invoked with -i (which will avoid invoking a umount
-helper). As with mount.cifs, to enable user unmounts umount.cifs must be marked
-as suid (e.g. "chmod +s /sbin/umount.cifs") or equivalent (some distributions
-allow adding entries to a file to the /etc/permissions file to achieve the
-equivalent suid effect). For this utility to succeed the target path
-must be a cifs mount, and the uid of the current user must match the uid
-of the user who mounted the resource.
-
-Also note that the customary way of allowing user mounts and unmounts is
-(instead of using mount.cifs and umount.cifs as suid) to add a line
-to the file /etc/fstab for each //server/share you wish to mount, but
-this can become unwieldy when potential mount targets include many
-or unpredictable UNC names.
-
-Samba Considerations
-====================
-To get the maximum benefit from the CIFS VFS, we recommend using a server that
-supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or
-Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers.
-Note that uid, gid and file permissions will display default values if you do
-not have a server that supports the Unix extensions for CIFS (such as Samba
-2.2.5 or later). To enable the Unix CIFS Extensions in the Samba server, add
-the line:
-
- unix extensions = yes
-
-to your smb.conf file on the server. Note that the following smb.conf settings
-are also useful (on the Samba server) when the majority of clients are Unix or
-Linux:
-
- case sensitive = yes
- delete readonly = yes
- ea support = yes
-
-Note that server ea support is required for supporting xattrs from the Linux
-cifs client, and that EA support is present in later versions of Samba (e.g.
-3.0.6 and later). EA support also works in all versions of Windows, at least to
-shares on NTFS filesystems. Extended Attribute (xattr) support is an optional
-feature of most Linux filesystems which may require enabling via
-make menuconfig. Client support for extended attributes (user xattr) can be
-disabled on a per-mount basis by specifying "nouser_xattr" on mount.
-
-The CIFS client can get and set POSIX ACLs (getfacl, setfacl) to Samba servers
-version 3.10 and later. Setting POSIX ACLs requires enabling both XATTR and
-then POSIX support in the CIFS configuration options when building the cifs
-module. POSIX ACL support can be disabled on a per mount basis by specifying
-"noacl" on mount.
-
-Some administrators may want to change Samba's smb.conf "map archive" and
-"create mask" parameters from the default. Unless the create mask is changed
-newly created files can end up with an unnecessarily restrictive default mode,
-which may not be what you want, although if the CIFS Unix extensions are
-enabled on the server and client, subsequent setattr calls (e.g. chmod) can
-fix the mode. Note that creating special devices (mknod) remotely
-may require specifying a mkdev function to Samba if you are not using
-Samba 3.0.6 or later. For more information on these see the manual pages
-("man smb.conf") on the Samba server system. Note that the cifs vfs,
-unlike the smbfs vfs, does not read the smb.conf on the client system
-(the few optional settings are passed in on mount via -o parameters instead).
-Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete
-open files (required for strict POSIX compliance). Windows Servers already
-supported this feature. Samba server does not allow symlinks that refer to files
-outside of the share, so in Samba versions prior to 3.0.6, most symlinks to
-files with absolute paths (ie beginning with slash) such as:
- ln -s /mnt/foo bar
-would be forbidden. Samba 3.0.6 server or later includes the ability to create
-such symlinks safely by converting unsafe symlinks (ie symlinks to server
-files that are outside of the share) to a samba specific format on the server
-that is ignored by local server applications and non-cifs clients and that will
-not be traversed by the Samba server. This is opaque to the Linux client
-application using the cifs vfs. Absolute symlinks will work to Samba 3.0.5 or
-later, but only for remote clients using the CIFS Unix extensions, and will
-be invisible to Windows clients and typically will not affect local
-applications running on the same server as Samba.
-
-Use instructions:
-================
-Once the CIFS VFS support is built into the kernel or installed as a module
-(cifs.o), you can use mount syntax like the following to access Samba or Windows
-servers:
-
- mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword
-
-Before -o the option -v may be specified to make the mount.cifs
-mount helper display the mount steps more verbosely.
-After -o the following commonly used cifs vfs specific options
-are supported:
-
- user=<username>
- pass=<password>
- domain=<domain name>
-
-Other cifs mount options are described below. Use of TCP names (in addition to
-ip addresses) is available if the mount helper (mount.cifs) is installed. If
-you do not trust the server to which you are mounted, or if you do not have
-cifs signing enabled (and the physical network is insecure), consider use
-of the standard mount options "noexec" and "nosuid" to reduce the risk of
-running an altered binary on your local system (downloaded from a hostile server
-or altered by a hostile router).
-
-Although mounting using format corresponding to the CIFS URL specification is
-not possible in mount.cifs yet, it is possible to use an alternate format
-for the server and sharename (which is somewhat similar to NFS style mount
-syntax) instead of the more widely used UNC format (i.e. \\server\share):
- mount -t cifs tcp_name_of_server:share_name /mnt -o user=myname,pass=mypasswd
-
-When using the mount helper mount.cifs, passwords may be specified via alternate
-mechanisms, instead of specifying it after -o using the normal "pass=" syntax
-on the command line:
-1) By including it in a credential file. Specify credentials=filename as one
-of the mount options. Credential files contain two lines
- username=someuser
- password=your_password
-2) By specifying the password in the PASSWD environment variable (similarly
-the user name can be taken from the USER environment variable).
-3) By specifying the password in a file by name via PASSWD_FILE
-4) By specifying the password in a file by file descriptor via PASSWD_FD
-
-If no password is provided, mount.cifs will prompt for password entry
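-
-For example, a credentials file /root/creds (an illustrative path) containing
-the two lines shown above could be used as:
-
- mount -t cifs //server/share /mnt -o credentials=/root/creds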
-
-Restrictions
-============
-Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC
-1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a
-problem as most servers support this.
-
-Valid filenames differ between Windows and Linux. Windows typically restricts
-filenames which contain certain reserved characters (e.g. the character :
-which is used to delimit the beginning of a stream name by Windows), while
-Linux allows a slightly wider set of valid characters in filenames. Windows
-servers can remap such characters when an explicit mapping is specified in
-the Server's registry. Samba starting with version 3.10 will allow such
-filenames (ie those which contain valid Linux characters, which normally
-would be forbidden for Windows/CIFS semantics) as long as the server is
-configured for Unix Extensions (and the client has not disabled
-/proc/fs/cifs/LinuxExtensionsEnabled).
-
-
-CIFS VFS Mount Options
-======================
-A partial list of the supported mount options follows:
- user The user name to use when trying to establish
- the CIFS session.
- password The user password. If the mount helper is
- installed, the user will be prompted for password
- if not supplied.
- ip The ip address of the target server
- unc The target server Universal Network Name (export) to
- mount.
- domain Set the SMB/CIFS workgroup name prepended to the
- username during CIFS session establishment
- forceuid Set the default uid for inodes to the uid
- passed in on mount. For mounts to servers
- which do support the CIFS Unix extensions, such as a
- properly configured Samba server, the server provides
- the uid, gid and mode so this parameter should not be
- specified unless the server and client uid and gid
- numbering differ. If the server and client are in the
- same domain (e.g. running winbind or nss_ldap) and
- the server supports the Unix Extensions then the uid
- and gid can be retrieved from the server (and uid
- and gid would not have to be specified on the mount).
- For servers which do not support the CIFS Unix
- extensions, the default uid (and gid) returned on lookup
- of existing files will be the uid (gid) of the person
- who executed the mount (root, except when mount.cifs
- is configured setuid for user mounts) unless the "uid="
- (gid) mount option is specified. Also note that permission
- checks (authorization checks) on accesses to a file occur
- at the server, but there are cases in which an administrator
- may want to restrict at the client as well. For those
- servers which do not report a uid/gid owner
- (such as Windows), permissions can also be checked at the
- client, and a crude form of client side permission checking
- can be enabled by specifying file_mode and dir_mode on
- the client. (default)
- forcegid (similar to above but for the groupid instead of uid) (default)
- noforceuid Fill in file owner information (uid) by requesting it from
- the server if possible. With this option, the value given in
- the uid= option (on mount) will only be used if the server
- can not support returning uids on inodes.
- noforcegid (similar to above but for the group owner, gid, instead of uid)
- uid Set the default uid for inodes, and indicate to the
- cifs kernel driver which local user mounted. If the server
- supports the unix extensions the default uid is
- not used to fill in the owner fields of inodes (files)
- unless the "forceuid" parameter is specified.
- gid Set the default gid for inodes (similar to above).
- file_mode If CIFS Unix extensions are not supported by the server
- this overrides the default mode for file inodes.
- fsc Enable local disk caching using FS-Cache (off by default). This
- option could be useful to improve performance on a slow link,
- heavily loaded server and/or network where reading from the
- disk is faster than reading from the server (over the network).
- This could also impact scalability positively as the
- number of calls to the server are reduced. However, local
- caching is not suitable for all workloads for e.g. read-once
- type workloads. So, you need to consider carefully your
- workload/scenario before using this option. Currently, local
- disk caching is functional for CIFS files opened as read-only.
- dir_mode If CIFS Unix extensions are not supported by the server
- this overrides the default mode for directory inodes.
- port attempt to contact the server on this tcp port, before
- trying the usual ports (port 445, then 139).
- iocharset Codepage used to convert local path names to and from
- Unicode. Unicode is used by default for network path
- names if the server supports it. If iocharset is
- not specified then the nls_default specified
- during the local client kernel build will be used.
- If server does not support Unicode, this parameter is
- unused.
- rsize default read size (usually 16K). The client currently
- can not use rsize larger than CIFSMaxBufSize. CIFSMaxBufSize
- defaults to 16K and may be changed (from 8K to the maximum
- kmalloc size allowed by your kernel) at module install time
- for cifs.ko. Setting CIFSMaxBufSize to a very large value
- will cause cifs to use more memory and may reduce performance
- in some cases. To use rsize greater than 127K (the original
- cifs protocol maximum) also requires that the server support
- a new Unix Capability flag (for very large read) which some
- newer servers (e.g. Samba 3.0.26 or later) do. rsize can be
- set from a minimum of 2048 to a maximum of 130048 (127K or
- CIFSMaxBufSize, whichever is smaller)
- wsize default write size (default 57344)
- maximum wsize currently allowed by CIFS is 57344 (fourteen
- 4096 byte pages)
- actimeo=n attribute cache timeout in seconds (default 1 second).
- After this timeout, the cifs client requests fresh attribute
- information from the server. This option allows tuning the
- attribute cache timeout to suit the workload needs. Shorter
- timeouts mean better cache coherency, but an increased number
- of calls to the server. Longer timeouts mean a reduced number
- of calls to the server at the expense of less strict cache
- coherency checks (i.e. incorrect attribute cache for a short
- period of time).
- rw mount the network share read-write (note that the
- server may still consider the share read-only)
- ro mount network share read-only
- version used to distinguish different versions of the
- mount helper utility (not typically needed)
- sep if first mount option (after the -o), overrides
- the comma as the separator between the mount
- parms. e.g.
- -o user=myname,password=mypassword,domain=mydom
- could be passed instead with period as the separator by
- -o sep=.user=myname.password=mypassword.domain=mydom
- this might be useful when comma is contained within username
- or password or domain. This option is less important
- when the cifs mount helper mount.cifs (version 1.1 or later)
- is used.
- nosuid Do not allow remote executables with the suid bit
- to be executed. This is only meaningful for mounts
- to servers such as Samba which support the CIFS Unix Extensions.
- If you do not trust the servers in your network (your mount
- targets) it is recommended that you specify this option for
- greater security.
- exec Permit execution of binaries on the mount.
- noexec Do not permit execution of binaries on the mount.
- dev Recognize block devices on the remote mount.
- nodev Do not recognize devices on the remote mount.
- suid Allow remote files on this mountpoint with suid enabled to
- be executed (default for mounts when executed as root,
- nosuid is default for user mounts).
- credentials Although ignored by the cifs kernel component, it is used by
- the mount helper, mount.cifs. When mount.cifs is installed it
- opens and reads the credential file specified in order
- to obtain the userid and password arguments which are passed to
- the cifs vfs.
- guest Although ignored by the kernel component, the mount.cifs
- mount helper will not prompt the user for a password
- if guest is specified on the mount options. If no
- password is specified a null password will be used.
- perm Client does permission checks (vfs_permission check of uid
- and gid of the file against the mode and desired operation),
- Note that this is in addition to the normal ACL check on the
- target machine done by the server software.
- Client permission checking is enabled by default.
- noperm Client does not do permission checks. This can expose
- files on this mount to access by other users on the local
- client system. It is typically only needed when the server
- supports the CIFS Unix Extensions but the UIDs/GIDs on the
- client and server system do not match closely enough to allow
- access by the user doing the mount, but it may be useful with
- non CIFS Unix Extension mounts for cases in which the default
- mode is specified on the mount but is not to be enforced on the
- client (e.g. perhaps when MultiUserMount is enabled)
- Note that this does not affect the normal ACL check on the
- target machine done by the server software (of the server
- ACL against the user name provided at mount time).
- serverino Use server's inode numbers instead of generating automatically
- incrementing inode numbers on the client. Although this will
- make it easier to spot hardlinked files (as they will have
- the same inode numbers) and inode numbers may be persistent,
- note that the server does not guarantee that the inode numbers
- are unique if multiple server side mounts are exported under a
- single share (since inode numbers on the servers might not
- be unique if multiple filesystems are mounted under the same
- shared higher level directory). Note that some older servers
- (e.g. pre-Windows 2000) do not support returning UniqueIDs
- or the CIFS Unix Extensions equivalent and for those
- this mount option will have no effect. Exporting cifs mounts
- under nfsd requires this mount option on the cifs mount.
- This is now the default if server supports the
- required network operation.
- noserverino Client generates inode numbers (rather than using the actual one
- from the server). These inode numbers will vary after
- unmount or reboot which can confuse some applications,
- but not all server filesystems support unique inode
- numbers.
- setuids If the CIFS Unix extensions are negotiated with the server
- the client will attempt to set the effective uid and gid of
- the local process on newly created files, directories, and
- devices (create, mkdir, mknod). If the CIFS Unix Extensions
- are not negotiated, for newly created files and directories
- instead of using the default uid and gid specified on
- the mount, cache the new file's uid and gid locally which means
- that the uid for the file can change when the inode is
- reloaded (or the user remounts the share).
- nosetuids The client will not attempt to set the uid and gid on
- newly created files, directories, and devices (create,
- mkdir, mknod) which will result in the server setting the
- uid and gid to the default (usually the server uid of the
- user who mounted the share). Letting the server (rather than
- the client) set the uid and gid is the default. If the CIFS
- Unix Extensions are not negotiated then the uid and gid for
- new files will appear to be the uid (gid) of the mounter or the
- uid (gid) parameter specified on the mount.
- netbiosname When mounting to servers via port 139, specifies the RFC1001
- source name to use to represent the client netbios machine
- name when doing the RFC1001 netbios session initialize.
- direct Do not do inode data caching on files opened on this mount.
- This precludes mmapping files on this mount. In some cases
- with fast networks and little or no caching benefits on the
- client (e.g. when the application is doing large sequential
- reads bigger than page size without rereading the same data)
- this can provide better performance than the default
- behavior which caches reads (readahead) and writes
- (writebehind) through the local Linux client pagecache
- if oplock (caching token) is granted and held. Note that
- direct allows write operations larger than page size
- to be sent to the server.
- strictcache Use for switching on strict cache mode. In this mode the
- client reads from the cache as long as it holds an Oplock Level II;
- otherwise it reads from the server. All written data are stored
- in the cache, but if the client doesn't have an Exclusive Oplock,
- it writes the data to the server.
- acl Allow setfacl and getfacl to manage posix ACLs if server
- supports them. (default)
- noacl Do not allow setfacl and getfacl calls on this mount
- user_xattr Allow getting and setting user xattrs (those attributes whose
- name begins with "user." or "os2.") as OS/2 EAs (extended
- attributes) to the server. This allows support of the
- setfattr and getfattr utilities. (default)
- nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs
- mapchars Translate six of the seven reserved characters (not backslash)
- *?<>|:
- to the remap range (above 0xF000), which also
- allows the CIFS client to recognize files created with
- such characters by Windows's POSIX emulation. This can
- also be useful when mounting to most versions of Samba
- (which also forbids creating and opening files
- whose names contain any of these seven characters).
- This has no effect if the server does not support
- Unicode on the wire.
- nomapchars Do not translate any of these seven characters (default).
- nocase Request case insensitive path name matching (case
- sensitive is the default if the server supports it).
- (mount option "ignorecase" is identical to "nocase")
- posixpaths If CIFS Unix extensions are supported, attempt to
- negotiate posix path name support which allows certain
- characters forbidden in typical CIFS filenames, without
- requiring remapping. (default)
- noposixpaths If CIFS Unix extensions are supported, do not request
- posix path name support (this may cause servers to
- reject creating files with certain reserved characters).
- nounix Disable the CIFS Unix Extensions for this mount (tree
- connection). This is rarely needed, but it may be useful
- in order to turn off multiple settings all at once (ie
- posix acls, posix locks, posix paths, symlink support
- and retrieving uids/gids/mode from the server) or to
- work around a bug in servers which implement the Unix
- Extensions.
- nobrl Do not send byte range lock requests to the server.
- This is necessary for certain applications that break
- with cifs style mandatory byte range locks (and most
- cifs servers do not yet support requesting advisory
- byte range locks).
- forcemandatorylock Even if the server supports posix (advisory) byte range
- locking, send only mandatory lock requests. Some
- (presumably rare) applications, originally coded for
- DOS/Windows and requiring Windows style mandatory byte range
- locking, may be able to take advantage of this option,
- forcing the cifs client to only send mandatory locks
- even if the cifs server would support posix advisory locks.
- "forcemand" is accepted as a shorter form of this mount
- option.
- nostrictsync If this mount option is set, when an application does an
- fsync call then the cifs client does not send an SMB Flush
- to the server (to force the server to write all dirty data
- for this file immediately to disk), although cifs still sends
- all dirty (cached) file data to the server and waits for the
- server to respond to the write. Since SMB Flush can be
- very slow, and some servers may be reliable enough (to risk
- delaying slightly flushing the data to disk on the server),
- turning on this option may be useful to improve performance for
- applications that fsync too much, at a small risk of server
- crash. If this mount option is not set, by default cifs will
- send an SMB flush request (and wait for a response) on every
- fsync call.
- nodfs Disable DFS (global name space support) even if the
- server claims to support it. This can help work around
- a problem with parsing of DFS paths with Samba server
- versions 3.0.24 and 3.0.25.
- remount remount the share (often used to change from ro to rw mounts
- or vice versa)
- cifsacl Report mode bits (e.g. on stat) based on the Windows ACL for
- the file. (EXPERIMENTAL)
- servern Specify the server's netbios name (RFC1001 name) to use
- when attempting to setup a session to the server.
- This is needed for mounting to some older servers (such
- as OS/2 or Windows 98 and Windows ME) since they do not
- support a default server name. A server name can be up
- to 15 characters long and is usually uppercased.
- sfu When the CIFS Unix Extensions are not negotiated, attempt to
- create device files and fifos in a format compatible with
- Services for Unix (SFU). In addition retrieve bits 10-12
- of the mode via the SETFILEBITS extended attribute (as
- SFU does). In the future the bottom 9 bits of the
- mode also will be emulated using queries of the security
- descriptor (ACL).
- mfsymlinks Enable support for Minshall+French symlinks
- (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
- This option is ignored when specified together with the
- 'sfu' option. Minshall+French symlinks are used even if
- the server supports the CIFS Unix Extensions.
- sign Must use packet signing (helps avoid unwanted data modification
- by intermediate systems in the route). Note that signing
- does not work with lanman or plaintext authentication.
- seal Must seal (encrypt) all data on this mounted share before
- sending on the network. Requires support for Unix Extensions.
- Note that this differs from the sign mount option in that it
- causes encryption of data sent over this mounted share but other
- shares mounted to the same server are unaffected.
- locallease This option is rarely needed. Fcntl F_SETLEASE is
- used by some applications such as Samba and NFSv4 server to
- check to see whether a file is cacheable. CIFS has no way
- to explicitly request a lease, but can check whether a file
- is cacheable (oplocked). Unfortunately, even if a file
- is not oplocked, it could still be cacheable (ie cifs client
- could grant fcntl leases if no other local processes are using
- the file) for cases for example such as when the server does not
- support oplocks and the user is sure that the only updates to
- the file will be from this client. Specifying this mount option
- will allow the cifs client to check for leases (only) locally
- for files which are not oplocked instead of denying leases
- in that case. (EXPERIMENTAL)
- sec Security mode. Allowed values are:
- none attempt to connect as a null user (no name)
- krb5 Use Kerberos version 5 authentication
- krb5i Use Kerberos authentication and packet signing
- ntlm Use NTLM password hashing (default)
- ntlmi Use NTLM password hashing with signing (if
- /proc/fs/cifs/PacketSigningEnabled on or if
- server requires signing also can be the default)
- ntlmv2 Use NTLMv2 password hashing
- ntlmv2i Use NTLMv2 password hashing with packet signing
- lanman (if configured in kernel config) use older
- lanman hash
- hard Retry file operations if server is not responding
- soft Limit retries to unresponsive servers (usually only
- one retry) before returning an error. (default)
-
-The mount.cifs mount helper also accepts a few mount options before -o
-including:
-
- -S take password from stdin (equivalent to setting the environment
- variable "PASSWD_FD=0"
- -V print mount.cifs version
- -? display simple usage information
-
-With most 2.6 kernel versions of modutils, the version of the cifs kernel
-module can be displayed via modinfo.
-
-Misc /proc/fs/cifs Flags and Debug Info
-=======================================
-Informational pseudo-files:
-DebugData Displays information about active CIFS sessions and
- shares, features enabled as well as the cifs.ko
- version.
-Stats Lists summary resource usage information as well as per
- share statistics, if CONFIG_CIFS_STATS is enabled
- in the kernel configuration.
-
-Configuration pseudo-files:
-MultiuserMount If set to one, more than one CIFS session to
- the same server ip address can be established
- if more than one uid accesses the same mount
- point and if the uids user/password mapping
- information is available. (default is 0)
-PacketSigningEnabled If set to one, cifs packet signing is enabled
- and will be used if the server requires
- it. If set to two, cifs packet signing is
- required even if the server considers packet
- signing optional. (default 1)
-SecurityFlags Flags which control security negotiation and
- also packet signing. Authentication (may/must)
- flags (e.g. for NTLM and/or NTLMv2) may be combined with
- the signing flags. Specifying two different password
- hashing mechanisms (as "must use") on the other hand
- does not make much sense. Default flags are
- 0x07007
- (NTLM, NTLMv2 and packet signing allowed). The maximum
- allowable flags if you want to allow mounts to servers
- using weaker password hashes is 0x37037 (lanman,
- plaintext, ntlm, ntlmv2, signing allowed). Some
- SecurityFlags require the corresponding menuconfig
- options to be enabled (lanman and plaintext require
- CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
- plaintext authentication currently requires also
- enabling lanman authentication in the security flags
- because the cifs module only supports sending
- plaintext passwords using the older lanman dialect
- form of the session setup SMB. (e.g. for authentication
- using plain text passwords, set the SecurityFlags
- to 0x30030):
-
- may use packet signing 0x00001
- must use packet signing 0x01001
- may use NTLM (most common password hash) 0x00002
- must use NTLM 0x02002
- may use NTLMv2 0x00004
- must use NTLMv2 0x04004
- may use Kerberos security 0x00008
- must use Kerberos 0x08008
- may use lanman (weak) password hash 0x00010
- must use lanman password hash 0x10010
- may use plaintext passwords 0x00020
- must use plaintext passwords 0x20020
- (reserved for future packet encryption) 0x00040
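-
- For example, to also allow mounts to servers that only support the weaker
- lanman or plaintext hashes, the maximum value described above could be
- written to this pseudo-file:
-
- echo 0x37037 > /proc/fs/cifs/SecurityFlags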
-
-cifsFYI If set to non-zero value, additional debug information
- will be logged to the system error log. This field
- contains three flags controlling different classes of
- debugging entries. The maximum value it can be set
- to is 7 which enables all debugging points (default 0).
- Some debugging statements are not compiled into the
- cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
- kernel configuration. cifsFYI may be set to one or
- more of the following flags (7 sets them all):
-
- log cifs informational messages 0x01
- log return codes from cifs entry points 0x02
- log slow responses (ie which take longer than 1 second)
- CONFIG_CIFS_STATS2 must be enabled in .config 0x04
-
-
-traceSMB If set to one, debug information is logged to the
- system error log with the start of smb requests
- and responses (default 0)
-LookupCacheEnable If set to one, inode information is kept cached
- for one second improving performance of lookups
- (default 1)
-OplockEnabled If set to one, safe distributed caching enabled.
- (default 1)
-LinuxExtensionsEnabled If set to one then the client will attempt to
- use the CIFS "UNIX" extensions which are optional
- protocol enhancements that allow CIFS servers
- to return accurate UID/GID information as well
- as support symbolic links. If you use servers
- such as Samba that support the CIFS Unix
- extensions but do not want to use symbolic link
- support and want to map the uid and gid fields
- to values supplied at mount (rather than the
- actual values), then set this to zero. (default 1)
-Experimental When set to 1, enables certain experimental
- features (currently enables multipage writes
- when signing is enabled, the multipage write
- performance enhancement was disabled when
- signing turned on in case buffer was modified
- just before it was sent, also this flag will
- be used to enable the new experimental directory change
- notification code). When set to 2 enables
- an additional experimental feature, "raw ntlmssp"
- session establishment support (which allows
- specifying "sec=ntlmssp" on mount). The Linux cifs
- module will use ntlmv2 authentication encapsulated
- in "raw ntlmssp" (not using SPNEGO) when
- "sec=ntlmssp" is specified on mount.
- This support also requires building cifs with
- the CONFIG_CIFS_EXPERIMENTAL configuration flag.
-
-These experimental features and tracing can be enabled by changing flags in
-/proc/fs/cifs (after the cifs module has been installed or built into the
-kernel, e.g. insmod cifs). To enable a feature set it to 1 e.g. to enable
-tracing to the kernel message log type:
-
- echo 7 > /proc/fs/cifs/cifsFYI
-
-cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel
-logging of various informational messages. 2 enables logging of non-zero
-SMB return codes while 4 enables logging of requests that take longer
-than one second to complete (except for byte range lock requests).
-Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the
-source code (typically by setting it in the beginning of cifsglob.h),
-and setting it to seven enables all three. Finally, tracing
-the start of smb requests and responses can be enabled via:
-
- echo 1 > /proc/fs/cifs/traceSMB
-
-Some other experimental features are under development. To test these
-requires enabling CONFIG_CIFS_EXPERIMENTAL
-
- cifsacl support needed to retrieve approximated mode bits based on
- the contents on the CIFS ACL.
-
- lease support: cifs will check the oplock state before calling into
- the vfs to see if we can grant a lease on a file.
-
- DNOTIFY fcntl: needed for support of directory change
- notification and perhaps later for file leases
-
-Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
-if the kernel was configured with cifs statistics enabled. The statistics
-represent the number of successful (ie non-error return code from the server)
-SMB responses to some of the more common commands (open, delete, mkdir etc.).
-Also recorded is the total bytes read and bytes written to the server for
-that share. Note that due to client caching effects this can be less than the
-number of bytes read and written by the application running on the client.
-The statistics for the number of total SMBs and oplock breaks are different in
-that they represent all for that share, not just those for which the server
-returned success.
-
-Also note that "cat /proc/fs/cifs/DebugData" will display information about
-the active sessions and the shares that are mounted.
-
-Enabling Kerberos (extended security) works but requires version 1.2 or later
-of the helper program cifs.upcall to be present and to be configured in the
-/etc/request-key.conf file. The cifs.upcall helper program is from the Samba
-project (http://www.samba.org). NTLM, NTLMv2 and LANMAN support do not
-require this helper. Note that NTLMv2 security (which does not require the
-cifs.upcall helper program), instead of using Kerberos, is sufficient for
-some use cases.
-
-DFS support allows transparent redirection to shares in an MS-DFS name space.
-In addition, DFS support for target shares which are specified as UNC
-names which begin with host names (rather than IP addresses) requires
-a user space helper (such as cifs.upcall) to be present in order to
-translate host names to ip addresses, and the user space helper must also
-be configured in the file /etc/request-key.conf. Samba, Windows servers and
-many NAS appliances support DFS as a way of constructing a global name
-space to ease network configuration and improve reliability.
-
-To use cifs Kerberos and DFS support, the Linux keyutils package should be
-installed and something like the following lines should be added to the
-/etc/request-key.conf file:
-
-create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
-create dns_resolver * * /usr/local/sbin/cifs.upcall %k
-
-
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
deleted file mode 100644
index 355abcdcda9..00000000000
--- a/fs/cifs/TODO
+++ /dev/null
@@ -1,129 +0,0 @@
-Version 1.53 May 20, 2008
-
-A Partial List of Missing Features
-==================================
-
-Contributions are welcome. There are plenty of opportunities
-for visible, important contributions to this module. Here
-is a partial list of the known problems and missing features:
-
-a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown
-so that these operations can be supported to Windows servers
-
-b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS
-SecurityDescriptors
-
-c) Better pam/winbind integration (e.g. to handle uid mapping
-better)
-
-d) Cleanup now unneeded SessSetup code in
-fs/cifs/connect.c and add back in NTLMSSP code if any servers
-need it
-
-e) fix NTLMv2 signing when two mounts with different users to same
-server.
-
-f) Directory entry caching relies on a 1 second timer, rather than
-using FindNotify or equivalent. - (started)
-
-g) quota support (needs minor kernel change for quota calls
-to make it to network filesystems or deviceless filesystems)
-
-h) investigate sync behavior (including syncpage) and check
-for proper behavior of intr/nointr
-
-i) improve support for very old servers (OS/2 and Win9x for example),
-including support for changing the time remotely (utimes command).
-
-j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
-extra copy in/out of the socket buffers in some cases.
-
-k) Better optimize open (and pathbased setfilesize) to reduce the
-oplock breaks coming from windows srv. Piggyback identical file
-opens on top of each other by incrementing reference count rather
-than resending (helps reduce server resource utilization and avoid
-spurious oplock breaks).
-
-l) Improve performance of readpages by sending more than one read
-at a time when 8 pages or more are requested. In conjunction
-add support for async_cifs_readpages.
-
-m) Add support for storing symlink info to Windows servers
-in the Extended Attribute format their SFU clients would recognize.
-
-n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
-will autorefresh (partially complete by Asser). Needs minor kernel
-vfs change to support removing D_NOTIFY on a file.
-
-o) Add GUI tool to configure /proc/fs/cifs settings and for display of
-the CIFS statistics (started)
-
-p) implement support for security and trusted categories of xattrs
-(requires minor protocol extension) to enable better support for SELINUX
-
-q) Implement O_DIRECT flag on open (already supported on mount)
-
-r) Create UID mapping facility so server UIDs can be mapped on a per
-mount or a per server basis to client UIDs or nobody if no mapping
-exists. This is helpful when Unix extensions are negotiated to
-allow better permission checking when UIDs differ on the server
-and client. Add new protocol request to the CIFS protocol
-standard for asking the server for the corresponding name of a
-particular uid.
-
-s) Add support for CIFS Unix and also the newer POSIX extensions to the
-server side for Samba 4.
-
-t) For OS/2 support (LANMAN 1.2 and LANMAN2.1 based SMB servers),
-add the ability to set the time on the server (utimes command)
-
-u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
-
-v) mount check for unmatched uids
-
-w) Add support for new vfs entry point for fallocate
-
-x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
-processes can proceed better in parallel (on the server)
-
-y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
-restriction of wsize max being 127K)
-
-KNOWN BUGS (updated April 24, 2007)
-====================================
-See http://bugzilla.samba.org - search on product "CifsVFS" for
-current bug list.
-
-1) existing symbolic links (Windows reparse points) are recognized but
-can not be created remotely. They are implemented for Samba and those that
-support the CIFS Unix extensions, although earlier versions of Samba
-overly restrict the pathnames.
-2) follow_link and readdir code does not follow dfs junctions
-but recognizes them
-3) create of new files to FAT partitions on Windows servers can
-succeed but still return access denied (appears to be Windows
-server not cifs client problem) and has not been reproduced recently.
-NTFS partitions do not have this problem.
-4) Unix/POSIX capabilities are reset after reconnection, and affect
-a few fields in the tree connection but we do not know which
-superblocks to apply these changes to. We should probably walk
-the list of superblocks to set these. Also need to check the
-flags on the second mount to the same share, and see if we
-can do the same trick that NFS does to remount duplicate shares.
-
-Misc testing to do
-==================
-1) check out max path names and max path name components against various server
-types. Try nested symlinks (8 deep). Return max path name in stat -f information
-
-2) Modify file portion of ltp so it can run against a mounted network
-share and run it against cifs vfs in automated fashion.
-
-3) Additional performance testing and optimization using iozone and similar -
-there are some easy changes that can be done to parallelize sequential writes,
-and when signing is disabled to request larger read sizes (larger than
-negotiated size) and send larger write sizes to modern servers.
-
-4) More exhaustively test against less common servers. More testing
-against Windows 9x, Windows ME servers.
-
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index cfd1ce34e0b..a3b56544c21 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -506,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
/* GSSAPI header */
if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding negTokenInit header");
+ cifs_dbg(FYI, "Error decoding negTokenInit header\n");
return 0;
} else if ((cls != ASN1_APL) || (con != ASN1_CON)
|| (tag != ASN1_EOC)) {
- cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag);
return 0;
}
@@ -531,52 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
/* SPNEGO OID not present or garbled -- bail out */
if (!rc) {
- cFYI(1, "Error decoding negTokenInit header");
+ cifs_dbg(FYI, "Error decoding negTokenInit header\n");
return 0;
}
/* SPNEGO */
if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding negTokenInit");
+ cifs_dbg(FYI, "Error decoding negTokenInit\n");
return 0;
} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
|| (tag != ASN1_EOC)) {
- cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
+ cls, con, tag, end, *end);
return 0;
}
/* negTokenInit */
if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding negTokenInit");
+ cifs_dbg(FYI, "Error decoding negTokenInit\n");
return 0;
} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
|| (tag != ASN1_SEQ)) {
- cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
+ cls, con, tag, end, *end);
return 0;
}
/* sequence */
if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding 2nd part of negTokenInit");
+ cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
return 0;
} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
|| (tag != ASN1_EOC)) {
- cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
+ cls, con, tag, end, *end);
return 0;
}
/* sequence of */
if (asn1_header_decode
(&ctx, &sequence_end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding 2nd part of negTokenInit");
+ cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
return 0;
} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
|| (tag != ASN1_SEQ)) {
- cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
+ cls, con, tag, end, *end);
return 0;
}
@@ -584,15 +584,15 @@ decode_negTokenInit(unsigned char *security_blob, int length,
while (!asn1_eoc_decode(&ctx, sequence_end)) {
rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
if (!rc) {
- cFYI(1, "Error decoding negTokenInit hdr exit2");
+ cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n");
return 0;
}
if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
- cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
- "0x%lx 0x%lx", oidlen, *oid,
- *(oid + 1), *(oid + 2), *(oid + 3));
+ cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n",
+ oidlen, *oid, *(oid + 1), *(oid + 2),
+ *(oid + 3));
if (compare_oid(oid, oidlen, MSKRB5_OID,
MSKRB5_OID_LEN))
@@ -610,57 +610,14 @@ decode_negTokenInit(unsigned char *security_blob, int length,
kfree(oid);
}
} else {
- cFYI(1, "Should be an oid what is going on?");
+ cifs_dbg(FYI, "Should be an oid what is going on?\n");
}
}
- /* mechlistMIC */
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- /* Check if we have reached the end of the blob, but with
- no mechListMic (e.g. NTLMSSP instead of KRB5) */
- if (ctx.error == ASN1_ERR_DEC_EMPTY)
- goto decode_negtoken_exit;
- cFYI(1, "Error decoding last part negTokenInit exit3");
- return 0;
- } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
- /* tag = 3 indicating mechListMIC */
- cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
- cls, con, tag, end, *end);
- return 0;
- }
-
- /* sequence */
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding last part negTokenInit exit5");
- return 0;
- } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
- || (tag != ASN1_SEQ)) {
- cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
- cls, con, tag, end, *end);
- }
-
- /* sequence of */
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding last part negTokenInit exit 7");
- return 0;
- } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
- cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
- cls, con, tag, end, *end);
- return 0;
- }
-
- /* general string */
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, "Error decoding last part negTokenInit exit9");
- return 0;
- } else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
- || (tag != ASN1_GENSTR)) {
- cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
- cls, con, tag, end, *end);
- return 0;
- }
- cFYI(1, "Need to call asn1_octets_decode() function for %s",
- ctx.pointer); /* is this UTF-8 or ASCII? */
-decode_negtoken_exit:
+ /*
+ * We currently ignore anything at the end of the SPNEGO blob after
+ * the mechTypes have been parsed, since none of that info is
+ * used at the moment.
+ */
return 1;
}
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index e654dfd092c..6c665bf4a27 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -50,7 +50,7 @@ void cifs_fscache_unregister(void)
*/
struct cifs_server_key {
uint16_t family; /* address family */
- uint16_t port; /* IP port */
+ __be16 port; /* IP port */
union {
struct in_addr ipv4_addr;
struct in6_addr ipv6_addr;
@@ -92,7 +92,7 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
break;
default:
- cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family);
+ cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
key_len = 0;
break;
}
@@ -146,13 +146,13 @@ static char *extract_sharename(const char *treename)
static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
uint16_t maxbuf)
{
- const struct cifsTconInfo *tcon = cookie_netfs_data;
+ const struct cifs_tcon *tcon = cookie_netfs_data;
char *sharename;
uint16_t len;
sharename = extract_sharename(tcon->treeName);
if (IS_ERR(sharename)) {
- cFYI(1, "CIFS: couldn't extract sharename\n");
+ cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
sharename = NULL;
return 0;
}
@@ -173,7 +173,7 @@ cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
uint16_t maxbuf)
{
struct cifs_fscache_super_auxdata auxdata;
- const struct cifsTconInfo *tcon = cookie_netfs_data;
+ const struct cifs_tcon *tcon = cookie_netfs_data;
memset(&auxdata, 0, sizeof(auxdata));
auxdata.resource_id = tcon->resource_id;
@@ -192,7 +192,7 @@ fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
uint16_t datalen)
{
struct cifs_fscache_super_auxdata auxdata;
- const struct cifsTconInfo *tcon = cookie_netfs_data;
+ const struct cifs_tcon *tcon = cookie_netfs_data;
if (datalen != sizeof(auxdata))
return FSCACHE_CHECKAUX_OBSOLETE;
@@ -302,7 +302,7 @@ static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
pagevec_init(&pvec, 0);
first = 0;
- cFYI(1, "cifs inode 0x%p now uncached", cifsi);
+ cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
for (;;) {
nr_pages = pagevec_lookup(&pvec,
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 65829d32128..f3ac4154cbb 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -57,43 +57,63 @@ cifs_dump_mem(char *label, void *data, int length)
}
}
-#ifdef CONFIG_CIFS_DEBUG2
-void cifs_dump_detail(struct smb_hdr *smb)
+#ifdef CONFIG_CIFS_DEBUG
+void cifs_vfs_err(const char *fmt, ...)
{
- cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
- smb->Command, smb->Status.CifsError,
- smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
- cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_ERR "CIFS VFS: %pV", &vaf);
+
+ va_end(args);
}
+#endif
+
+void cifs_dump_detail(void *buf)
+{
+#ifdef CONFIG_CIFS_DEBUG2
+ struct smb_hdr *smb = (struct smb_hdr *)buf;
+ cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
+ smb->Command, smb->Status.CifsError,
+ smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
+ cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb));
+#endif /* CONFIG_CIFS_DEBUG2 */
+}
void cifs_dump_mids(struct TCP_Server_Info *server)
{
+#ifdef CONFIG_CIFS_DEBUG2
struct list_head *tmp;
struct mid_q_entry *mid_entry;
if (server == NULL)
return;
- cERROR(1, "Dump pending requests:");
+ cifs_dbg(VFS, "Dump pending requests:\n");
spin_lock(&GlobalMid_Lock);
list_for_each(tmp, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
- mid_entry->midState,
- (int)mid_entry->command,
- mid_entry->pid,
- mid_entry->callback_data,
- mid_entry->mid);
+ cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n",
+ mid_entry->mid_state,
+ le16_to_cpu(mid_entry->command),
+ mid_entry->pid,
+ mid_entry->callback_data,
+ mid_entry->mid);
#ifdef CONFIG_CIFS_STATS2
- cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
- mid_entry->largeBuf,
- mid_entry->resp_buf,
- mid_entry->when_received,
- jiffies);
+ cifs_dbg(VFS, "IsLarge: %d buf: %p time rcv: %ld now: %ld\n",
+ mid_entry->large_buf,
+ mid_entry->resp_buf,
+ mid_entry->when_received,
+ jiffies);
#endif /* STATS2 */
- cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
- mid_entry->multiEnd);
+ cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n",
+ mid_entry->multiRsp, mid_entry->multiEnd);
if (mid_entry->resp_buf) {
cifs_dump_detail(mid_entry->resp_buf);
cifs_dump_mem("existing buf: ",
@@ -101,8 +121,8 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
}
}
spin_unlock(&GlobalMid_Lock);
-}
#endif /* CONFIG_CIFS_DEBUG2 */
+}
#ifdef CONFIG_PROC_FS
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
@@ -110,8 +130,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct list_head *tmp1, *tmp2, *tmp3;
struct mid_q_entry *mid_entry;
struct TCP_Server_Info *server;
- struct cifsSesInfo *ses;
- struct cifsTconInfo *tcon;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
int i, j;
__u32 dev_type;
@@ -152,7 +172,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
tcp_ses_list);
i++;
list_for_each(tmp2, &server->smb_ses_list) {
- ses = list_entry(tmp2, struct cifsSesInfo,
+ ses = list_entry(tmp2, struct cifs_ses,
smb_ses_list);
if ((ses->serverDomain == NULL) ||
(ses->serverOS == NULL) ||
@@ -171,19 +191,18 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_printf(m, "TCP status: %d\n\tLocal Users To "
"Server: %d SecMode: 0x%x Req On Wire: %d",
server->tcpStatus, server->srv_count,
- server->secMode,
- atomic_read(&server->inFlight));
+ server->sec_mode, in_flight(server));
#ifdef CONFIG_CIFS_STATS2
seq_printf(m, " In Send: %d In MaxReq Wait: %d",
- atomic_read(&server->inSend),
+ atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
#endif
seq_puts(m, "\n\tShares:");
j = 0;
list_for_each(tmp3, &ses->tcon_list) {
- tcon = list_entry(tmp3, struct cifsTconInfo,
+ tcon = list_entry(tmp3, struct cifs_tcon,
tcon_list);
++j;
dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
@@ -194,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
tcon->nativeFileSystem);
}
seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
- "\nPathComponentMax: %d Status: 0x%d",
+ "\n\tPathComponentMax: %d Status: 0x%d",
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
le32_to_cpu(tcon->fsAttrInfo.Attributes),
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
@@ -205,6 +224,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_puts(m, " type: CDROM ");
else
seq_printf(m, " type: %d ", dev_type);
+ if (server->ops->dump_share_caps)
+ server->ops->dump_share_caps(m, tcon);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
@@ -218,12 +239,12 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
mid_entry = list_entry(tmp3, struct mid_q_entry,
qhead);
seq_printf(m, "\tState: %d com: %d pid:"
- " %d cbdata: %p mid %d\n",
- mid_entry->midState,
- (int)mid_entry->command,
- mid_entry->pid,
- mid_entry->callback_data,
- mid_entry->mid);
+ " %d cbdata: %p mid %llu\n",
+ mid_entry->mid_state,
+ le16_to_cpu(mid_entry->command),
+ mid_entry->pid,
+ mid_entry->callback_data,
+ mid_entry->mid);
}
spin_unlock(&GlobalMid_Lock);
}
@@ -256,8 +277,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
int rc;
struct list_head *tmp1, *tmp2, *tmp3;
struct TCP_Server_Info *server;
- struct cifsSesInfo *ses;
- struct cifsTconInfo *tcon;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
rc = get_user(c, buffer);
if (rc)
@@ -273,31 +294,15 @@ static ssize_t cifs_stats_proc_write(struct file *file,
server = list_entry(tmp1, struct TCP_Server_Info,
tcp_ses_list);
list_for_each(tmp2, &server->smb_ses_list) {
- ses = list_entry(tmp2, struct cifsSesInfo,
+ ses = list_entry(tmp2, struct cifs_ses,
smb_ses_list);
list_for_each(tmp3, &ses->tcon_list) {
tcon = list_entry(tmp3,
- struct cifsTconInfo,
+ struct cifs_tcon,
tcon_list);
atomic_set(&tcon->num_smbs_sent, 0);
- atomic_set(&tcon->num_writes, 0);
- atomic_set(&tcon->num_reads, 0);
- atomic_set(&tcon->num_oplock_brks, 0);
- atomic_set(&tcon->num_opens, 0);
- atomic_set(&tcon->num_posixopens, 0);
- atomic_set(&tcon->num_posixmkdirs, 0);
- atomic_set(&tcon->num_closes, 0);
- atomic_set(&tcon->num_deletes, 0);
- atomic_set(&tcon->num_mkdirs, 0);
- atomic_set(&tcon->num_rmdirs, 0);
- atomic_set(&tcon->num_renames, 0);
- atomic_set(&tcon->num_t2renames, 0);
- atomic_set(&tcon->num_ffirst, 0);
- atomic_set(&tcon->num_fnext, 0);
- atomic_set(&tcon->num_fclose, 0);
- atomic_set(&tcon->num_hardlinks, 0);
- atomic_set(&tcon->num_symlinks, 0);
- atomic_set(&tcon->num_locks, 0);
+ if (server->ops->clear_stats)
+ server->ops->clear_stats(tcon);
}
}
}
@@ -312,8 +317,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
int i;
struct list_head *tmp1, *tmp2, *tmp3;
struct TCP_Server_Info *server;
- struct cifsSesInfo *ses;
- struct cifsTconInfo *tcon;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
seq_printf(m,
"Resources in use\nCIFS Session: %d\n",
@@ -346,52 +351,20 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
server = list_entry(tmp1, struct TCP_Server_Info,
tcp_ses_list);
list_for_each(tmp2, &server->smb_ses_list) {
- ses = list_entry(tmp2, struct cifsSesInfo,
+ ses = list_entry(tmp2, struct cifs_ses,
smb_ses_list);
list_for_each(tmp3, &ses->tcon_list) {
tcon = list_entry(tmp3,
- struct cifsTconInfo,
+ struct cifs_tcon,
tcon_list);
i++;
seq_printf(m, "\n%d) %s", i, tcon->treeName);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
- seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
- atomic_read(&tcon->num_smbs_sent),
- atomic_read(&tcon->num_oplock_brks));
- seq_printf(m, "\nReads: %d Bytes: %lld",
- atomic_read(&tcon->num_reads),
- (long long)(tcon->bytes_read));
- seq_printf(m, "\nWrites: %d Bytes: %lld",
- atomic_read(&tcon->num_writes),
- (long long)(tcon->bytes_written));
- seq_printf(m, "\nFlushes: %d",
- atomic_read(&tcon->num_flushes));
- seq_printf(m, "\nLocks: %d HardLinks: %d "
- "Symlinks: %d",
- atomic_read(&tcon->num_locks),
- atomic_read(&tcon->num_hardlinks),
- atomic_read(&tcon->num_symlinks));
- seq_printf(m, "\nOpens: %d Closes: %d "
- "Deletes: %d",
- atomic_read(&tcon->num_opens),
- atomic_read(&tcon->num_closes),
- atomic_read(&tcon->num_deletes));
- seq_printf(m, "\nPosix Opens: %d "
- "Posix Mkdirs: %d",
- atomic_read(&tcon->num_posixopens),
- atomic_read(&tcon->num_posixmkdirs));
- seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
- atomic_read(&tcon->num_mkdirs),
- atomic_read(&tcon->num_rmdirs));
- seq_printf(m, "\nRenames: %d T2 Renames %d",
- atomic_read(&tcon->num_renames),
- atomic_read(&tcon->num_t2renames));
- seq_printf(m, "\nFindFirst: %d FNext %d "
- "FClose %d",
- atomic_read(&tcon->num_ffirst),
- atomic_read(&tcon->num_fnext),
- atomic_read(&tcon->num_fclose));
+ seq_printf(m, "\nSMBs: %d",
+ atomic_read(&tcon->num_smbs_sent));
+ if (server->ops->print_stats)
+ server->ops->print_stats(m, tcon);
}
}
}
@@ -418,12 +391,9 @@ static const struct file_operations cifs_stats_proc_fops = {
static struct proc_dir_entry *proc_fs_cifs;
static const struct file_operations cifsFYI_proc_fops;
-static const struct file_operations cifs_oplock_proc_fops;
static const struct file_operations cifs_lookup_cache_proc_fops;
static const struct file_operations traceSMB_proc_fops;
-static const struct file_operations cifs_multiuser_mount_proc_fops;
static const struct file_operations cifs_security_flags_proc_fops;
-static const struct file_operations cifs_experimental_proc_fops;
static const struct file_operations cifs_linux_ext_proc_fops;
void
@@ -440,13 +410,8 @@ cifs_proc_init(void)
#endif /* STATS */
proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
- proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops);
- proc_create("Experimental", 0, proc_fs_cifs,
- &cifs_experimental_proc_fops);
proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
&cifs_linux_ext_proc_fops);
- proc_create("MultiuserMount", 0, proc_fs_cifs,
- &cifs_multiuser_mount_proc_fops);
proc_create("SecurityFlags", 0, proc_fs_cifs,
&cifs_security_flags_proc_fops);
proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
@@ -465,11 +430,8 @@ cifs_proc_clean(void)
#ifdef CONFIG_CIFS_STATS
remove_proc_entry("Stats", proc_fs_cifs);
#endif
- remove_proc_entry("MultiuserMount", proc_fs_cifs);
- remove_proc_entry("OplockEnabled", proc_fs_cifs);
remove_proc_entry("SecurityFlags", proc_fs_cifs);
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
- remove_proc_entry("Experimental", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
remove_proc_entry("fs/cifs", NULL);
}
@@ -513,82 +475,6 @@ static const struct file_operations cifsFYI_proc_fops = {
.write = cifsFYI_proc_write,
};
-static int cifs_oplock_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "%d\n", oplockEnabled);
- return 0;
-}
-
-static int cifs_oplock_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, cifs_oplock_proc_show, NULL);
-}
-
-static ssize_t cifs_oplock_proc_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *ppos)
-{
- char c;
- int rc;
-
- rc = get_user(c, buffer);
- if (rc)
- return rc;
- if (c == '0' || c == 'n' || c == 'N')
- oplockEnabled = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
- oplockEnabled = 1;
-
- return count;
-}
-
-static const struct file_operations cifs_oplock_proc_fops = {
- .owner = THIS_MODULE,
- .open = cifs_oplock_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifs_oplock_proc_write,
-};
-
-static int cifs_experimental_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "%d\n", experimEnabled);
- return 0;
-}
-
-static int cifs_experimental_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, cifs_experimental_proc_show, NULL);
-}
-
-static ssize_t cifs_experimental_proc_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *ppos)
-{
- char c;
- int rc;
-
- rc = get_user(c, buffer);
- if (rc)
- return rc;
- if (c == '0' || c == 'n' || c == 'N')
- experimEnabled = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
- experimEnabled = 1;
- else if (c == '2')
- experimEnabled = 2;
-
- return count;
-}
-
-static const struct file_operations cifs_experimental_proc_fops = {
- .owner = THIS_MODULE,
- .open = cifs_experimental_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifs_experimental_proc_write,
-};
-
static int cifs_linux_ext_proc_show(struct seq_file *m, void *v)
{
seq_printf(m, "%d\n", linuxExtEnabled);
@@ -700,57 +586,47 @@ static const struct file_operations traceSMB_proc_fops = {
.write = traceSMB_proc_write,
};
-static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v)
+static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
{
- seq_printf(m, "%d\n", multiuser_mount);
+ seq_printf(m, "0x%x\n", global_secflags);
return 0;
}
-static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh)
+static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
{
- return single_open(fh, cifs_multiuser_mount_proc_show, NULL);
+ return single_open(file, cifs_security_flags_proc_show, NULL);
}
-static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *ppos)
+/*
+ * Ensure that if someone sets a MUST flag, that we disable all other MAY
+ * flags except for the ones corresponding to the given MUST flag. If there are
+ * multiple MUST flags, then try to prefer more secure ones.
+ */
+static void
+cifs_security_flags_handle_must_flags(unsigned int *flags)
{
- char c;
- int rc;
+ unsigned int signflags = *flags & CIFSSEC_MUST_SIGN;
- rc = get_user(c, buffer);
- if (rc)
- return rc;
- if (c == '0' || c == 'n' || c == 'N')
- multiuser_mount = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
- multiuser_mount = 1;
-
- return count;
-}
-
-static const struct file_operations cifs_multiuser_mount_proc_fops = {
- .owner = THIS_MODULE,
- .open = cifs_multiuser_mount_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifs_multiuser_mount_proc_write,
-};
-
-static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "0x%x\n", global_secflags);
- return 0;
-}
+ if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
+ *flags = CIFSSEC_MUST_KRB5;
+ else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
+ *flags = CIFSSEC_MUST_NTLMSSP;
+ else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
+ *flags = CIFSSEC_MUST_NTLMV2;
+ else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
+ *flags = CIFSSEC_MUST_NTLM;
+ else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
+ *flags = CIFSSEC_MUST_LANMAN;
+ else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
+ *flags = CIFSSEC_MUST_PLNTXT;
-static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, cifs_security_flags_proc_show, NULL);
+ *flags |= signflags;
}
static ssize_t cifs_security_flags_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
+ int rc;
unsigned int flags;
char flags_string[12];
char c;
@@ -773,34 +649,43 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
global_secflags = CIFSSEC_MAX;
return count;
} else if (!isdigit(c)) {
- cERROR(1, "invalid flag %c", c);
+ cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+ flags_string);
return -EINVAL;
}
}
- /* else we have a number */
- flags = simple_strtoul(flags_string, NULL, 0);
+ /* else we have a number */
+ rc = kstrtouint(flags_string, 0, &flags);
+ if (rc) {
+ cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+ flags_string);
+ return rc;
+ }
- cFYI(1, "sec flags 0x%x", flags);
+ cifs_dbg(FYI, "sec flags 0x%x\n", flags);
- if (flags <= 0) {
- cERROR(1, "invalid security flags %s", flags_string);
+ if (flags == 0) {
+ cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string);
return -EINVAL;
}
if (flags & ~CIFSSEC_MASK) {
- cERROR(1, "attempt to set unsupported security flags 0x%x",
- flags & ~CIFSSEC_MASK);
+ cifs_dbg(VFS, "Unsupported security flags: 0x%x\n",
+ flags & ~CIFSSEC_MASK);
return -EINVAL;
}
+
+ cifs_security_flags_handle_must_flags(&flags);
+
/* flags look ok - update the global security flags for cifs module */
global_secflags = flags;
if (global_secflags & CIFSSEC_MUST_SIGN) {
/* requiring signing implies signing is allowed */
global_secflags |= CIFSSEC_MAY_SIGN;
- cFYI(1, "packet signing now required");
+ cifs_dbg(FYI, "packet signing now required\n");
} else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
- cFYI(1, "packet signing disabled");
+ cifs_dbg(FYI, "packet signing disabled\n");
}
/* BB should we turn on MAY flags for other MUST options? */
return count;
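
For reference, a stand-alone model of the MUST-flag handling introduced above: the signing bit is preserved, and when several MUST mechanisms are set only the most secure one survives. The flag values below are placeholders for illustration, not the real CIFSSEC_* constants from cifsglob.h.

#include <stdio.h>

#define MUST_SIGN     0x01000
#define MUST_KRB5     0x00800
#define MUST_NTLMSSP  0x00400
#define MUST_NTLMV2   0x00200
#define MUST_NTLM     0x00100
#define MUST_LANMAN   0x00080
#define MUST_PLNTXT   0x00040

/* mirrors cifs_security_flags_handle_must_flags() with placeholder bits */
static void handle_must_flags(unsigned int *flags)
{
	unsigned int signflags = *flags & MUST_SIGN;

	if ((*flags & MUST_KRB5) == MUST_KRB5)
		*flags = MUST_KRB5;
	else if ((*flags & MUST_NTLMSSP) == MUST_NTLMSSP)
		*flags = MUST_NTLMSSP;
	else if ((*flags & MUST_NTLMV2) == MUST_NTLMV2)
		*flags = MUST_NTLMV2;
	else if ((*flags & MUST_NTLM) == MUST_NTLM)
		*flags = MUST_NTLM;
	else if ((*flags & MUST_LANMAN) == MUST_LANMAN)
		*flags = MUST_LANMAN;
	else if ((*flags & MUST_PLNTXT) == MUST_PLNTXT)
		*flags = MUST_PLNTXT;

	*flags |= signflags;
}

int main(void)
{
	unsigned int flags = MUST_NTLMV2 | MUST_LANMAN | MUST_SIGN;

	handle_must_flags(&flags);
	printf("0x%x\n", flags);	/* NTLMv2 wins, signing bit kept */
	return 0;
}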
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 8942b28cf80..c99b40fb609 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -18,68 +18,49 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
-#define CIFS_DEBUG /* BB temporary */
#ifndef _H_CIFS_DEBUG
#define _H_CIFS_DEBUG
void cifs_dump_mem(char *label, void *data, int length);
-#ifdef CONFIG_CIFS_DEBUG2
-#define DBG2 2
-void cifs_dump_detail(struct smb_hdr *);
+void cifs_dump_detail(void *);
void cifs_dump_mids(struct TCP_Server_Info *);
-#else
-#define DBG2 0
-#endif
extern int traceSMB; /* flag which enables the function below */
-void dump_smb(struct smb_hdr *, int);
+void dump_smb(void *, int);
#define CIFS_INFO 0x01
#define CIFS_RC 0x02
#define CIFS_TIMER 0x04
+#define VFS 1
+#define FYI 2
+extern int cifsFYI;
+#ifdef CONFIG_CIFS_DEBUG2
+#define NOISY 4
+#else
+#define NOISY 0
+#endif
+
/*
* debug ON
* --------
*/
-#ifdef CIFS_DEBUG
-
-/* information message: e.g., configuration, major event */
-extern int cifsFYI;
-#define cifsfyi(fmt, arg...) \
-do { \
- if (cifsFYI & CIFS_INFO) \
- printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \
-} while (0)
-
-#define cFYI(set, fmt, arg...) \
-do { \
- if (set) \
- cifsfyi(fmt, ##arg); \
-} while (0)
+#ifdef CONFIG_CIFS_DEBUG
-#define cifswarn(fmt, arg...) \
- printk(KERN_WARNING fmt "\n", ##arg)
+__printf(1, 2) void cifs_vfs_err(const char *fmt, ...);
-/* debug event message: */
-extern int cifsERROR;
-
-#define cEVENT(fmt, arg...) \
+/* information message: e.g., configuration, major event */
+#define cifs_dbg(type, fmt, ...) \
do { \
- if (cifsERROR) \
- printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \
-} while (0)
-
-/* error event message: e.g., i/o error */
-#define cifserror(fmt, arg...) \
-do { \
- if (cifsERROR) \
- printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
-} while (0)
-
-#define cERROR(set, fmt, arg...) \
-do { \
- if (set) \
- cifserror(fmt, ##arg); \
+ if (type == FYI) { \
+ if (cifsFYI & CIFS_INFO) { \
+ printk(KERN_DEBUG "%s: " fmt, \
+ __FILE__, ##__VA_ARGS__); \
+ } \
+ } else if (type == VFS) { \
+ cifs_vfs_err(fmt, ##__VA_ARGS__); \
+ } else if (type == NOISY && type != 0) { \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ } \
} while (0)
/*
@@ -87,10 +68,11 @@ do { \
* ---------
*/
#else /* _CIFS_DEBUG */
-#define cERROR(set, fmt, arg...)
-#define cEVENT(fmt, arg...)
-#define cFYI(set, fmt, arg...)
-#define cifserror(fmt, arg...)
-#endif /* _CIFS_DEBUG */
+#define cifs_dbg(type, fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
+#endif
#endif /* _H_CIFS_DEBUG */
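
The new cifs_dbg() macro replaces the old cERROR()/cFYI()/cFYI(DBG2, ...) calls throughout the series. A minimal user-space mock of its dispatch logic, with fprintf() standing in for printk() and cifs_vfs_err(), can help when mapping old call sites to the new levels:

#include <stdio.h>

#define CIFS_INFO 0x01
#define VFS   1
#define FYI   2
#define NOISY 4			/* 0 when CONFIG_CIFS_DEBUG2 is off */

static int cifsFYI = CIFS_INFO;

/* user-space stand-in for the kernel macro defined in the hunk above */
#define cifs_dbg(type, fmt, ...)					\
do {									\
	if (type == FYI) {						\
		if (cifsFYI & CIFS_INFO)				\
			fprintf(stderr, "%s: " fmt, __FILE__, ##__VA_ARGS__); \
	} else if (type == VFS) {					\
		fprintf(stderr, "CIFS VFS: " fmt, ##__VA_ARGS__);	\
	} else if (type == NOISY && type != 0) {			\
		fprintf(stderr, fmt, ##__VA_ARGS__);			\
	}								\
} while (0)

int main(void)
{
	cifs_dbg(FYI, "sec flags 0x%x\n", 0x21);	/* old cFYI(1, ...)    */
	cifs_dbg(VFS, "ACE too small %d\n", 8);		/* old cERROR(1, ...)  */
	cifs_dbg(NOISY, "all perms\n");			/* old cFYI(DBG2, ...) */
	return 0;
}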
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 0a265ad9e42..58df174deb1 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/vfs.h>
#include <linux/fs.h>
+#include <linux/inet.h>
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifsfs.h"
@@ -48,58 +49,74 @@ void cifs_dfs_release_automount_timer(void)
}
/**
- * cifs_get_share_name - extracts share name from UNC
- * @node_name: pointer to UNC string
+ * cifs_build_devname - build a devicename from a UNC and optional prepath
+ * @nodename: pointer to UNC string
+ * @prepath: pointer to prefixpath (or NULL if there isn't one)
*
- * Extracts sharename form full UNC.
- * i.e. strips from UNC trailing path that is not part of share
- * name and fixup missing '\' in the begining of DFS node refferal
- * if necessary.
- * Returns pointer to share name on success or ERR_PTR on error.
- * Caller is responsible for freeing returned string.
+ * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer
+ * big enough to hold the final thing. Copy the UNC from the nodename, and
+ * concatenate the prepath onto the end of it if there is one.
+ *
+ * Returns pointer to the built string, or an ERR_PTR. Caller is responsible
+ * for freeing the returned string.
*/
-static char *cifs_get_share_name(const char *node_name)
+static char *
+cifs_build_devname(char *nodename, const char *prepath)
{
- int len;
- char *UNC;
- char *pSep;
-
- len = strlen(node_name);
- UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
- GFP_KERNEL);
- if (!UNC)
- return ERR_PTR(-ENOMEM);
+ size_t pplen;
+ size_t unclen;
+ char *dev;
+ char *pos;
+
+ /* skip over any preceding delimiters */
+ nodename += strspn(nodename, "\\");
+ if (!*nodename)
+ return ERR_PTR(-EINVAL);
- /* get share name and server name */
- if (node_name[1] != '\\') {
- UNC[0] = '\\';
- strncpy(UNC+1, node_name, len);
- len++;
- UNC[len] = 0;
- } else {
- strncpy(UNC, node_name, len);
- UNC[len] = 0;
- }
+ /* get length of UNC and set pos to last char */
+ unclen = strlen(nodename);
+ pos = nodename + unclen - 1;
- /* find server name end */
- pSep = memchr(UNC+2, '\\', len-2);
- if (!pSep) {
- cERROR(1, "%s: no server name end in node name: %s",
- __func__, node_name);
- kfree(UNC);
- return ERR_PTR(-EINVAL);
+ /* trim off any trailing delimiters */
+ while (*pos == '\\') {
+ --pos;
+ --unclen;
}
- /* find sharename end */
- pSep++;
- pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC));
- if (pSep) {
- /* trim path up to sharename end
- * now we have share name in UNC */
- *pSep = 0;
+ /* allocate a buffer:
+ * +2 for preceding "//"
+ * +1 for delimiter between UNC and prepath
+ * +1 for trailing NULL
+ */
+ pplen = prepath ? strlen(prepath) : 0;
+ dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ pos = dev;
+ /* add the initial "//" */
+ *pos = '/';
+ ++pos;
+ *pos = '/';
+ ++pos;
+
+ /* copy in the UNC portion from referral */
+ memcpy(pos, nodename, unclen);
+ pos += unclen;
+
+ /* copy the prefixpath remainder (if there is one) */
+ if (pplen) {
+ *pos = '/';
+ ++pos;
+ memcpy(pos, prepath, pplen);
+ pos += pplen;
}
- return UNC;
+ /* NULL terminator */
+ *pos = '\0';
+
+ convert_delimiter(dev, '/');
+ return dev;
}
@@ -123,6 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
{
int rc;
char *mountdata = NULL;
+ const char *prepath = NULL;
int md_len;
char *tkn_e;
char *srvIP = NULL;
@@ -132,7 +150,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
if (sb_mountdata == NULL)
return ERR_PTR(-EINVAL);
- *devname = cifs_get_share_name(ref->node_name);
+ if (strlen(fullpath) - ref->path_consumed)
+ prepath = fullpath + ref->path_consumed;
+
+ *devname = cifs_build_devname(ref->node_name, prepath);
if (IS_ERR(*devname)) {
rc = PTR_ERR(*devname);
*devname = NULL;
@@ -141,16 +162,19 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
if (rc < 0) {
- cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
- __func__, *devname, rc);
+ cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
+ __func__, *devname, rc);
goto compose_mount_options_err;
}
- /* md_len = strlen(...) + 12 for 'sep+prefixpath='
- * assuming that we have 'unc=' and 'ip=' in
- * the original sb_mountdata
+
+ /*
+ * In most cases, we'll be building a shorter string than the original,
+ * but we do have to assume that the address in the ip= option may be
+ * much longer than the original. Add the max length of an address
+ * string to the length of the original string to allow for worst case.
*/
- md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
- mountdata = kzalloc(md_len+1, GFP_KERNEL);
+ md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
+ mountdata = kzalloc(md_len + 1, GFP_KERNEL);
if (mountdata == NULL) {
rc = -ENOMEM;
goto compose_mount_options_err;
@@ -194,29 +218,9 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
strncat(mountdata, &sep, 1);
strcat(mountdata, "ip=");
strcat(mountdata, srvIP);
- strncat(mountdata, &sep, 1);
- strcat(mountdata, "unc=");
- strcat(mountdata, *devname);
-
- /* find & copy prefixpath */
- tkn_e = strchr(ref->node_name + 2, '\\');
- if (tkn_e == NULL) {
- /* invalid unc, missing share name*/
- rc = -EINVAL;
- goto compose_mount_options_err;
- }
-
- tkn_e = strchr(tkn_e + 1, '\\');
- if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
- strncat(mountdata, &sep, 1);
- strcat(mountdata, "prefixpath=");
- if (tkn_e)
- strcat(mountdata, tkn_e + 1);
- strcat(mountdata, fullpath + ref->path_consumed);
- }
- /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
- /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
+ /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
+ /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
compose_mount_options_out:
kfree(srvIP);
@@ -225,6 +229,8 @@ compose_mount_options_out:
compose_mount_options_err:
kfree(mountdata);
mountdata = ERR_PTR(rc);
+ kfree(*devname);
+ *devname = NULL;
goto compose_mount_options_out;
}
@@ -257,11 +263,12 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
static void dump_referral(const struct dfs_info3_param *ref)
{
- cFYI(1, "DFS: ref path: %s", ref->path_name);
- cFYI(1, "DFS: node path: %s", ref->node_name);
- cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
- cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
- ref->path_consumed);
+ cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
+ cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
+ cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+ ref->flags, ref->server_type);
+ cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+ ref->ref_flag, ref->path_consumed);
}
/*
@@ -272,14 +279,15 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
struct dfs_info3_param *referrals = NULL;
unsigned int num_referrals = 0;
struct cifs_sb_info *cifs_sb;
- struct cifsSesInfo *ses;
+ struct cifs_ses *ses;
char *full_path;
- int xid, i;
+ unsigned int xid;
+ int i;
int rc;
struct vfsmount *mnt;
struct tcon_link *tlink;
- cFYI(1, "in %s", __func__);
+ cifs_dbg(FYI, "in %s\n", __func__);
BUG_ON(IS_ROOT(mntpt));
/*
@@ -301,11 +309,11 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
}
ses = tlink_tcon(tlink)->ses;
- xid = GetXid();
+ xid = get_xid();
rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
&num_referrals, &referrals,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
@@ -316,15 +324,15 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
/* connect to a node */
len = strlen(referrals[i].node_name);
if (len < 2) {
- cERROR(1, "%s: Net Address path too short: %s",
- __func__, referrals[i].node_name);
+ cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
+ __func__, referrals[i].node_name);
mnt = ERR_PTR(-EINVAL);
break;
}
mnt = cifs_dfs_do_refmount(cifs_sb,
full_path, referrals + i);
- cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
- referrals[i].node_name, mnt);
+ cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
+ __func__, referrals[i].node_name, mnt);
if (!IS_ERR(mnt))
goto success;
}
@@ -339,7 +347,7 @@ success:
free_full_path:
kfree(full_path);
cdda_exit:
- cFYI(1, "leaving %s" , __func__);
+ cifs_dbg(FYI, "leaving %s\n" , __func__);
return mnt;
}
@@ -350,11 +358,11 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path)
{
struct vfsmount *newmnt;
- cFYI(1, "in %s", __func__);
+ cifs_dbg(FYI, "in %s\n", __func__);
newmnt = cifs_dfs_do_automount(path->dentry);
if (IS_ERR(newmnt)) {
- cFYI(1, "leaving %s [automount failed]" , __func__);
+ cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__);
return newmnt;
}
@@ -362,7 +370,7 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path)
mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
schedule_delayed_work(&cifs_dfs_automount_task,
cifs_dfs_mountpoint_expiry_timeout);
- cFYI(1, "leaving %s [ok]" , __func__);
+ cifs_dbg(FYI, "leaving %s [ok]\n" , __func__);
return newmnt;
}
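
The devname construction above can be exercised outside the kernel. A stand-alone sketch, assuming convert_delimiter() can be approximated by a simple character loop, shows how a UNC plus optional prepath becomes the mount device string:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* user-space analogue of cifs_build_devname(): strip leading/trailing '\',
 * prepend "//", append the prepath, then flip every '\' to '/' */
static char *build_devname(const char *nodename, const char *prepath)
{
	size_t pplen, unclen;
	char *dev, *pos;

	nodename += strspn(nodename, "\\");		/* skip leading delimiters */
	if (!*nodename)
		return NULL;

	unclen = strlen(nodename);
	while (unclen && nodename[unclen - 1] == '\\')
		--unclen;				/* trim trailing delimiters */

	pplen = prepath ? strlen(prepath) : 0;
	dev = malloc(2 + unclen + 1 + pplen + 1);	/* "//" + UNC + '/' + prepath + NUL */
	if (!dev)
		return NULL;

	pos = dev;
	*pos++ = '/';
	*pos++ = '/';
	memcpy(pos, nodename, unclen);
	pos += unclen;
	if (pplen) {
		*pos++ = '/';
		memcpy(pos, prepath, pplen);
		pos += pplen;
	}
	*pos = '\0';

	for (pos = dev; *pos; pos++)			/* convert_delimiter(dev, '/') */
		if (*pos == '\\')
			*pos = '/';
	return dev;
}

int main(void)
{
	char *dev = build_devname("\\\\server\\share\\", "sub\\dir");

	printf("%s\n", dev ? dev : "(error)");		/* //server/share/sub/dir */
	free(dev);
	return 0;
}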
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index ac51cd2d33a..9409fa10bd5 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -41,6 +41,10 @@
#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */
#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
+#define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */
+#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
+#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
+#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
struct cifs_sb_info {
struct rb_root tlink_tree;
@@ -51,17 +55,16 @@ struct cifs_sb_info {
unsigned int wsize;
unsigned long actimeo; /* attribute cache timeout (jiffies) */
atomic_t active;
- uid_t mnt_uid;
- gid_t mnt_gid;
- mode_t mnt_file_mode;
- mode_t mnt_dir_mode;
+ kuid_t mnt_uid;
+ kgid_t mnt_gid;
+ kuid_t mnt_backupuid;
+ kgid_t mnt_backupgid;
+ umode_t mnt_file_mode;
+ umode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
- int prepathlen;
- char *prepath; /* relative path under the share to mount to */
-#ifdef CONFIG_CIFS_DFS_UPCALL
- char *mountdata; /* mount options received at mount time */
-#endif
+ char *mountdata; /* options received at mount time or via DFS refs */
struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
+ struct rcu_head rcu;
};
#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 4dfba828316..a3e93254761 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -31,18 +31,17 @@
/* create a new cifs key */
static int
-cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen)
+cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
{
char *payload;
int ret;
ret = -ENOMEM;
- payload = kmalloc(datalen, GFP_KERNEL);
+ payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
if (!payload)
goto error;
/* attach the data */
- memcpy(payload, data, datalen);
key->payload.data = payload;
ret = 0;
@@ -95,7 +94,7 @@ struct key_type cifs_spnego_key_type = {
/* get a key struct with a SPNEGO security blob, suitable for session setup */
struct key *
-cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
+cifs_get_spnego_key(struct cifs_ses *sesInfo)
{
struct TCP_Server_Info *server = sesInfo->server;
struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
@@ -113,9 +112,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
MAX_MECH_STR_LEN +
UID_KEY_LEN + (sizeof(uid_t) * 2) +
CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
- USER_KEY_LEN + strlen(sesInfo->userName) +
PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
+ if (sesInfo->user_name)
+ desc_len += USER_KEY_LEN + strlen(sesInfo->user_name);
+
spnego_key = ERR_PTR(-ENOMEM);
description = kzalloc(desc_len, GFP_KERNEL);
if (description == NULL)
@@ -147,18 +148,22 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
goto out;
dp = description + strlen(description);
- sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
+ sprintf(dp, ";uid=0x%x",
+ from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
dp = description + strlen(description);
- sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
+ sprintf(dp, ";creduid=0x%x",
+ from_kuid_munged(&init_user_ns, sesInfo->cred_uid));
- dp = description + strlen(description);
- sprintf(dp, ";user=%s", sesInfo->userName);
+ if (sesInfo->user_name) {
+ dp = description + strlen(description);
+ sprintf(dp, ";user=%s", sesInfo->user_name);
+ }
dp = description + strlen(description);
sprintf(dp, ";pid=0x%x", current->pid);
- cFYI(1, "key description = %s", description);
+ cifs_dbg(FYI, "key description = %s\n", description);
spnego_key = request_key(&cifs_spnego_key_type, description, "");
#ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index e4041ec4d71..31bef9ee078 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -41,7 +41,7 @@ struct cifs_spnego_msg {
#ifdef __KERNEL__
extern struct key_type cifs_spnego_key_type;
-extern struct key *cifs_get_spnego_key(struct cifsSesInfo *sesInfo);
+extern struct key *cifs_get_spnego_key(struct cifs_ses *sesInfo);
#endif /* KERNEL */
#endif /* _CIFS_SPNEGO_H */
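
The key description used by cifs_get_spnego_key() is assembled by repeated sprintf() calls at the current end of the buffer, with the user= field now emitted only when a user name is present. A user-space sketch of just the trailing fields visible in the hunk above (the fields the real function builds before these are left out, and the values are arbitrary examples):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char description[128] = "";
	char *dp;
	const char *user_name = "testuser";	/* may be NULL on anonymous mounts */
	unsigned int uid = 1000, creduid = 1000, pid = 4242;

	dp = description + strlen(description);
	sprintf(dp, ";uid=0x%x", uid);

	dp = description + strlen(description);
	sprintf(dp, ";creduid=0x%x", creduid);

	if (user_name) {
		dp = description + strlen(description);
		sprintf(dp, ";user=%s", user_name);
	}

	dp = description + strlen(description);
	sprintf(dp, ";pid=0x%x", pid);

	printf("key description = %s\n", description);
	return 0;
}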
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index fc0fd4fde30..15e9505aa35 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -27,17 +27,17 @@
#include "cifs_debug.h"
/*
- * cifs_ucs2_bytes - how long will a string be after conversion?
- * @ucs - pointer to input string
+ * cifs_utf16_bytes - how long will a string be after conversion?
+ * @utf16 - pointer to input string
* @maxbytes - don't go past this many bytes of input string
* @codepage - destination codepage
*
- * Walk a ucs2le string and return the number of bytes that the string will
+ * Walk a utf16le string and return the number of bytes that the string will
* be after being converted to the given charset, not including any null
* termination required. Don't walk past maxbytes in the source buffer.
*/
int
-cifs_ucs2_bytes(const __le16 *from, int maxbytes,
+cifs_utf16_bytes(const __le16 *from, int maxbytes,
const struct nls_table *codepage)
{
int i;
@@ -90,7 +90,7 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
case UNI_COLON:
*target = ':';
break;
- case UNI_ASTERIK:
+ case UNI_ASTERISK:
*target = '*';
break;
case UNI_QUESTION:
@@ -122,7 +122,7 @@ cp_convert:
}
/*
- * cifs_from_ucs2 - convert utf16le string to local charset
+ * cifs_from_utf16 - convert utf16le string to local charset
* @to - destination buffer
* @from - source buffer
* @tolen - destination buffer size (in bytes)
@@ -130,7 +130,7 @@ cp_convert:
* @codepage - codepage to which characters should be converted
* @mapchar - should characters be remapped according to the mapchars option?
*
- * Convert a little-endian ucs2le string (as sent by the server) to a string
+ * Convert a little-endian utf16le string (as sent by the server) to a string
* in the provided codepage. The tolen and fromlen parameters are to ensure
* that the code doesn't walk off of the end of the buffer (which is always
* a danger if the alignment of the source buffer is off). The destination
@@ -139,12 +139,12 @@ cp_convert:
* null terminator).
*
* Note that some windows versions actually send multiword UTF-16 characters
- * instead of straight UCS-2. The linux nls routines however aren't able to
+ * instead of straight UTF16-2. The linux nls routines however aren't able to
* deal with those characters properly. In the event that we get some of
* those characters, they won't be translated properly.
*/
int
-cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
+cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
const struct nls_table *codepage, bool mapchar)
{
int i, charlen, safelen;
@@ -190,24 +190,45 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
}
/*
- * NAME: cifs_strtoUCS()
+ * NAME: cifs_strtoUTF16()
*
* FUNCTION: Convert character string to unicode string
*
*/
int
-cifs_strtoUCS(__le16 *to, const char *from, int len,
+cifs_strtoUTF16(__le16 *to, const char *from, int len,
const struct nls_table *codepage)
{
int charlen;
int i;
wchar_t wchar_to; /* needed to quiet sparse */
+ /* special case for utf8 to handle no plane0 chars */
+ if (!strcmp(codepage->charset, "utf8")) {
+ /*
+ * convert utf8 -> utf16, we assume we have enough space
+ * as caller should have assumed conversion does not overflow
+ * in destination len is length in wchar_t units (16bits)
+ */
+ i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
+ (wchar_t *) to, len);
+
+ /* if success terminate and exit */
+ if (i >= 0)
+ goto success;
+ /*
+ * if fails fall back to UCS encoding as this
+ * function should not return negative values
+ * currently can fail only if source contains
+ * invalid encoded characters
+ */
+ }
+
for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
charlen = codepage->char2uni(from, len, &wchar_to);
if (charlen < 1) {
- cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
- *from, charlen);
+ cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n",
+ *from, charlen);
/* A question mark */
wchar_to = 0x003f;
charlen = 1;
@@ -215,12 +236,14 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
put_unaligned_le16(wchar_to, &to[i]);
}
+success:
put_unaligned_le16(0, &to[i]);
return i;
}
/*
- * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
+ * cifs_strndup_from_utf16 - copy a string from wire format to the local
+ * codepage
* @src - source string
* @maxlen - don't walk past this many bytes in the source string
* @is_unicode - is this a unicode string?
@@ -231,19 +254,19 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
* error.
*/
char *
-cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
- const struct nls_table *codepage)
+cifs_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode, const struct nls_table *codepage)
{
int len;
char *dst;
if (is_unicode) {
- len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
+ len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
len += nls_nullsize(codepage);
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return NULL;
- cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
+ cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
false);
} else {
len = strnlen(src, maxlen);
@@ -264,40 +287,41 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
* names are little endian 16 bit Unicode on the wire
*/
int
-cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
+cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
const struct nls_table *cp, int mapChars)
{
- int i, j, charlen;
- int len_remaining = maxlen;
+ int i, charlen;
+ int j = 0;
char src_char;
- __u16 temp;
+ __le16 dst_char;
+ wchar_t tmp;
if (!mapChars)
- return cifs_strtoUCS(target, source, PATH_MAX, cp);
+ return cifs_strtoUTF16(target, source, PATH_MAX, cp);
- for (i = 0, j = 0; i < maxlen; j++) {
+ for (i = 0; i < srclen; j++) {
src_char = source[i];
+ charlen = 1;
switch (src_char) {
case 0:
- put_unaligned_le16(0, &target[j]);
- goto ctoUCS_out;
+ goto ctoUTF16_out;
case ':':
- temp = UNI_COLON;
+ dst_char = cpu_to_le16(UNI_COLON);
break;
case '*':
- temp = UNI_ASTERIK;
+ dst_char = cpu_to_le16(UNI_ASTERISK);
break;
case '?':
- temp = UNI_QUESTION;
+ dst_char = cpu_to_le16(UNI_QUESTION);
break;
case '<':
- temp = UNI_LESSTHAN;
+ dst_char = cpu_to_le16(UNI_LESSTHAN);
break;
case '>':
- temp = UNI_GRTRTHAN;
+ dst_char = cpu_to_le16(UNI_GRTRTHAN);
break;
case '|':
- temp = UNI_PIPE;
+ dst_char = cpu_to_le16(UNI_PIPE);
break;
/*
* FIXME: We can not handle remapping backslash (UNI_SLASH)
@@ -305,31 +329,88 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
* as they use backslash as separator.
*/
default:
- charlen = cp->char2uni(source+i, len_remaining,
- &temp);
+ charlen = cp->char2uni(source + i, srclen - i, &tmp);
+ dst_char = cpu_to_le16(tmp);
+
/*
* if no match, use question mark, which at least in
* some cases serves as wild card
*/
if (charlen < 1) {
- temp = 0x003f;
+ dst_char = cpu_to_le16(0x003f);
charlen = 1;
}
- len_remaining -= charlen;
- /*
- * character may take more than one byte in the source
- * string, but will take exactly two bytes in the
- * target string
- */
- i += charlen;
- continue;
}
- put_unaligned_le16(temp, &target[j]);
- i++; /* move to next char in source string */
- len_remaining--;
+ /*
+ * character may take more than one byte in the source string,
+ * but will take exactly two bytes in the target string
+ */
+ i += charlen;
+ put_unaligned(dst_char, &target[j]);
}
-ctoUCS_out:
- return i;
+ctoUTF16_out:
+ put_unaligned(0, &target[j]); /* Null terminate target unicode string */
+ return j;
+}
+
+#ifdef CONFIG_CIFS_SMB2
+/*
+ * cifs_local_to_utf16_bytes - how long will a string be after conversion?
+ * @from - pointer to input string
+ * @maxbytes - don't go past this many bytes of input string
+ * @codepage - source codepage
+ *
+ * Walk a string and return the number of bytes that the string will
+ * be after being converted to the given charset, not including any null
+ * termination required. Don't walk past maxbytes in the source buffer.
+ */
+
+static int
+cifs_local_to_utf16_bytes(const char *from, int len,
+ const struct nls_table *codepage)
+{
+ int charlen;
+ int i;
+ wchar_t wchar_to;
+
+ for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
+ charlen = codepage->char2uni(from, len, &wchar_to);
+ /* Failed conversion defaults to a question mark */
+ if (charlen < 1)
+ charlen = 1;
+ }
+ return 2 * i; /* UTF16 characters are two bytes */
}
+/*
+ * cifs_strndup_to_utf16 - copy a string to wire format from the local codepage
+ * @src - source string
+ * @maxlen - don't walk past this many bytes in the source string
+ * @utf16_len - the length of the allocated string in bytes (including null)
+ * @cp - source codepage
+ * @remap - map special chars
+ *
+ * Take a string convert it from the local codepage to UTF16 and
+ * put it in a new buffer. Returns a pointer to the new string or NULL on
+ * error.
+ */
+__le16 *
+cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
+ const struct nls_table *cp, int remap)
+{
+ int len;
+ __le16 *dst;
+
+ len = cifs_local_to_utf16_bytes(src, maxlen, cp);
+ len += 2; /* NULL */
+ dst = kmalloc(len, GFP_KERNEL);
+ if (!dst) {
+ *utf16_len = 0;
+ return NULL;
+ }
+ cifsConvertToUTF16(dst, src, strlen(src), cp, remap);
+ *utf16_len = len;
+ return dst;
+}
+#endif /* CONFIG_CIFS_SMB2 */
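
The new cifs_strndup_to_utf16() allocates two bytes per source character plus a two-byte null terminator. A rough user-space analogue for a plain ASCII source on a little-endian host, skipping the nls codepage tables the real helper uses:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* each ASCII character becomes one 16-bit little-endian code unit */
static uint16_t *ascii_to_utf16le(const char *src, int *utf16_len)
{
	size_t n = strlen(src);
	uint16_t *dst = malloc(2 * n + 2);
	size_t i;

	if (!dst) {
		*utf16_len = 0;
		return NULL;
	}
	for (i = 0; i < n; i++)
		dst[i] = (uint16_t)(unsigned char)src[i];	/* LE host assumed */
	dst[n] = 0;						/* NUL terminator */
	*utf16_len = (int)(2 * n + 2);
	return dst;
}

int main(void)
{
	int len;
	uint16_t *buf = ascii_to_utf16le("srv\\share", &len);

	printf("utf16_len = %d bytes\n", len);	/* 2*9 + 2 = 20 */
	free(buf);
	return 0;
}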
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 7fe6b52df50..d8eac3b6cef 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -44,7 +44,7 @@
* reserved symbols (along with \ and /), otherwise illegal to store
* in filenames in NTFS
*/
-#define UNI_ASTERIK (__u16) ('*' + 0xF000)
+#define UNI_ASTERISK (__u16) ('*' + 0xF000)
#define UNI_QUESTION (__u16) ('?' + 0xF000)
#define UNI_COLON (__u16) (':' + 0xF000)
#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
@@ -74,16 +74,25 @@ extern const struct UniCaseRange CifsUniLowerRange[];
#endif /* UNIUPR_NOLOWER */
#ifdef __KERNEL__
-int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
- const struct nls_table *codepage, bool mapchar);
-int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
- const struct nls_table *codepage);
-int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
-char *cifs_strndup_from_ucs(const char *src, const int maxlen,
- const bool is_unicode,
- const struct nls_table *codepage);
+int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+ const struct nls_table *codepage, bool mapchar);
+int cifs_utf16_bytes(const __le16 *from, int maxbytes,
+ const struct nls_table *codepage);
+int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
+char *cifs_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode,
+ const struct nls_table *codepage);
+extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
+ const struct nls_table *cp, int mapChars);
+#ifdef CONFIG_CIFS_SMB2
+extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
+ int *utf16_len, const struct nls_table *cp,
+ int remap);
+#endif /* CONFIG_CIFS_SMB2 */
#endif
+wchar_t cifs_toupper(wchar_t in);
+
/*
* UniStrcat: Concatenate the second string to the first
*
@@ -320,14 +329,14 @@ UniToupper(register wchar_t uc)
/*
* UniStrupr: Upper case a unicode string
*/
-static inline wchar_t *
-UniStrupr(register wchar_t *upin)
+static inline __le16 *
+UniStrupr(register __le16 *upin)
{
- register wchar_t *up;
+ register __le16 *up;
up = upin;
while (*up) { /* For all characters */
- *up = UniToupper(*up);
+ *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
up++;
}
return upin; /* Return input pointer */
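
The UNI_ASTERISK, UNI_COLON, and related values above shift NTFS-reserved characters into a private 0xF0xx range; this is the remapping cifsConvertToUTF16() applies when mapchars is set (backslash and slash are deliberately left alone, as the FIXME in the conversion routine notes). A small illustration of the mapping:

#include <stdio.h>

/* same arithmetic as the UNI_* defines: reserved char + 0xF000 */
#define UNI_REMAP(c)	((unsigned short)((c) + 0xF000))

int main(void)
{
	const char reserved[] = { ':', '*', '?', '<', '>', '|' };
	size_t i;

	for (i = 0; i < sizeof(reserved); i++)
		printf("'%c' -> 0x%04x\n", reserved[i], UNI_REMAP(reserved[i]));
	return 0;
}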
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index beeebf19423..7ff866dbb89 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -23,97 +23,147 @@
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsacl.h"
#include "cifsproto.h"
#include "cifs_debug.h"
-
-static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
- {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
- {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
-;
-
-
/* security id for everyone/world system group */
static const struct cifs_sid sid_everyone = {
1, 1, {0, 0, 0, 0, 0, 1}, {0} };
/* security id for Authenticated Users system group */
static const struct cifs_sid sid_authusers = {
- 1, 1, {0, 0, 0, 0, 0, 5}, {11} };
+ 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} };
/* group users */
static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
+static const struct cred *root_cred;
-int match_sid(struct cifs_sid *ctsid)
+static int
+cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
{
- int i, j;
- int num_subauth, num_sat, num_saw;
- struct cifs_sid *cwsid;
+ char *payload;
+
+ /*
+ * If the payload is less than or equal to the size of a pointer, then
+ * an allocation here is wasteful. Just copy the data directly to the
+ * payload.value union member instead.
+ *
+ * With this however, you must check the datalen before trying to
+ * dereference payload.data!
+ */
+ if (prep->datalen <= sizeof(key->payload)) {
+ key->payload.value = 0;
+ memcpy(&key->payload.value, prep->data, prep->datalen);
+ key->datalen = prep->datalen;
+ return 0;
+ }
+ payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
+ if (!payload)
+ return -ENOMEM;
- if (!ctsid)
- return -1;
+ key->payload.data = payload;
+ key->datalen = prep->datalen;
+ return 0;
+}
- for (i = 0; i < NUM_WK_SIDS; ++i) {
- cwsid = &(wksidarr[i].cifssid);
+static inline void
+cifs_idmap_key_destroy(struct key *key)
+{
+ if (key->datalen > sizeof(key->payload))
+ kfree(key->payload.data);
+}
- /* compare the revision */
- if (ctsid->revision != cwsid->revision)
- continue;
+static struct key_type cifs_idmap_key_type = {
+ .name = "cifs.idmap",
+ .instantiate = cifs_idmap_key_instantiate,
+ .destroy = cifs_idmap_key_destroy,
+ .describe = user_describe,
+ .match = user_match,
+};
- /* compare all of the six auth values */
- for (j = 0; j < 6; ++j) {
- if (ctsid->authority[j] != cwsid->authority[j])
- break;
- }
- if (j < 6)
- continue; /* all of the auth values did not match */
-
- /* compare all of the subauth values if any */
- num_sat = ctsid->num_subauth;
- num_saw = cwsid->num_subauth;
- num_subauth = num_sat < num_saw ? num_sat : num_saw;
- if (num_subauth) {
- for (j = 0; j < num_subauth; ++j) {
- if (ctsid->sub_auth[j] != cwsid->sub_auth[j])
- break;
- }
- if (j < num_subauth)
- continue; /* all sub_auth values do not match */
- }
+static char *
+sid_to_key_str(struct cifs_sid *sidptr, unsigned int type)
+{
+ int i, len;
+ unsigned int saval;
+ char *sidstr, *strptr;
+ unsigned long long id_auth_val;
+
+ /* 3 bytes for prefix */
+ sidstr = kmalloc(3 + SID_STRING_BASE_SIZE +
+ (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth),
+ GFP_KERNEL);
+ if (!sidstr)
+ return sidstr;
+
+ strptr = sidstr;
+ len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g',
+ sidptr->revision);
+ strptr += len;
+
+ /* The authority field is a single 48-bit number */
+ id_auth_val = (unsigned long long)sidptr->authority[5];
+ id_auth_val |= (unsigned long long)sidptr->authority[4] << 8;
+ id_auth_val |= (unsigned long long)sidptr->authority[3] << 16;
+ id_auth_val |= (unsigned long long)sidptr->authority[2] << 24;
+ id_auth_val |= (unsigned long long)sidptr->authority[1] << 32;
+ id_auth_val |= (unsigned long long)sidptr->authority[0] << 48;
+
+ /*
+ * MS-DTYP states that if the authority is >= 2^32, then it should be
+ * expressed as a hex value.
+ */
+ if (id_auth_val <= UINT_MAX)
+ len = sprintf(strptr, "-%llu", id_auth_val);
+ else
+ len = sprintf(strptr, "-0x%llx", id_auth_val);
+
+ strptr += len;
- cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
- return 0; /* sids compare/match */
+ for (i = 0; i < sidptr->num_subauth; ++i) {
+ saval = le32_to_cpu(sidptr->sub_auth[i]);
+ len = sprintf(strptr, "-%u", saval);
+ strptr += len;
}
- cFYI(1, "No matching sid");
- return -1;
+ return sidstr;
}
-/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
- the same returns 1, if they do not match returns 0 */
-int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
+/*
+ * if the two SIDs (roughly equivalent to a UUID for a user or group) are
+ * the same returns zero, if they do not match returns non-zero.
+ */
+static int
+compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
{
int i;
int num_subauth, num_sat, num_saw;
if ((!ctsid) || (!cwsid))
- return 0;
+ return 1;
/* compare the revision */
- if (ctsid->revision != cwsid->revision)
- return 0;
+ if (ctsid->revision != cwsid->revision) {
+ if (ctsid->revision > cwsid->revision)
+ return 1;
+ else
+ return -1;
+ }
/* compare all of the six auth values */
- for (i = 0; i < 6; ++i) {
- if (ctsid->authority[i] != cwsid->authority[i])
- return 0;
+ for (i = 0; i < NUM_AUTHS; ++i) {
+ if (ctsid->authority[i] != cwsid->authority[i]) {
+ if (ctsid->authority[i] > cwsid->authority[i])
+ return 1;
+ else
+ return -1;
+ }
}
/* compare all of the subauth values if any */
@@ -122,21 +172,236 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
num_subauth = num_sat < num_saw ? num_sat : num_saw;
if (num_subauth) {
for (i = 0; i < num_subauth; ++i) {
- if (ctsid->sub_auth[i] != cwsid->sub_auth[i])
- return 0;
+ if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
+ if (le32_to_cpu(ctsid->sub_auth[i]) >
+ le32_to_cpu(cwsid->sub_auth[i]))
+ return 1;
+ else
+ return -1;
+ }
}
}
- return 1; /* sids compare/match */
+ return 0; /* sids compare/match */
+}
+
+static void
+cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
+{
+ int i;
+
+ dst->revision = src->revision;
+ dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES);
+ for (i = 0; i < NUM_AUTHS; ++i)
+ dst->authority[i] = src->authority[i];
+ for (i = 0; i < dst->num_subauth; ++i)
+ dst->sub_auth[i] = src->sub_auth[i];
}
+static int
+id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
+{
+ int rc;
+ struct key *sidkey;
+ struct cifs_sid *ksid;
+ unsigned int ksid_size;
+ char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */
+ const struct cred *saved_cred;
+
+ rc = snprintf(desc, sizeof(desc), "%ci:%u",
+ sidtype == SIDOWNER ? 'o' : 'g', cid);
+ if (rc >= sizeof(desc))
+ return -EINVAL;
+
+ rc = 0;
+ saved_cred = override_creds(root_cred);
+ sidkey = request_key(&cifs_idmap_key_type, desc, "");
+ if (IS_ERR(sidkey)) {
+ rc = -EINVAL;
+ cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n",
+ __func__, sidtype == SIDOWNER ? 'u' : 'g', cid);
+ goto out_revert_creds;
+ } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) {
+ rc = -EIO;
+ cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n",
+ __func__, sidkey->datalen);
+ goto invalidate_key;
+ }
+
+ /*
+ * A sid is usually too large to be embedded in payload.value, but if
+ * there are no subauthorities and the host has 8-byte pointers, then
+ * it could be.
+ */
+ ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
+ (struct cifs_sid *)&sidkey->payload.value :
+ (struct cifs_sid *)sidkey->payload.data;
+
+ ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
+ if (ksid_size > sidkey->datalen) {
+ rc = -EIO;
+ cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu, ksid_size=%u)\n",
+ __func__, sidkey->datalen, ksid_size);
+ goto invalidate_key;
+ }
+
+ cifs_copy_sid(ssid, ksid);
+out_key_put:
+ key_put(sidkey);
+out_revert_creds:
+ revert_creds(saved_cred);
+ return rc;
+
+invalidate_key:
+ key_invalidate(sidkey);
+ goto out_key_put;
+}
+
+static int
+sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
+ struct cifs_fattr *fattr, uint sidtype)
+{
+ int rc;
+ struct key *sidkey;
+ char *sidstr;
+ const struct cred *saved_cred;
+ kuid_t fuid = cifs_sb->mnt_uid;
+ kgid_t fgid = cifs_sb->mnt_gid;
+
+ /*
+ * If we have too many subauthorities, then something is really wrong.
+ * Just return an error.
+ */
+ if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
+ cifs_dbg(FYI, "%s: %u subauthorities is too many!\n",
+ __func__, psid->num_subauth);
+ return -EIO;
+ }
+
+ sidstr = sid_to_key_str(psid, sidtype);
+ if (!sidstr)
+ return -ENOMEM;
+
+ saved_cred = override_creds(root_cred);
+ sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
+ if (IS_ERR(sidkey)) {
+ rc = -EINVAL;
+ cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n",
+ __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g');
+ goto out_revert_creds;
+ }
+
+ /*
+ * FIXME: Here we assume that uid_t and gid_t are same size. It's
+ * probably a safe assumption but might be better to check based on
+ * sidtype.
+ */
+ BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
+ if (sidkey->datalen != sizeof(uid_t)) {
+ rc = -EIO;
+ cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n",
+ __func__, sidkey->datalen);
+ key_invalidate(sidkey);
+ goto out_key_put;
+ }
+
+ if (sidtype == SIDOWNER) {
+ kuid_t uid;
+ uid_t id;
+ memcpy(&id, &sidkey->payload.value, sizeof(uid_t));
+ uid = make_kuid(&init_user_ns, id);
+ if (uid_valid(uid))
+ fuid = uid;
+ } else {
+ kgid_t gid;
+ gid_t id;
+ memcpy(&id, &sidkey->payload.value, sizeof(gid_t));
+ gid = make_kgid(&init_user_ns, id);
+ if (gid_valid(gid))
+ fgid = gid;
+ }
+
+out_key_put:
+ key_put(sidkey);
+out_revert_creds:
+ revert_creds(saved_cred);
+ kfree(sidstr);
+
+ /*
+ * Note that we return 0 here unconditionally. If the mapping
+ * fails then we just fall back to using the mnt_uid/mnt_gid.
+ */
+ if (sidtype == SIDOWNER)
+ fattr->cf_uid = fuid;
+ else
+ fattr->cf_gid = fgid;
+ return 0;
+}
+
+int
+init_cifs_idmap(void)
+{
+ struct cred *cred;
+ struct key *keyring;
+ int ret;
+
+ cifs_dbg(FYI, "Registering the %s key type\n",
+ cifs_idmap_key_type.name);
+
+ /* create an override credential set with a special thread keyring in
+ * which requests are cached
+ *
+ * this is used to prevent malicious redirections from being installed
+ * with add_key().
+ */
+ cred = prepare_kernel_cred(NULL);
+ if (!cred)
+ return -ENOMEM;
+
+ keyring = keyring_alloc(".cifs_idmap",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ if (IS_ERR(keyring)) {
+ ret = PTR_ERR(keyring);
+ goto failed_put_cred;
+ }
+
+ ret = register_key_type(&cifs_idmap_key_type);
+ if (ret < 0)
+ goto failed_put_key;
+
+ /* instruct request_key() to use this special keyring as a cache for
+ * the results it looks up */
+ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
+ cred->thread_keyring = keyring;
+ cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+ root_cred = cred;
+
+ cifs_dbg(FYI, "cifs idmap keyring: %d\n", key_serial(keyring));
+ return 0;
+
+failed_put_key:
+ key_put(keyring);
+failed_put_cred:
+ put_cred(cred);
+ return ret;
+}
+
+void
+exit_cifs_idmap(void)
+{
+ key_revoke(root_cred->thread_keyring);
+ unregister_key_type(&cifs_idmap_key_type);
+ put_cred(root_cred);
+ cifs_dbg(FYI, "Unregistered %s key type\n", cifs_idmap_key_type.name);
+}
/* copy ntsd, owner sid, and group sid from a security descriptor to another */
static void copy_sec_desc(const struct cifs_ntsd *pntsd,
struct cifs_ntsd *pnntsd, __u32 sidsoffset)
{
- int i;
-
struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
@@ -152,26 +417,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd,
owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
-
- nowner_sid_ptr->revision = owner_sid_ptr->revision;
- nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth;
- for (i = 0; i < 6; i++)
- nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i];
- for (i = 0; i < 5; i++)
- nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i];
+ cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr);
/* copy group sid */
group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
sizeof(struct cifs_sid));
-
- ngroup_sid_ptr->revision = group_sid_ptr->revision;
- ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth;
- for (i = 0; i < 6; i++)
- ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
- for (i = 0; i < 5; i++)
- ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
+ cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr);
return;
}
@@ -209,14 +462,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
*pbits_to_set &= ~S_IXUGO;
return;
} else if (type != ACCESS_ALLOWED) {
- cERROR(1, "unknown access control type %d", type);
+ cifs_dbg(VFS, "unknown access control type %d\n", type);
return;
}
/* else ACCESS_ALLOWED type */
if (flags & GENERIC_ALL) {
*pmode |= (S_IRWXUGO & (*pbits_to_set));
- cFYI(DBG2, "all perms");
+ cifs_dbg(NOISY, "all perms\n");
return;
}
if ((flags & GENERIC_WRITE) ||
@@ -229,7 +482,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
*pmode |= (S_IXUGO & (*pbits_to_set));
- cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
+ cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode);
return;
}
@@ -258,7 +511,8 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
if (mode & S_IXUGO)
*pace_flags |= SET_FILE_EXEC_RIGHTS;
- cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
+ cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n",
+ mode, *pace_flags);
return;
}
@@ -278,7 +532,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
pntace->sid.revision = psid->revision;
pntace->sid.num_subauth = psid->num_subauth;
- for (i = 0; i < 6; i++)
+ for (i = 0; i < NUM_AUTHS; i++)
pntace->sid.authority[i] = psid->authority[i];
for (i = 0; i < psid->num_subauth; i++)
pntace->sid.sub_auth[i] = psid->sub_auth[i];
@@ -298,24 +552,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
/* validate that we do not go past end of acl */
if (le16_to_cpu(pace->size) < 16) {
- cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
+ cifs_dbg(VFS, "ACE too small %d\n", le16_to_cpu(pace->size));
return;
}
if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
- cERROR(1, "ACL too small to parse ACE");
+ cifs_dbg(VFS, "ACL too small to parse ACE\n");
return;
}
num_subauth = pace->sid.num_subauth;
if (num_subauth) {
int i;
- cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
- pace->sid.revision, pace->sid.num_subauth, pace->type,
- pace->flags, le16_to_cpu(pace->size));
+ cifs_dbg(FYI, "ACE revision %d num_auth %d type %d flags %d size %d\n",
+ pace->sid.revision, pace->sid.num_subauth, pace->type,
+ pace->flags, le16_to_cpu(pace->size));
for (i = 0; i < num_subauth; ++i) {
- cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
- le32_to_cpu(pace->sid.sub_auth[i]));
+ cifs_dbg(FYI, "ACE sub_auth[%d]: 0x%x\n",
+ i, le32_to_cpu(pace->sid.sub_auth[i]));
}
/* BB add length check to make sure that we do not have huge
@@ -348,13 +602,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
/* validate that we do not go past end of acl */
if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
- cERROR(1, "ACL too small to parse DACL");
+ cifs_dbg(VFS, "ACL too small to parse DACL\n");
return;
}
- cFYI(DBG2, "DACL revision %d size %d num aces %d",
- le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
- le32_to_cpu(pdacl->num_aces));
+ cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n",
+ le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
+ le32_to_cpu(pdacl->num_aces));
/* reset rwx permissions for user/group/other.
Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -365,39 +619,39 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
acl_size = sizeof(struct cifs_acl);
num_aces = le32_to_cpu(pdacl->num_aces);
- if (num_aces > 0) {
+ if (num_aces > 0) {
umode_t user_mask = S_IRWXU;
umode_t group_mask = S_IRWXG;
umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
+ if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *))
+ return;
ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
GFP_KERNEL);
- if (!ppace) {
- cERROR(1, "DACL memory allocation error");
+ if (!ppace)
return;
- }
for (i = 0; i < num_aces; ++i) {
ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
#ifdef CONFIG_CIFS_DEBUG2
dump_ace(ppace[i], end_of_acl);
#endif
- if (compare_sids(&(ppace[i]->sid), pownersid))
+ if (compare_sids(&(ppace[i]->sid), pownersid) == 0)
access_flags_to_mode(ppace[i]->access_req,
ppace[i]->type,
&fattr->cf_mode,
&user_mask);
- if (compare_sids(&(ppace[i]->sid), pgrpsid))
+ if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0)
access_flags_to_mode(ppace[i]->access_req,
ppace[i]->type,
&fattr->cf_mode,
&group_mask);
- if (compare_sids(&(ppace[i]->sid), &sid_everyone))
+ if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0)
access_flags_to_mode(ppace[i]->access_req,
ppace[i]->type,
&fattr->cf_mode,
&other_mask);
- if (compare_sids(&(ppace[i]->sid), &sid_authusers))
+ if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0)
access_flags_to_mode(ppace[i]->access_req,
ppace[i]->type,
&fattr->cf_mode,
@@ -448,37 +702,37 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
/* validate that we do not go past end of ACL - sid must be at least 8
bytes long (assuming no sub-auths - e.g. the null SID */
if (end_of_acl < (char *)psid + 8) {
- cERROR(1, "ACL too small to parse SID %p", psid);
+ cifs_dbg(VFS, "ACL too small to parse SID %p\n", psid);
return -EINVAL;
}
- if (psid->num_subauth) {
#ifdef CONFIG_CIFS_DEBUG2
+ if (psid->num_subauth) {
int i;
- cFYI(1, "SID revision %d num_auth %d",
- psid->revision, psid->num_subauth);
+ cifs_dbg(FYI, "SID revision %d num_auth %d\n",
+ psid->revision, psid->num_subauth);
for (i = 0; i < psid->num_subauth; i++) {
- cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
- le32_to_cpu(psid->sub_auth[i]));
+ cifs_dbg(FYI, "SID sub_auth[%d]: 0x%x\n",
+ i, le32_to_cpu(psid->sub_auth[i]));
}
/* BB add length check to make sure that we do not have huge
num auths and therefore go off the end */
- cFYI(1, "RID 0x%x",
- le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
-#endif
+ cifs_dbg(FYI, "RID 0x%x\n",
+ le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
}
+#endif
return 0;
}
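
The parse_dacl() hunk above flips every compare_sids() call from a truth-value test to an explicit == 0 check. compare_sids() follows memcmp()-style ordering semantics, so 0 means "the SIDs match"; the old test therefore applied the owner/group/other masks to every ACE that did not match. A small standalone illustration of the convention (names are hypothetical):

#include <stdio.h>
#include <string.h>

/* memcmp-style comparator: 0 means the identifiers are equal */
static int compare_ids(const char *a, const char *b)
{
	return strcmp(a, b);
}

int main(void)
{
	const char *ace_sid = "S-1-5-32-545";
	const char *owner_sid = "S-1-5-32-545";

	if (compare_ids(ace_sid, owner_sid) == 0)	/* corrected test */
		printf("apply owner mask\n");
	if (compare_ids(ace_sid, owner_sid))		/* old, inverted test */
		printf("not reached when the SIDs match\n");
	return 0;
}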
/* Convert CIFS ACL to POSIX form */
-static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
- struct cifs_fattr *fattr)
+static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
+ struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr)
{
- int rc;
+ int rc = 0;
struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
struct cifs_acl *dacl_ptr; /* no need for SACL ptr */
char *end_of_acl = ((char *)pntsd) + acl_len;
@@ -493,93 +747,143 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
le32_to_cpu(pntsd->gsidoffset));
dacloffset = le32_to_cpu(pntsd->dacloffset);
dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
- cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
- "sacloffset 0x%x dacloffset 0x%x",
+ cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n",
pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
le32_to_cpu(pntsd->gsidoffset),
le32_to_cpu(pntsd->sacloffset), dacloffset);
/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
rc = parse_sid(owner_sid_ptr, end_of_acl);
- if (rc)
+ if (rc) {
+ cifs_dbg(FYI, "%s: Error %d parsing Owner SID\n", __func__, rc);
+ return rc;
+ }
+ rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER);
+ if (rc) {
+ cifs_dbg(FYI, "%s: Error %d mapping Owner SID to uid\n",
+ __func__, rc);
return rc;
+ }
rc = parse_sid(group_sid_ptr, end_of_acl);
- if (rc)
+ if (rc) {
+ cifs_dbg(FYI, "%s: Error %d mapping Owner SID to gid\n",
+ __func__, rc);
+ return rc;
+ }
+ rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP);
+ if (rc) {
+ cifs_dbg(FYI, "%s: Error %d mapping Group SID to gid\n",
+ __func__, rc);
return rc;
+ }
if (dacloffset)
parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
group_sid_ptr, fattr);
else
- cFYI(1, "no ACL"); /* BB grant all or default perms? */
-
-/* cifscred->uid = owner_sid_ptr->rid;
- cifscred->gid = group_sid_ptr->rid;
- memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr,
- sizeof(struct cifs_sid));
- memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
- sizeof(struct cifs_sid)); */
+ cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */
- return 0;
+ return rc;
}
-
/* Convert permission bits from mode to equivalent CIFS ACL */
static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
- struct inode *inode, __u64 nmode)
+ __u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag)
{
int rc = 0;
__u32 dacloffset;
__u32 ndacloffset;
__u32 sidsoffset;
struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
+ struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */
struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
- if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL))
- return -EIO;
-
- owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ if (nmode != NO_CHANGE_64) { /* chmod */
+ owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
- group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
-
- dacloffset = le32_to_cpu(pntsd->dacloffset);
- dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
-
- ndacloffset = sizeof(struct cifs_ntsd);
- ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
- ndacl_ptr->revision = dacl_ptr->revision;
- ndacl_ptr->size = 0;
- ndacl_ptr->num_aces = 0;
-
- rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode);
-
- sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
-
- /* copy security descriptor control portion and owner and group sid */
- copy_sec_desc(pntsd, pnntsd, sidsoffset);
+ dacloffset = le32_to_cpu(pntsd->dacloffset);
+ dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+ ndacloffset = sizeof(struct cifs_ntsd);
+ ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
+ ndacl_ptr->revision = dacl_ptr->revision;
+ ndacl_ptr->size = 0;
+ ndacl_ptr->num_aces = 0;
+
+ rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr,
+ nmode);
+ sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
+ /* copy sec desc control portion & owner and group sids */
+ copy_sec_desc(pntsd, pnntsd, sidsoffset);
+ *aclflag = CIFS_ACL_DACL;
+ } else {
+ memcpy(pnntsd, pntsd, secdesclen);
+ if (uid_valid(uid)) { /* chown */
+ uid_t id;
+ owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+ le32_to_cpu(pnntsd->osidoffset));
+ nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ GFP_KERNEL);
+ if (!nowner_sid_ptr)
+ return -ENOMEM;
+ id = from_kuid(&init_user_ns, uid);
+ rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);
+ if (rc) {
+ cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n",
+ __func__, rc, id);
+ kfree(nowner_sid_ptr);
+ return rc;
+ }
+ cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr);
+ kfree(nowner_sid_ptr);
+ *aclflag = CIFS_ACL_OWNER;
+ }
+ if (gid_valid(gid)) { /* chgrp */
+ gid_t id;
+ group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
+ le32_to_cpu(pnntsd->gsidoffset));
+ ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ GFP_KERNEL);
+ if (!ngroup_sid_ptr)
+ return -ENOMEM;
+ id = from_kgid(&init_user_ns, gid);
+ rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);
+ if (rc) {
+ cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n",
+ __func__, rc, id);
+ kfree(ngroup_sid_ptr);
+ return rc;
+ }
+ cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr);
+ kfree(ngroup_sid_ptr);
+ *aclflag = CIFS_ACL_GROUP;
+ }
+ }
return rc;
}
-static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
- __u16 fid, u32 *pacllen)
+struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
+ const struct cifs_fid *cifsfid, u32 *pacllen)
{
struct cifs_ntsd *pntsd = NULL;
- int xid, rc;
+ unsigned int xid;
+ int rc;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return ERR_CAST(tlink);
- xid = GetXid();
- rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
- FreeXid(xid);
+ xid = get_xid();
+ rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd,
+ pacllen);
+ free_xid(xid);
cifs_put_tlink(tlink);
- cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+ cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
if (rc)
return ERR_PTR(rc);
return pntsd;
@@ -590,29 +894,41 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
{
struct cifs_ntsd *pntsd = NULL;
int oplock = 0;
- int xid, rc;
- __u16 fid;
- struct cifsTconInfo *tcon;
+ unsigned int xid;
+ int rc, create_options = 0;
+ struct cifs_tcon *tcon;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
if (IS_ERR(tlink))
return ERR_CAST(tlink);
tcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = READ_CONTROL;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
- &fid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (!rc) {
- rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
- CIFSSMBClose(xid, tcon, fid);
+ rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen);
+ CIFSSMBClose(xid, tcon, fid.netfid);
}
cifs_put_tlink(tlink);
- FreeXid(xid);
+ free_xid(xid);
- cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+ cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
if (rc)
return ERR_PTR(rc);
return pntsd;
@@ -631,157 +947,174 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
if (!open_file)
return get_cifs_acl_by_path(cifs_sb, path, pacllen);
- pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
+ pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen);
cifsFileInfo_put(open_file);
return pntsd;
}
-static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
- struct cifs_ntsd *pnntsd, u32 acllen)
-{
- int xid, rc;
- struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
-
- if (IS_ERR(tlink))
- return PTR_ERR(tlink);
-
- xid = GetXid();
- rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
- FreeXid(xid);
- cifs_put_tlink(tlink);
-
- cFYI(DBG2, "SetCIFSACL rc = %d", rc);
- return rc;
-}
-
-static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
- struct cifs_ntsd *pnntsd, u32 acllen)
+ /* Set an ACL on the server */
+int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+ struct inode *inode, const char *path, int aclflag)
{
int oplock = 0;
- int xid, rc;
- __u16 fid;
- struct cifsTconInfo *tcon;
+ unsigned int xid;
+ int rc, access_flags, create_options = 0;
+ struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
- &fid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
+ access_flags = WRITE_OWNER;
+ else
+ access_flags = WRITE_DAC;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = access_flags;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc) {
- cERROR(1, "Unable to open file to set ACL");
+ cifs_dbg(VFS, "Unable to open file to set ACL\n");
goto out;
}
- rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
- cFYI(DBG2, "SetCIFSACL rc = %d", rc);
+ rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag);
+ cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc);
- CIFSSMBClose(xid, tcon, fid);
+ CIFSSMBClose(xid, tcon, fid.netfid);
out:
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
return rc;
}
-/* Set an ACL on the server */
-static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
- struct inode *inode, const char *path)
-{
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct cifsFileInfo *open_file;
- int rc;
-
- cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
-
- open_file = find_readable_file(CIFS_I(inode), true);
- if (!open_file)
- return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
-
- rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
- cifsFileInfo_put(open_file);
- return rc;
-}
-
/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
int
cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
- struct inode *inode, const char *path, const __u16 *pfid)
+ struct inode *inode, const char *path,
+ const struct cifs_fid *pfid)
{
struct cifs_ntsd *pntsd = NULL;
u32 acllen = 0;
int rc = 0;
+ struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_tcon *tcon;
- cFYI(DBG2, "converting ACL to mode for %s", path);
+ cifs_dbg(NOISY, "converting ACL to mode for %s\n", path);
- if (pfid)
- pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
- else
- pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+ if (pfid && (tcon->ses->server->ops->get_acl_by_fid))
+ pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid,
+ &acllen);
+ else if (tcon->ses->server->ops->get_acl)
+ pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+ &acllen);
+ else {
+ cifs_put_tlink(tlink);
+ return -EOPNOTSUPP;
+ }
/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
if (IS_ERR(pntsd)) {
rc = PTR_ERR(pntsd);
- cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+ cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
} else {
- rc = parse_sec_desc(pntsd, acllen, fattr);
+ rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr);
kfree(pntsd);
if (rc)
- cERROR(1, "parse sec desc failed rc = %d", rc);
+ cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc);
}
+ cifs_put_tlink(tlink);
+
return rc;
}
/* Convert mode bits to an ACL so we can update the ACL on the server */
-int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
+int
+id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
+ kuid_t uid, kgid_t gid)
{
int rc = 0;
+ int aclflag = CIFS_ACL_DACL; /* default flag to set */
__u32 secdesclen = 0;
struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_tcon *tcon;
- cFYI(DBG2, "set ACL from mode for %s", path);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+
+ cifs_dbg(NOISY, "set ACL from mode for %s\n", path);
/* Get the security descriptor */
- pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
- /* Add three ACEs for owner, group, everyone getting rid of
- other ACEs as chmod disables ACEs and set the security descriptor */
+ if (tcon->ses->server->ops->get_acl == NULL) {
+ cifs_put_tlink(tlink);
+ return -EOPNOTSUPP;
+ }
+ pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+ &secdesclen);
if (IS_ERR(pntsd)) {
rc = PTR_ERR(pntsd);
- cERROR(1, "%s: error %d getting sec desc", __func__, rc);
- } else {
- /* allocate memory for the smb header,
- set security descriptor request security descriptor
- parameters, and secuirty descriptor itself */
-
- secdesclen = secdesclen < DEFSECDESCLEN ?
- DEFSECDESCLEN : secdesclen;
- pnntsd = kmalloc(secdesclen, GFP_KERNEL);
- if (!pnntsd) {
- cERROR(1, "Unable to allocate security descriptor");
- kfree(pntsd);
- return -ENOMEM;
- }
+ cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
+ cifs_put_tlink(tlink);
+ return rc;
+ }
+
+ /*
+ * Add three ACEs for owner, group, everyone getting rid of other ACEs
+ * as chmod disables ACEs and set the security descriptor. Allocate
+ * memory for the smb header, set security descriptor request security
+ * descriptor parameters, and security descriptor itself
+ */
+ secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
+ pnntsd = kmalloc(secdesclen, GFP_KERNEL);
+ if (!pnntsd) {
+ kfree(pntsd);
+ cifs_put_tlink(tlink);
+ return -ENOMEM;
+ }
- rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
+ rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
+ &aclflag);
- cFYI(DBG2, "build_sec_desc rc: %d", rc);
+ cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
- if (!rc) {
- /* Set the security descriptor */
- rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
- cFYI(DBG2, "set_cifs_acl rc: %d", rc);
- }
+ if (tcon->ses->server->ops->set_acl == NULL)
+ rc = -EOPNOTSUPP;
- kfree(pnntsd);
- kfree(pntsd);
+ if (!rc) {
+ /* Set the security descriptor */
+ rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode,
+ path, aclflag);
+ cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
}
+ cifs_put_tlink(tlink);
+ kfree(pnntsd);
+ kfree(pntsd);
return rc;
}
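
The rewritten cifs_acl_to_fattr() and id_mode_to_cifs_acl() above no longer call the SMB1 helpers directly; they dispatch through tcon->ses->server->ops->get_acl / get_acl_by_fid / set_acl and return -EOPNOTSUPP when a dialect does not provide them. A reduced user-space model of that dispatch pattern (all names below are illustrative, not the kernel's):

#include <stdio.h>

struct acl_ops {
	int (*get_acl)(const char *path);
	int (*set_acl)(const char *path, int aclflag);
};

static int smb1_get_acl(const char *path)
{
	printf("fetching security descriptor for %s\n", path);
	return 0;
}

int main(void)
{
	/* an SMB1-style ops table; another dialect would wire in its own */
	struct acl_ops ops = { .get_acl = smb1_get_acl, .set_acl = NULL };

	if (!ops.get_acl)
		return 1;			/* -EOPNOTSUPP in the kernel */
	ops.get_acl("/share/file");

	if (!ops.set_acl)
		printf("set_acl unsupported on this dialect\n");
	return 0;
}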
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index c4ae7d03656..4f388483526 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -23,11 +23,8 @@
#define _CIFSACL_H
-#define NUM_AUTHS 6 /* number of authority fields */
-#define NUM_SUBAUTHS 5 /* number of sub authority fields */
-#define NUM_WK_SIDS 7 /* number of well known sids */
-#define SIDNAMELENGTH 20 /* long enough for the ones we care about */
-#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */
+#define NUM_AUTHS (6) /* number of authority fields */
+#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
#define READ_BIT 0x4
#define WRITE_BIT 0x2
@@ -39,6 +36,35 @@
#define ACCESS_ALLOWED 0
#define ACCESS_DENIED 1
+#define SIDOWNER 1
+#define SIDGROUP 2
+
+/*
+ * Security Descriptor length containing DACL with 3 ACEs (one each for
+ * owner, group and world).
+ */
+#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \
+ sizeof(struct cifs_acl) + \
+ (sizeof(struct cifs_ace) * 3))
+
+/*
+ * Maximum size of a string representation of a SID:
+ *
+ * The fields are unsigned values in decimal. So:
+ *
+ * u8: max 3 bytes in decimal
+ * u32: max 10 bytes in decimal
+ *
+ * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
+ *
+ * For authority field, max is when all 6 values are non-zero and it must be
+ * represented in hex. So "-0x" + 12 hex digits.
+ *
+ * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
+ */
+#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
+#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
+
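
As a worked check of the sizing comment above: the base portion is 2 ("S-") + 3 (revision, a u8 in decimal) + 15 ("-0x" plus 12 hex digits of authority) + 1 (NUL) = 21 bytes, and each sub-authority adds at most 11 more. With the 15-entry SID_MAX_SUB_AUTHORITIES cap this bounds the textual form of any SID:

#include <stdio.h>

#define SID_MAX_SUB_AUTHORITIES	15
#define SID_STRING_BASE_SIZE	(2 + 3 + 15 + 1)	/* values from the hunk above */
#define SID_STRING_SUBAUTH_SIZE	11

int main(void)
{
	int max = SID_STRING_BASE_SIZE +
		  SID_MAX_SUB_AUTHORITIES * SID_STRING_SUBAUTH_SIZE;

	printf("max SID string buffer: %d bytes\n", max);	/* 21 + 165 = 186 */
	return 0;
}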
struct cifs_ntsd {
__le16 revision; /* revision level */
__le16 type;
@@ -51,10 +77,13 @@ struct cifs_ntsd {
struct cifs_sid {
__u8 revision; /* revision level */
__u8 num_subauth;
- __u8 authority[6];
- __le32 sub_auth[5]; /* sub_auth[num_subauth] */
+ __u8 authority[NUM_AUTHS];
+ __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
} __attribute__((packed));
+/* size of a struct cifs_sid, sans sub_auth array */
+#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
+
struct cifs_acl {
__le16 revision; /* revision level */
__le16 size;
@@ -69,12 +98,4 @@ struct cifs_ace {
struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
} __attribute__((packed));
-struct cifs_wksid {
- struct cifs_sid cifssid;
- char sidname[SIDNAMELENGTH];
-} __attribute__((packed));
-
-extern int match_sid(struct cifs_sid *);
-extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
-
#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index a51585f9852..4934347321d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,7 +1,7 @@
/*
* fs/cifs/cifsencrypt.c
*
- * Copyright (C) International Business Machines Corp., 2005,2006
+ * Copyright (C) International Business Machines Corp., 2005,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -29,100 +29,83 @@
#include "ntlmssp.h"
#include <linux/ctype.h>
#include <linux/random.h>
+#include <linux/highmem.h>
-/* Calculate and return the CIFS signature based on the mac key and SMB PDU */
-/* the 16 byte signature must be allocated by the caller */
-/* Note we only use the 1st eight bytes */
-/* Note that the smb header signature field on input contains the
- sequence number before this function is called */
-
-static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
- struct TCP_Server_Info *server, char *signature)
+static int
+cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
{
int rc;
+ unsigned int size;
- if (cifs_pdu == NULL || signature == NULL || server == NULL)
- return -EINVAL;
-
- if (!server->secmech.sdescmd5) {
- cERROR(1, "%s: Can't generate signature\n", __func__);
- return -1;
- }
+ if (server->secmech.sdescmd5 != NULL)
+ return 0; /* already allocated */
- rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
- if (rc) {
- cERROR(1, "%s: Oould not init md5\n", __func__);
+ server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(server->secmech.md5)) {
+ cifs_dbg(VFS, "could not allocate crypto md5\n");
+ rc = PTR_ERR(server->secmech.md5);
+ server->secmech.md5 = NULL;
return rc;
}
- crypto_shash_update(&server->secmech.sdescmd5->shash,
- server->session_key.response, server->session_key.len);
-
- crypto_shash_update(&server->secmech.sdescmd5->shash,
- cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
-
- rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.md5);
+ server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdescmd5) {
+ crypto_free_shash(server->secmech.md5);
+ server->secmech.md5 = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
+ server->secmech.sdescmd5->shash.flags = 0x0;
return 0;
}
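
cifs_crypto_shash_md5_allocate() above moves MD5 setup from session setup time to first use: return early if the descriptor already exists, allocate the tfm, then its shash_desc wrapper, and undo the first allocation if the second fails. A reduced model of that allocate-on-demand pattern, with user-space allocators standing in for the crypto API (illustrative types only):

#include <stdlib.h>

struct mech_sketch {
	void *tfm;	/* stands in for the crypto_shash */
	void *desc;	/* stands in for the shash_desc wrapper */
};

static int mech_allocate(struct mech_sketch *m)
{
	if (m->desc)
		return 0;		/* already allocated */

	m->tfm = malloc(64);
	if (!m->tfm)
		return -1;

	m->desc = malloc(128);
	if (!m->desc) {
		free(m->tfm);
		m->tfm = NULL;		/* leave no half-initialized state */
		return -1;
	}
	return 0;
}

The same shape is repeated later in this file for hmac(md5) in crypto_hmacmd5_alloc().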
-/* must be called with server->srv_mutex held */
-int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
- __u32 *pexpected_response_sequence_number)
-{
- int rc = 0;
- char smb_signature[20];
-
- if ((cifs_pdu == NULL) || (server == NULL))
- return -EINVAL;
-
- if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
- return rc;
-
- cifs_pdu->Signature.Sequence.SequenceNumber =
- cpu_to_le32(server->sequence_number);
- cifs_pdu->Signature.Sequence.Reserved = 0;
-
- *pexpected_response_sequence_number = server->sequence_number++;
- server->sequence_number++;
-
- rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
- if (rc)
- memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
- else
- memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8);
-
- return rc;
-}
-
-static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
- struct TCP_Server_Info *server, char *signature)
+/*
+ * Calculate and return the CIFS signature based on the mac key and SMB PDU.
+ * The 16 byte signature must be allocated by the caller. Note we only use the
+ * 1st eight bytes and that the smb header signature field on input contains
+ * the sequence number before this function is called. Also, this function
+ * should be called with the server->srv_mutex held.
+ */
+static int cifs_calc_signature(struct smb_rqst *rqst,
+ struct TCP_Server_Info *server, char *signature)
{
int i;
int rc;
+ struct kvec *iov = rqst->rq_iov;
+ int n_vec = rqst->rq_nvec;
if (iov == NULL || signature == NULL || server == NULL)
return -EINVAL;
if (!server->secmech.sdescmd5) {
- cERROR(1, "%s: Can't generate signature\n", __func__);
- return -1;
+ rc = cifs_crypto_shash_md5_allocate(server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
+ return -1;
+ }
}
rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
if (rc) {
- cERROR(1, "%s: Oould not init md5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
return rc;
}
- crypto_shash_update(&server->secmech.sdescmd5->shash,
+ rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
server->session_key.response, server->session_key.len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+ return rc;
+ }
for (i = 0; i < n_vec; i++) {
if (iov[i].iov_len == 0)
continue;
if (iov[i].iov_base == NULL) {
- cERROR(1, "null iovec entry");
+ cifs_dbg(VFS, "null iovec entry\n");
return -EIO;
}
/* The first entry includes a length field (which does not get
@@ -130,40 +113,66 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
if (i == 0) {
if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
break; /* nothing to sign or corrupt header */
+ rc =
crypto_shash_update(&server->secmech.sdescmd5->shash,
iov[i].iov_base + 4, iov[i].iov_len - 4);
- } else
+ } else {
+ rc =
crypto_shash_update(&server->secmech.sdescmd5->shash,
iov[i].iov_base, iov[i].iov_len);
+ }
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with payload\n",
+ __func__);
+ return rc;
+ }
+ }
+
+ /* now hash over the rq_pages array */
+ for (i = 0; i < rqst->rq_npages; i++) {
+ struct kvec p_iov;
+
+ cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+ crypto_shash_update(&server->secmech.sdescmd5->shash,
+ p_iov.iov_base, p_iov.iov_len);
+ kunmap(rqst->rq_pages[i]);
}
rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
return rc;
}
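
cifs_calc_signature() above hashes the session key, then every request vector (skipping the 4-byte RFC1001 length prefix on the first one), then the rq_pages array, before finalizing the digest into the signature buffer. A toy, self-contained model of that walk; the additive "digest" stands in for the MD5 updates only so the sketch compiles on its own:

#include <stddef.h>
#include <stdio.h>

struct vec { const unsigned char *base; size_t len; };

static unsigned int fold(unsigned int acc, const unsigned char *p, size_t n)
{
	while (n--)
		acc += *p++;
	return acc;
}

static unsigned int toy_sign(const unsigned char *key, size_t keylen,
			     const struct vec *iov, int n_vec)
{
	unsigned int acc = fold(0, key, keylen);	/* session key first */
	int i;

	for (i = 0; i < n_vec; i++) {
		if (iov[i].len == 0)
			continue;
		if (i == 0) {
			if (iov[0].len <= 8)
				break;	/* nothing to sign */
			/* skip the 4-byte length field on the first vector */
			acc = fold(acc, iov[0].base + 4, iov[0].len - 4);
		} else {
			acc = fold(acc, iov[i].base, iov[i].len);
		}
	}
	return acc;
}

int main(void)
{
	const unsigned char key[16] = { 1 };
	unsigned char hdr[32] = { 0 };
	struct vec iov[1] = { { hdr, sizeof(hdr) } };

	printf("toy digest: %u\n", toy_sign(key, sizeof(key), iov, 1));
	return 0;
}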
/* must be called with server->srv_mutex held */
-int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
+int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
__u32 *pexpected_response_sequence_number)
{
int rc = 0;
char smb_signature[20];
- struct smb_hdr *cifs_pdu = iov[0].iov_base;
+ struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
if ((cifs_pdu == NULL) || (server == NULL))
return -EINVAL;
- if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
+ if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
+ server->tcpStatus == CifsNeedNegotiate)
return rc;
+ if (!server->session_estab) {
+ memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
+ return rc;
+ }
+
cifs_pdu->Signature.Sequence.SequenceNumber =
cpu_to_le32(server->sequence_number);
cifs_pdu->Signature.Sequence.Reserved = 0;
- *pexpected_response_sequence_number = server->sequence_number++;
- server->sequence_number++;
+ *pexpected_response_sequence_number = ++server->sequence_number;
+ ++server->sequence_number;
- rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
+ rc = cifs_calc_signature(rqst, server, smb_signature);
if (rc)
memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
else
@@ -172,18 +181,41 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
return rc;
}
-int cifs_verify_signature(struct smb_hdr *cifs_pdu,
+int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
+ __u32 *pexpected_response_sequence)
+{
+ struct smb_rqst rqst = { .rq_iov = iov,
+ .rq_nvec = n_vec };
+
+ return cifs_sign_rqst(&rqst, server, pexpected_response_sequence);
+}
+
+/* must be called with server->srv_mutex held */
+int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
+ __u32 *pexpected_response_sequence_number)
+{
+ struct kvec iov;
+
+ iov.iov_base = cifs_pdu;
+ iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4;
+
+ return cifs_sign_smbv(&iov, 1, server,
+ pexpected_response_sequence_number);
+}
+
+int cifs_verify_signature(struct smb_rqst *rqst,
struct TCP_Server_Info *server,
__u32 expected_sequence_number)
{
unsigned int rc;
char server_response_sig[8];
char what_we_think_sig_should_be[20];
+ struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
if (cifs_pdu == NULL || server == NULL)
return -EINVAL;
- if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
+ if (!server->session_estab)
return 0;
if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) {
@@ -198,8 +230,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
/* Do not need to verify session setups with signature "BSRSPYL " */
if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
- cFYI(1, "dummy signature received for smb command 0x%x",
- cifs_pdu->Command);
+ cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n",
+ cifs_pdu->Command);
	/* save off the original signature so we can modify the smb and check
its signature against what the server sent */
@@ -209,8 +241,9 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
cpu_to_le32(expected_sequence_number);
cifs_pdu->Signature.Sequence.Reserved = 0;
- rc = cifs_calculate_signature(cifs_pdu, server,
- what_we_think_sig_should_be);
+ mutex_lock(&server->srv_mutex);
+ rc = cifs_calc_signature(rqst, server, what_we_think_sig_should_be);
+ mutex_unlock(&server->srv_mutex);
if (rc)
return rc;
@@ -226,7 +259,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
}
/* first calculate 24 bytes ntlm response and then 16 byte session key */
-int setup_ntlm_response(struct cifsSesInfo *ses)
+int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
int rc = 0;
unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
@@ -236,39 +269,40 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
return -EINVAL;
ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
- if (!ses->auth_key.response) {
- cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
+ if (!ses->auth_key.response)
return -ENOMEM;
- }
+
ses->auth_key.len = temp_len;
rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
- ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+ ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
if (rc) {
- cFYI(1, "%s Can't generate NTLM response, error: %d",
- __func__, rc);
+ cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n",
+ __func__, rc);
return rc;
}
- rc = E_md4hash(ses->password, temp_key);
+ rc = E_md4hash(ses->password, temp_key, nls_cp);
if (rc) {
- cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+ cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
+ __func__, rc);
return rc;
}
rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
if (rc)
- cFYI(1, "%s Can't generate NTLM session key, error: %d",
- __func__, rc);
+ cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n",
+ __func__, rc);
return rc;
}
#ifdef CONFIG_CIFS_WEAK_PW_HASH
-void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
+int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
char *lnm_session_key)
{
int i;
+ int rc;
char password_with_pad[CIFS_ENCPWD_SIZE];
memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
@@ -276,10 +310,9 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
- memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
memcpy(lnm_session_key, password_with_pad,
CIFS_ENCPWD_SIZE);
- return;
+ return 0;
}
/* calculate old style session key */
@@ -296,10 +329,9 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
password_with_pad[i] = toupper(password_with_pad[i]);
- SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
+ rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
- /* clear password before we return/free memory */
- memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+ return rc;
}
#endif /* CIFS_WEAK_PW_HASH */
@@ -309,12 +341,10 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
* Allocate domain name which gets freed when session struct is deallocated.
*/
static int
-build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
unsigned int dlen;
- unsigned int wlen;
- unsigned int size = 6 * sizeof(struct ntlmssp2_name);
- __le64 curtime;
+ unsigned int size = 2 * sizeof(struct ntlmssp2_name);
char *defdmname = "WORKGROUP";
unsigned char *blobptr;
struct ntlmssp2_name *attrptr;
@@ -326,62 +356,31 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
}
dlen = strlen(ses->domainName);
- wlen = strlen(ses->server->hostname);
- /* The length of this blob is a size which is
- * six times the size of a structure which holds name/size +
- * two times the unicode length of a domain name +
- * two times the unicode length of a server name +
- * size of a timestamp (which is 8 bytes).
+ /*
+ * The length of this blob is two times the size of a
+ * structure (av pair) which holds name/size
+ * ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) +
+ * unicode length of a netbios domain name
*/
- ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
+ ses->auth_key.len = size + 2 * dlen;
ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
if (!ses->auth_key.response) {
ses->auth_key.len = 0;
- cERROR(1, "Challenge target info allocation failure");
return -ENOMEM;
}
blobptr = ses->auth_key.response;
attrptr = (struct ntlmssp2_name *) blobptr;
+ /*
+ * As defined in MS-NTLM 3.3.2, just this av pair field
+ * is sufficient as part of the temp
+ */
attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
attrptr->length = cpu_to_le16(2 * dlen);
blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
-
- blobptr += 2 * dlen;
- attrptr = (struct ntlmssp2_name *) blobptr;
-
- attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
- attrptr->length = cpu_to_le16(2 * wlen);
- blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
-
- blobptr += 2 * wlen;
- attrptr = (struct ntlmssp2_name *) blobptr;
-
- attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
- attrptr->length = cpu_to_le16(2 * dlen);
- blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
-
- blobptr += 2 * dlen;
- attrptr = (struct ntlmssp2_name *) blobptr;
-
- attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
- attrptr->length = cpu_to_le16(2 * wlen);
- blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
-
- blobptr += 2 * wlen;
- attrptr = (struct ntlmssp2_name *) blobptr;
-
- attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
- attrptr->length = cpu_to_le16(sizeof(__le64));
- blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
- memcpy(blobptr, &curtime, sizeof(__le64));
+ cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
return 0;
}
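
build_avpair_blob() above shrinks the target-info blob to a single NetBIOS-domain attribute/value pair plus the zeroed space for the terminating EOL pair, sized as 2 * sizeof(struct ntlmssp2_name) + 2 * dlen. A standalone sketch of that layout; the 4-byte pair header and the type codes follow MS-NLMP, everything else is illustrative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define AV_EOL			0	/* MsvAvEOL */
#define AV_NB_DOMAIN_NAME	2	/* MsvAvNbDomainName */

struct av_pair {
	uint16_t type;			/* little-endian on the wire */
	uint16_t length;
} __attribute__((packed));

int main(void)
{
	/* "WORKGROUP" in UTF-16LE, as cifs_strtoUTF16() would produce */
	const char domain[] = "W\0O\0R\0K\0G\0R\0O\0U\0P\0";
	size_t dlen16 = sizeof(domain) - 1;	/* 2 * strlen("WORKGROUP") */
	unsigned char blob[2 * sizeof(struct av_pair) + 64] = { 0 };
	struct av_pair *attr = (struct av_pair *)blob;

	attr->type = AV_NB_DOMAIN_NAME;
	attr->length = (uint16_t)dlen16;
	memcpy(blob + sizeof(*attr), domain, dlen16);
	/* the zeroed tail doubles as the terminating EOL pair */

	printf("blob length: %zu bytes\n", 2 * sizeof(*attr) + dlen16);
	return 0;
}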
@@ -397,7 +396,7 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
* about target string i.e. for some, just user name might suffice.
*/
static int
-find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
unsigned int attrsize;
unsigned int type;
@@ -423,14 +422,14 @@ find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
if (blobptr + attrsize > blobend)
break;
if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
- if (!attrsize)
+ if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN)
break;
if (!ses->domainName) {
ses->domainName =
kmalloc(attrsize + 1, GFP_KERNEL);
if (!ses->domainName)
return -ENOMEM;
- cifs_from_ucs2(ses->domainName,
+ cifs_from_utf16(ses->domainName,
(__le16 *)blobptr, attrsize, attrsize,
nls_cp, false);
break;
@@ -442,46 +441,59 @@ find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
return 0;
}
-static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
+static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
const struct nls_table *nls_cp)
{
int rc = 0;
int len;
char nt_hash[CIFS_NTHASH_SIZE];
- wchar_t *user;
+ __le16 *user;
wchar_t *domain;
wchar_t *server;
if (!ses->server->secmech.sdeschmacmd5) {
- cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+ cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
return -1;
}
/* calculate md4 hash of password */
- E_md4hash(ses->password, nt_hash);
+ E_md4hash(ses->password, nt_hash, nls_cp);
- crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
+ rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
CIFS_NTHASH_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__);
+ return rc;
+ }
rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
if (rc) {
- cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
+ cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__);
return rc;
}
- /* convert ses->userName to unicode and uppercase */
- len = strlen(ses->userName);
+ /* convert ses->user_name to unicode */
+ len = ses->user_name ? strlen(ses->user_name) : 0;
user = kmalloc(2 + (len * 2), GFP_KERNEL);
if (user == NULL) {
- cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
rc = -ENOMEM;
- goto calc_exit_2;
+ return rc;
}
- len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
- UniStrupr(user);
- crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ if (len) {
+ len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
+ UniStrupr(user);
+ } else {
+ memset(user, '\0', 2);
+ }
+
+ rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
(char *)user, 2 * len);
+ kfree(user);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with user\n", __func__);
+ return rc;
+ }
/* convert ses->domainName to unicode and uppercase */
if (ses->domainName) {
@@ -489,98 +501,156 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
domain = kmalloc(2 + (len * 2), GFP_KERNEL);
if (domain == NULL) {
- cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
rc = -ENOMEM;
- goto calc_exit_1;
+ return rc;
}
- len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
- nls_cp);
+ len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
+ nls_cp);
+ rc =
crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
(char *)domain, 2 * len);
kfree(domain);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with domain\n",
+ __func__);
+ return rc;
+ }
} else if (ses->serverName) {
len = strlen(ses->serverName);
server = kmalloc(2 + (len * 2), GFP_KERNEL);
if (server == NULL) {
- cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
rc = -ENOMEM;
- goto calc_exit_1;
+ return rc;
}
- len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+ len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,
nls_cp);
+ rc =
crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
(char *)server, 2 * len);
kfree(server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with server\n",
+ __func__);
+ return rc;
+ }
}
rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
ntlmv2_hash);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
-calc_exit_1:
- kfree(user);
-calc_exit_2:
return rc;
}
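
calc_ntlmv2_hash() above keys HMAC-MD5 with the MD4 NT hash of the password and runs it over the UTF-16LE upper-cased user name followed by the UTF-16LE domain (or server) name. A small sketch of just the message assembly; the ASCII-to-UTF-16LE conversion here handles ASCII input only, and the MAC itself is left to a real crypto library:

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t put_utf16le(uint8_t *dst, const char *src, int upper)
{
	size_t i, n = strlen(src);

	for (i = 0; i < n; i++) {
		dst[2 * i] = upper ? (uint8_t)toupper((unsigned char)src[i])
				   : (uint8_t)src[i];
		dst[2 * i + 1] = 0;
	}
	return 2 * n;
}

int main(void)
{
	uint8_t msg[256];
	size_t off = 0;

	off += put_utf16le(msg + off, "user1", 1);	/* user name, upper-cased */
	off += put_utf16le(msg + off, "WORKGROUP", 0);	/* domain name, as-is */
	printf("HMAC-MD5 runs over %zu bytes\n", off);
	return 0;
}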
static int
-CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
+CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
{
int rc;
- unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
+ struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *)
+ (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+ unsigned int hash_len;
+
+ /* The MD5 hash starts at challenge_key.key */
+ hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
+ offsetof(struct ntlmv2_resp, challenge.key[0]));
if (!ses->server->secmech.sdeschmacmd5) {
- cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+ cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
return -1;
}
- crypto_shash_setkey(ses->server->secmech.hmacmd5,
- ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+ rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
+ ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
+ __func__);
+ return rc;
+ }
rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
if (rc) {
- cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+ cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__);
return rc;
}
- if (ses->server->secType == RawNTLMSSP)
- memcpy(ses->auth_key.response + offset,
- ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+ if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
+ memcpy(ntlmv2->challenge.key,
+ ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
else
- memcpy(ses->auth_key.response + offset,
- ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
- crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
- ses->auth_key.response + offset, ses->auth_key.len - offset);
+ memcpy(ntlmv2->challenge.key,
+ ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+ rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ ntlmv2->challenge.key, hash_len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+ return rc;
+ }
+ /* Note that the MD5 digest over writes anon.challenge_key.key */
rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
- ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+ ntlmv2->ntlmv2_hash);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
return rc;
}
+static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
+{
+ int rc;
+ unsigned int size;
+
+ /* check if already allocated */
+ if (server->secmech.sdeschmacmd5)
+ return 0;
+
+ server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+ if (IS_ERR(server->secmech.hmacmd5)) {
+ cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
+ rc = PTR_ERR(server->secmech.hmacmd5);
+ server->secmech.hmacmd5 = NULL;
+ return rc;
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.hmacmd5);
+ server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdeschmacmd5) {
+ crypto_free_shash(server->secmech.hmacmd5);
+ server->secmech.hmacmd5 = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
+ server->secmech.sdeschmacmd5->shash.flags = 0x0;
+
+ return 0;
+}
int
-setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
int rc;
int baselen;
unsigned int tilen;
- struct ntlmv2_resp *buf;
+ struct ntlmv2_resp *ntlmv2;
char ntlmv2_hash[16];
unsigned char *tiblob = NULL; /* target info blob */
- if (ses->server->secType == RawNTLMSSP) {
+ if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
if (!ses->domainName) {
rc = find_domain_name(ses, nls_cp);
if (rc) {
- cERROR(1, "error %d finding domain name", rc);
+ cifs_dbg(VFS, "error %d finding domain name\n",
+ rc);
goto setup_ntlmv2_rsp_ret;
}
}
} else {
rc = build_avpair_blob(ses, nls_cp);
if (rc) {
- cERROR(1, "error %d building av pair blob", rc);
+ cifs_dbg(VFS, "error %d building av pair blob\n", rc);
goto setup_ntlmv2_rsp_ret;
}
}
@@ -593,51 +663,68 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
if (!ses->auth_key.response) {
rc = ENOMEM;
ses->auth_key.len = 0;
- cERROR(1, "%s: Can't allocate auth blob", __func__);
goto setup_ntlmv2_rsp_ret;
}
ses->auth_key.len += baselen;
- buf = (struct ntlmv2_resp *)
+ ntlmv2 = (struct ntlmv2_resp *)
(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
- buf->blob_signature = cpu_to_le32(0x00000101);
- buf->reserved = 0;
- buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
- get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
- buf->reserved2 = 0;
+ ntlmv2->blob_signature = cpu_to_le32(0x00000101);
+ ntlmv2->reserved = 0;
+ /* Must be within 5 minutes of the server */
+ ntlmv2->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal));
+ ntlmv2->reserved2 = 0;
memcpy(ses->auth_key.response + baselen, tiblob, tilen);
+ rc = crypto_hmacmd5_alloc(ses->server);
+ if (rc) {
+ cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
+ goto setup_ntlmv2_rsp_ret;
+ }
+
/* calculate ntlmv2_hash */
rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
if (rc) {
- cERROR(1, "could not get v2 hash rc %d", rc);
+ cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc);
goto setup_ntlmv2_rsp_ret;
}
/* calculate first part of the client response (CR1) */
rc = CalcNTLMv2_response(ses, ntlmv2_hash);
if (rc) {
- cERROR(1, "Could not calculate CR1 rc: %d", rc);
+ cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc);
goto setup_ntlmv2_rsp_ret;
}
/* now calculate the session key for NTLMv2 */
- crypto_shash_setkey(ses->server->secmech.hmacmd5,
+ rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
+ __func__);
+ goto setup_ntlmv2_rsp_ret;
+ }
rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
if (rc) {
- cERROR(1, "%s: Could not init hmacmd5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
goto setup_ntlmv2_rsp_ret;
}
- crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
- ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ ntlmv2->ntlmv2_hash,
CIFS_HMAC_MD5_HASH_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+ goto setup_ntlmv2_rsp_ret;
+ }
rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
ses->auth_key.response);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
setup_ntlmv2_rsp_ret:
kfree(tiblob);
@@ -646,7 +733,7 @@ setup_ntlmv2_rsp_ret:
}
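
setup_ntlmv2_rsp() above fills in the NTLMv2 client blob before hashing it: the 0x00000101 signature, a reserved word, the NT timestamp (which must be within five minutes of the server), a random client challenge, a second reserved word, and then the target-info blob. A sketch of that fixed header, with the field order taken from the hunk; the kernel's struct ntlmv2_resp additionally carries a 16-byte hash/challenge-key region ahead of these fields, as the CalcNTLMv2_response() hunk above shows:

#include <stdint.h>
#include <stdio.h>

struct ntlmv2_blob_sketch {
	uint32_t blob_signature;	/* 0x00000101 */
	uint32_t reserved;
	uint64_t time;			/* NT time */
	uint64_t client_chal;		/* random bytes */
	uint32_t reserved2;
	/* variable-length target info (AV pairs) follows */
} __attribute__((packed));

int main(void)
{
	printf("fixed blob header: %zu bytes\n",
	       sizeof(struct ntlmv2_blob_sketch));	/* 28 */
	return 0;
}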
int
-calc_seckey(struct cifsSesInfo *ses)
+calc_seckey(struct cifs_ses *ses)
{
int rc;
struct crypto_blkcipher *tfm_arc4;
@@ -659,21 +746,26 @@ calc_seckey(struct cifsSesInfo *ses)
tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_arc4)) {
rc = PTR_ERR(tfm_arc4);
- cERROR(1, "could not allocate crypto API arc4\n");
+ cifs_dbg(VFS, "could not allocate crypto API arc4\n");
return rc;
}
desc.tfm = tfm_arc4;
- crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
+ rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
CIFS_SESS_KEY_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set response as a key\n",
+ __func__);
+ return rc;
+ }
sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
if (rc) {
- cERROR(1, "could not encrypt session key rc: %d\n", rc);
+ cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc);
crypto_free_blkcipher(tfm_arc4);
return rc;
}
@@ -685,75 +777,38 @@ calc_seckey(struct cifsSesInfo *ses)
crypto_free_blkcipher(tfm_arc4);
- return 0;
+ return rc;
}
void
cifs_crypto_shash_release(struct TCP_Server_Info *server)
{
- if (server->secmech.md5)
- crypto_free_shash(server->secmech.md5);
-
- if (server->secmech.hmacmd5)
- crypto_free_shash(server->secmech.hmacmd5);
-
- kfree(server->secmech.sdeschmacmd5);
-
- kfree(server->secmech.sdescmd5);
-}
-
-int
-cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
-{
- int rc;
- unsigned int size;
-
- server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
- if (IS_ERR(server->secmech.hmacmd5)) {
- cERROR(1, "could not allocate crypto hmacmd5\n");
- return PTR_ERR(server->secmech.hmacmd5);
+ if (server->secmech.cmacaes) {
+ crypto_free_shash(server->secmech.cmacaes);
+ server->secmech.cmacaes = NULL;
}
- server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
- if (IS_ERR(server->secmech.md5)) {
- cERROR(1, "could not allocate crypto md5\n");
- rc = PTR_ERR(server->secmech.md5);
- goto crypto_allocate_md5_fail;
+ if (server->secmech.hmacsha256) {
+ crypto_free_shash(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
}
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.hmacmd5);
- server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdeschmacmd5) {
- cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
- rc = -ENOMEM;
- goto crypto_allocate_hmacmd5_sdesc_fail;
+ if (server->secmech.md5) {
+ crypto_free_shash(server->secmech.md5);
+ server->secmech.md5 = NULL;
}
- server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
- server->secmech.sdeschmacmd5->shash.flags = 0x0;
-
- size = sizeof(struct shash_desc) +
- crypto_shash_descsize(server->secmech.md5);
- server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
- if (!server->secmech.sdescmd5) {
- cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
- rc = -ENOMEM;
- goto crypto_allocate_md5_sdesc_fail;
+ if (server->secmech.hmacmd5) {
+ crypto_free_shash(server->secmech.hmacmd5);
+ server->secmech.hmacmd5 = NULL;
}
- server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
- server->secmech.sdescmd5->shash.flags = 0x0;
- return 0;
-
-crypto_allocate_md5_sdesc_fail:
+ kfree(server->secmech.sdesccmacaes);
+ server->secmech.sdesccmacaes = NULL;
+ kfree(server->secmech.sdeschmacsha256);
+ server->secmech.sdeschmacsha256 = NULL;
kfree(server->secmech.sdeschmacmd5);
-
-crypto_allocate_hmacmd5_sdesc_fail:
- crypto_free_shash(server->secmech.md5);
-
-crypto_allocate_md5_fail:
- crypto_free_shash(server->secmech.hmacmd5);
-
- return rc;
+ server->secmech.sdeschmacmd5 = NULL;
+ kfree(server->secmech.sdescmd5);
+ server->secmech.sdescmd5 = NULL;
}
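
The reworked cifs_crypto_shash_release() frees each secmech member and resets the pointer to NULL, which is what lets the allocate-on-demand helpers earlier in the file re-create a mechanism safely after a release. A reduced model of the pattern (illustrative types only):

#include <stdlib.h>

struct secmech_sketch {
	void *md5;	/* stands in for a crypto_shash handle */
	void *sdescmd5;	/* stands in for its shash_desc wrapper */
};

static void secmech_release(struct secmech_sketch *s)
{
	free(s->md5);
	s->md5 = NULL;		/* lets a later lazy allocation start clean */
	free(s->sdescmd5);
	s->sdescmd5 = NULL;
}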
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f2970136d17..88839806742 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,6 +35,8 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/namei.h>
+#include <linux/random.h>
#include <net/ipv6.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -47,45 +49,50 @@
#include <linux/key-type.h>
#include "cifs_spnego.h"
#include "fscache.h"
-#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
+#ifdef CONFIG_CIFS_SMB2
+#include "smb2pdu.h"
+#endif
int cifsFYI = 0;
-int cifsERROR = 1;
int traceSMB = 0;
-unsigned int oplockEnabled = 1;
-unsigned int experimEnabled = 0;
+bool enable_oplocks = true;
unsigned int linuxExtEnabled = 1;
unsigned int lookupCacheEnabled = 1;
-unsigned int multiuser_mount = 0;
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
-module_param(CIFSMaxBufSize, int, 0);
+module_param(CIFSMaxBufSize, uint, 0);
MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
"Default: 16384 Range: 8192 to 130048");
unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
-module_param(cifs_min_rcv, int, 0);
+module_param(cifs_min_rcv, uint, 0);
MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
"1 to 64");
unsigned int cifs_min_small = 30;
-module_param(cifs_min_small, int, 0);
+module_param(cifs_min_small, uint, 0);
MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
"Range: 2 to 256");
unsigned int cifs_max_pending = CIFS_MAX_REQ;
-module_param(cifs_max_pending, int, 0);
+module_param(cifs_max_pending, uint, 0444);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
- "Default: 50 Range: 2 to 256");
-unsigned short echo_retries = 5;
-module_param(echo_retries, ushort, 0644);
-MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
- "reconnecting server. Default: 5. 0 means "
- "never reconnect.");
+ "Default: 32767 Range: 2 to 32767.");
+module_param(enable_oplocks, bool, 0644);
+MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
+
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
+struct workqueue_struct *cifsiod_wq;
+
+/*
+ * Bumps refcount for cifs super block.
+ * Note that it should only be called if a reference to the VFS super block is
+ * already held, e.g. in open-type syscalls context. Otherwise it can race with
+ * atomic_dec_and_test in deactivate_locked_super.
+ */
void
cifs_sb_active(struct super_block *sb)
{
@@ -105,142 +112,69 @@ cifs_sb_deactive(struct super_block *sb)
}
static int
-cifs_read_super(struct super_block *sb, void *data,
- const char *devname, int silent)
+cifs_read_super(struct super_block *sb)
{
struct inode *inode;
struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
int rc = 0;
- /* BB should we make this contingent on mount parm? */
- sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
- sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
cifs_sb = CIFS_SB(sb);
- if (cifs_sb == NULL)
- return -ENOMEM;
-
- spin_lock_init(&cifs_sb->tlink_tree_lock);
- cifs_sb->tlink_tree = RB_ROOT;
-
- rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
- if (rc) {
- kfree(cifs_sb);
- return rc;
- }
+ tcon = cifs_sb_master_tcon(cifs_sb);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- /* copy mount params to sb for use in submounts */
- /* BB: should we move this after the mount so we
- * do not have to do the copy on failed mounts?
- * BB: May be it is better to do simple copy before
- * complex operation (mount), and in case of fail
- * just exit instead of doing mount and attempting
- * undo it if this copy fails?*/
- if (data) {
- int len = strlen(data);
- cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
- if (cifs_sb->mountdata == NULL) {
- bdi_destroy(&cifs_sb->bdi);
- kfree(sb->s_fs_info);
- sb->s_fs_info = NULL;
- return -ENOMEM;
- }
- strncpy(cifs_sb->mountdata, data, len + 1);
- cifs_sb->mountdata[len] = '\0';
- }
-#endif
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
+ sb->s_flags |= MS_POSIXACL;
- rc = cifs_mount(sb, cifs_sb, data, devname);
+ if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ else
+ sb->s_maxbytes = MAX_NON_LFS;
- if (rc) {
- if (!silent)
- cERROR(1, "cifs_mount failed w/return code = %d", rc);
- goto out_mount_failed;
- }
+ /* BB FIXME fix time_gran to be larger for LANMAN sessions */
+ sb->s_time_gran = 100;
sb->s_magic = CIFS_MAGIC_NUMBER;
sb->s_op = &cifs_super_ops;
sb->s_bdi = &cifs_sb->bdi;
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
- inode = cifs_root_iget(sb, ROOT_I);
+ inode = cifs_root_iget(sb);
if (IS_ERR(inode)) {
rc = PTR_ERR(inode);
- inode = NULL;
goto out_no_root;
}
- sb->s_root = d_alloc_root(inode);
+ if (tcon->nocase)
+ sb->s_d_op = &cifs_ci_dentry_ops;
+ else
+ sb->s_d_op = &cifs_dentry_ops;
+ sb->s_root = d_make_root(inode);
if (!sb->s_root) {
rc = -ENOMEM;
goto out_no_root;
}
- /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
- if (cifs_sb_master_tcon(cifs_sb)->nocase)
- sb->s_d_op = &cifs_ci_dentry_ops;
- else
- sb->s_d_op = &cifs_dentry_ops;
-
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_NFSD_EXPORT
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
- cFYI(1, "export ops supported");
+ cifs_dbg(FYI, "export ops supported\n");
sb->s_export_op = &cifs_export_ops;
}
-#endif /* EXPERIMENTAL */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
return 0;
out_no_root:
- cERROR(1, "cifs_read_super: get root inode failed");
- if (inode)
- iput(inode);
-
- cifs_umount(sb, cifs_sb);
-
-out_mount_failed:
- if (cifs_sb) {
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (cifs_sb->mountdata) {
- kfree(cifs_sb->mountdata);
- cifs_sb->mountdata = NULL;
- }
-#endif
- unload_nls(cifs_sb->local_nls);
- bdi_destroy(&cifs_sb->bdi);
- kfree(cifs_sb);
- }
+ cifs_dbg(VFS, "%s: get root inode failed\n", __func__);
return rc;
}
-static void
-cifs_put_super(struct super_block *sb)
+static void cifs_kill_sb(struct super_block *sb)
{
- int rc = 0;
- struct cifs_sb_info *cifs_sb;
-
- cFYI(1, "In cifs_put_super");
- cifs_sb = CIFS_SB(sb);
- if (cifs_sb == NULL) {
- cFYI(1, "Empty cifs superblock info passed to unmount");
- return;
- }
-
- rc = cifs_umount(sb, cifs_sb);
- if (rc)
- cERROR(1, "cifs_umount failed with return code %d", rc);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (cifs_sb->mountdata) {
- kfree(cifs_sb->mountdata);
- cifs_sb->mountdata = NULL;
- }
-#endif
-
- unload_nls(cifs_sb->local_nls);
- bdi_destroy(&cifs_sb->bdi);
- kfree(cifs_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ kill_anon_super(sb);
+ cifs_umount(cifs_sb);
}
static int
@@ -248,13 +182,12 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
- int rc = -EOPNOTSUPP;
- int xid;
-
- xid = GetXid();
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int xid;
+ int rc = 0;
- buf->f_type = CIFS_MAGIC_NUMBER;
+ xid = get_xid();
/*
* PATH_MAX may be too long - it would presumably be total path,
@@ -267,39 +200,17 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = 0; /* undefined */
buf->f_ffree = 0; /* unlimited */
- /*
- * We could add a second check for a QFS Unix capability bit
- */
- if ((tcon->ses->capabilities & CAP_UNIX) &&
- (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability)))
- rc = CIFSSMBQFSPosixInfo(xid, tcon, buf);
-
- /*
- * Only need to call the old QFSInfo if failed on newer one,
- * e.g. by OS/2.
- **/
- if (rc && (tcon->ses->capabilities & CAP_NT_SMBS))
- rc = CIFSSMBQFSInfo(xid, tcon, buf);
-
- /*
- * Some old Windows servers also do not support level 103, retry with
- * older level one if old server failed the previous call or we
- * bypassed it because we detected that this was an older LANMAN sess
- */
- if (rc)
- rc = SMBOldQFSInfo(xid, tcon, buf);
+ if (server->ops->queryfs)
+ rc = server->ops->queryfs(xid, tcon, buf);
- FreeXid(xid);
+ free_xid(xid);
return 0;
}
-static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
+static int cifs_permission(struct inode *inode, int mask)
{
struct cifs_sb_info *cifs_sb;
- if (flags & IPERM_FLAG_RCU)
- return -ECHILD;
-
cifs_sb = CIFS_SB(inode->i_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
@@ -311,7 +222,7 @@ static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
on the client (above and beyond ACL on servers) for
servers which do not support setting and viewing mode bits,
so allowing client to check permissions is useful */
- return generic_permission(inode, mask, flags, NULL);
+ return generic_permission(inode, mask);
}
static struct kmem_cache *cifs_inode_cachep;
@@ -331,28 +242,35 @@ cifs_alloc_inode(struct super_block *sb)
return NULL;
cifs_inode->cifsAttrs = 0x20; /* default */
cifs_inode->time = 0;
- /* Until the file is open and we have gotten oplock
- info back from the server, can not assume caching of
- file data or metadata */
+ /*
+ * Until the file is open and we have gotten oplock info back from the
+ * server, we cannot assume caching of file data or metadata.
+ */
cifs_set_oplock_level(cifs_inode, 0);
- cifs_inode->delete_pending = false;
- cifs_inode->invalid_mapping = false;
+ cifs_inode->flags = 0;
+ spin_lock_init(&cifs_inode->writers_lock);
+ cifs_inode->writers = 0;
cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
-
- /* Can not set i_flags here - they get immediately overwritten
- to zero by the VFS */
-/* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;*/
+ cifs_inode->epoch = 0;
+#ifdef CONFIG_CIFS_SMB2
+ get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE);
+#endif
+ /*
+ * Can not set i_flags here - they get immediately overwritten to zero
+ * by the VFS.
+ */
+ /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
INIT_LIST_HEAD(&cifs_inode->openFileList);
+ INIT_LIST_HEAD(&cifs_inode->llist);
return &cifs_inode->vfs_inode;
}
static void cifs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
}
@@ -365,8 +283,8 @@ cifs_destroy_inode(struct inode *inode)
static void
cifs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
- end_writeback(inode);
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
cifs_fscache_release_inode_cookie(inode);
}
@@ -376,7 +294,7 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
- seq_printf(s, ",addr=");
+ seq_puts(s, ",addr=");
switch (server->dstaddr.ss_family) {
case AF_INET:
@@ -388,29 +306,90 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
seq_printf(s, "%%%u", sa6->sin6_scope_id);
break;
default:
- seq_printf(s, "(unknown)");
+ seq_puts(s, "(unknown)");
}
}
+static void
+cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
+{
+ if (ses->sectype == Unspecified)
+ return;
+
+ seq_puts(s, ",sec=");
+
+ switch (ses->sectype) {
+ case LANMAN:
+ seq_puts(s, "lanman");
+ break;
+ case NTLMv2:
+ seq_puts(s, "ntlmv2");
+ break;
+ case NTLM:
+ seq_puts(s, "ntlm");
+ break;
+ case Kerberos:
+ seq_puts(s, "krb5");
+ break;
+ case RawNTLMSSP:
+ seq_puts(s, "ntlmssp");
+ break;
+ default:
+ /* shouldn't ever happen */
+ seq_puts(s, "unknown");
+ break;
+ }
+
+ if (ses->sign)
+ seq_puts(s, "i");
+}
+
+static void
+cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
+{
+ seq_puts(s, ",cache=");
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+ seq_puts(s, "strict");
+ else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
+ seq_puts(s, "none");
+ else
+ seq_puts(s, "loose");
+}
+
+static void
+cifs_show_nls(struct seq_file *s, struct nls_table *cur)
+{
+ struct nls_table *def;
+
+ /* Display iocharset= option if it's not default charset */
+ def = load_nls_default();
+ if (def != cur)
+ seq_printf(s, ",iocharset=%s", cur->charset);
+ unload_nls(def);
+}
+
/*
* cifs_show_options() is for displaying mount options in /proc/mounts.
* Not all settable options are displayed but most of the important
* ones are.
*/
static int
-cifs_show_options(struct seq_file *s, struct vfsmount *m)
+cifs_show_options(struct seq_file *s, struct dentry *root)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
- struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct sockaddr *srcaddr;
srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
- seq_printf(s, ",unc=%s", tcon->treeName);
+ seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
+ cifs_show_security(s, tcon->ses);
+ cifs_show_cache_flavor(s, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
- seq_printf(s, ",multiuser");
- else if (tcon->ses->userName)
- seq_printf(s, ",username=%s", tcon->ses->userName);
+ seq_puts(s, ",multiuser");
+ else if (tcon->ses->user_name)
+ seq_printf(s, ",username=%s", tcon->ses->user_name);
if (tcon->ses->domainName)
seq_printf(s, ",domain=%s", tcon->ses->domainName);
@@ -431,63 +410,84 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
(int)(srcaddr->sa_family));
}
- seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
+ seq_printf(s, ",uid=%u",
+ from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
- seq_printf(s, ",forceuid");
+ seq_puts(s, ",forceuid");
else
- seq_printf(s, ",noforceuid");
+ seq_puts(s, ",noforceuid");
- seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
+ seq_printf(s, ",gid=%u",
+ from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
- seq_printf(s, ",forcegid");
+ seq_puts(s, ",forcegid");
else
- seq_printf(s, ",noforcegid");
+ seq_puts(s, ",noforcegid");
cifs_show_address(s, tcon->ses->server);
if (!tcon->unix_ext)
- seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
+ seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho",
cifs_sb->mnt_file_mode,
cifs_sb->mnt_dir_mode);
+
+ cifs_show_nls(s, cifs_sb->local_nls);
+
if (tcon->seal)
- seq_printf(s, ",seal");
+ seq_puts(s, ",seal");
if (tcon->nocase)
- seq_printf(s, ",nocase");
+ seq_puts(s, ",nocase");
if (tcon->retry)
- seq_printf(s, ",hard");
- if (cifs_sb->prepath)
- seq_printf(s, ",prepath=%s", cifs_sb->prepath);
+ seq_puts(s, ",hard");
+ if (tcon->unix_ext)
+ seq_puts(s, ",unix");
+ else
+ seq_puts(s, ",nounix");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
- seq_printf(s, ",posixpaths");
+ seq_puts(s, ",posixpaths");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
- seq_printf(s, ",setuids");
+ seq_puts(s, ",setuids");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
- seq_printf(s, ",serverino");
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
- seq_printf(s, ",directio");
+ seq_puts(s, ",serverino");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ seq_puts(s, ",rwpidforward");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
+ seq_puts(s, ",forcemand");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
- seq_printf(s, ",nouser_xattr");
+ seq_puts(s, ",nouser_xattr");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
- seq_printf(s, ",mapchars");
+ seq_puts(s, ",mapchars");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
- seq_printf(s, ",sfu");
+ seq_puts(s, ",sfu");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
- seq_printf(s, ",nobrl");
+ seq_puts(s, ",nobrl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
- seq_printf(s, ",cifsacl");
+ seq_puts(s, ",cifsacl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
- seq_printf(s, ",dynperm");
- if (m->mnt_sb->s_flags & MS_POSIXACL)
- seq_printf(s, ",acl");
+ seq_puts(s, ",dynperm");
+ if (root->d_sb->s_flags & MS_POSIXACL)
+ seq_puts(s, ",acl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
- seq_printf(s, ",mfsymlinks");
+ seq_puts(s, ",mfsymlinks");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
- seq_printf(s, ",fsc");
-
- seq_printf(s, ",rsize=%d", cifs_sb->rsize);
- seq_printf(s, ",wsize=%d", cifs_sb->wsize);
+ seq_puts(s, ",fsc");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
+ seq_puts(s, ",nostrictsync");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
+ seq_puts(s, ",noperm");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
+ seq_printf(s, ",backupuid=%u",
+ from_kuid_munged(&init_user_ns,
+ cifs_sb->mnt_backupuid));
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
+ seq_printf(s, ",backupgid=%u",
+ from_kgid_munged(&init_user_ns,
+ cifs_sb->mnt_backupgid));
+
+ seq_printf(s, ",rsize=%u", cifs_sb->rsize);
+ seq_printf(s, ",wsize=%u", cifs_sb->wsize);
/* convert actimeo and display it in seconds */
- seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
+ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
return 0;
}
@@ -495,7 +495,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
static void cifs_umount_begin(struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
if (cifs_sb == NULL)
return;
@@ -516,7 +516,7 @@ static void cifs_umount_begin(struct super_block *sb)
/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
/* cancel_notify_requests(tcon); */
if (tcon->ses && tcon->ses->server) {
- cFYI(1, "wake up tasks now - umount begin not complete");
+ cifs_dbg(FYI, "wake up tasks now - umount begin not complete\n");
wake_up_all(&tcon->ses->server->request_q);
wake_up_all(&tcon->ses->server->response_q);
msleep(1); /* yield */
@@ -529,7 +529,7 @@ static void cifs_umount_begin(struct super_block *sb)
}
#ifdef CONFIG_CIFS_STATS2
-static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt)
+static int cifs_show_stats(struct seq_file *s, struct dentry *root)
{
/* BB FIXME */
return 0;
@@ -538,6 +538,7 @@ static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt)
static int cifs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_NODIRATIME;
return 0;
}
@@ -552,7 +553,6 @@ static int cifs_drop_inode(struct inode *inode)
}
static const struct super_operations cifs_super_ops = {
- .put_super = cifs_put_super,
.statfs = cifs_statfs,
.alloc_inode = cifs_alloc_inode,
.destroy_inode = cifs_destroy_inode,
@@ -570,93 +570,262 @@ static const struct super_operations cifs_super_ops = {
#endif
};
+/*
+ * Get root dentry from superblock according to prefix path mount option.
+ * Return dentry with refcount + 1 on success and NULL otherwise.
+ */
+static struct dentry *
+cifs_get_root(struct smb_vol *vol, struct super_block *sb)
+{
+ struct dentry *dentry;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ char *full_path = NULL;
+ char *s, *p;
+ char sep;
+
+ full_path = cifs_build_path_to_root(vol, cifs_sb,
+ cifs_sb_master_tcon(cifs_sb));
+ if (full_path == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ cifs_dbg(FYI, "Get root dentry for %s\n", full_path);
+
+ sep = CIFS_DIR_SEP(cifs_sb);
+ dentry = dget(sb->s_root);
+ p = s = full_path;
+
+ do {
+ struct inode *dir = dentry->d_inode;
+ struct dentry *child;
+
+ if (!dir) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOENT);
+ break;
+ }
+ if (!S_ISDIR(dir->i_mode)) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOTDIR);
+ break;
+ }
+
+ /* skip separators */
+ while (*s == sep)
+ s++;
+ if (!*s)
+ break;
+ p = s++;
+ /* next separator */
+ while (*s && *s != sep)
+ s++;
+
+ mutex_lock(&dir->i_mutex);
+ child = lookup_one_len(p, dentry, s - p);
+ mutex_unlock(&dir->i_mutex);
+ dput(dentry);
+ dentry = child;
+ } while (!IS_ERR(dentry));
+ kfree(full_path);
+ return dentry;
+}
+
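To make the loop in cifs_get_root() easier to follow, here is a standalone sketch (not part of the patch) that consumes an example prefix path the same way: skip separators, then take everything up to the next separator as one lookup component.

#include <stdio.h>

int main(void)
{
	char full_path[] = "\\dir1\\dir2\\leaf";	/* example prefix path */
	char sep = '\\';
	char *s = full_path, *p;

	for (;;) {
		while (*s == sep)			/* skip separators */
			s++;
		if (!*s)
			break;
		p = s++;
		while (*s && *s != sep)			/* find next separator */
			s++;
		printf("lookup component: %.*s\n", (int)(s - p), p);
	}
	return 0;
}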
+static int cifs_set_super(struct super_block *sb, void *data)
+{
+ struct cifs_mnt_data *mnt_data = data;
+ sb->s_fs_info = mnt_data->cifs_sb;
+ return set_anon_super(sb, NULL);
+}
+
static struct dentry *
cifs_do_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+ int flags, const char *dev_name, void *data)
{
int rc;
struct super_block *sb;
+ struct cifs_sb_info *cifs_sb;
+ struct smb_vol *volume_info;
+ struct cifs_mnt_data mnt_data;
+ struct dentry *root;
+
+ cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
- sb = sget(fs_type, NULL, set_anon_super, NULL);
+ volume_info = cifs_get_volume_info((char *)data, dev_name);
+ if (IS_ERR(volume_info))
+ return ERR_CAST(volume_info);
- cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
+ cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL);
+ if (cifs_sb == NULL) {
+ root = ERR_PTR(-ENOMEM);
+ goto out_nls;
+ }
- if (IS_ERR(sb))
- return ERR_CAST(sb);
+ cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
+ if (cifs_sb->mountdata == NULL) {
+ root = ERR_PTR(-ENOMEM);
+ goto out_cifs_sb;
+ }
- sb->s_flags = flags;
+ cifs_setup_cifs_sb(volume_info, cifs_sb);
- rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
+ rc = cifs_mount(cifs_sb, volume_info);
if (rc) {
- deactivate_locked_super(sb);
- return ERR_PTR(rc);
+ if (!(flags & MS_SILENT))
+ cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
+ rc);
+ root = ERR_PTR(rc);
+ goto out_mountdata;
}
- sb->s_flags |= MS_ACTIVE;
- return dget(sb->s_root);
+
+ mnt_data.vol = volume_info;
+ mnt_data.cifs_sb = cifs_sb;
+ mnt_data.flags = flags;
+
+ /* BB should we make this contingent on mount parm? */
+ flags |= MS_NODIRATIME | MS_NOATIME;
+
+ sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
+ if (IS_ERR(sb)) {
+ root = ERR_CAST(sb);
+ cifs_umount(cifs_sb);
+ goto out;
+ }
+
+ if (sb->s_root) {
+ cifs_dbg(FYI, "Use existing superblock\n");
+ cifs_umount(cifs_sb);
+ } else {
+ rc = cifs_read_super(sb);
+ if (rc) {
+ root = ERR_PTR(rc);
+ goto out_super;
+ }
+
+ sb->s_flags |= MS_ACTIVE;
+ }
+
+ root = cifs_get_root(volume_info, sb);
+ if (IS_ERR(root))
+ goto out_super;
+
+ cifs_dbg(FYI, "dentry root is: %p\n", root);
+ goto out;
+
+out_super:
+ deactivate_locked_super(sb);
+out:
+ cifs_cleanup_volume_info(volume_info);
+ return root;
+
+out_mountdata:
+ kfree(cifs_sb->mountdata);
+out_cifs_sb:
+ kfree(cifs_sb);
+out_nls:
+ unload_nls(volume_info->local_nls);
+ goto out;
}
-static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t
+cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ ssize_t rc;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ rc = cifs_revalidate_mapping(inode);
+ if (rc)
+ return rc;
+
+ return generic_file_read_iter(iocb, iter);
+}
+
+static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
ssize_t written;
int rc;
- written = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
- if (CIFS_I(inode)->clientCanCacheAll)
+ written = cifs_get_writer(cinode);
+ if (written)
return written;
+ written = generic_file_write_iter(iocb, from);
+
+ if (CIFS_CACHE_WRITE(CIFS_I(inode)))
+ goto out;
+
rc = filemap_fdatawrite(inode->i_mapping);
if (rc)
- cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
+ cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
+ rc, inode);
+out:
+ cifs_put_writer(cinode);
return written;
}
-static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
+static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
{
- /* origin == SEEK_END => we must revalidate the cached file length */
- if (origin == SEEK_END) {
- int retval;
-
- /* some applications poll for the file length in this strange
- way so we must seek to end on non-oplocked files by
- setting the revalidate time to zero */
- CIFS_I(file->f_path.dentry->d_inode)->time = 0;
-
- retval = cifs_revalidate_file(file);
- if (retval < 0)
- return (loff_t)retval;
+ /*
+ * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
+ * the cached file length
+ */
+ if (whence != SEEK_SET && whence != SEEK_CUR) {
+ int rc;
+ struct inode *inode = file_inode(file);
+
+ /*
+ * We need to be sure that all dirty pages are written and the
+ * server has the newest file length.
+ */
+ if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
+ inode->i_mapping->nrpages != 0) {
+ rc = filemap_fdatawait(inode->i_mapping);
+ if (rc) {
+ mapping_set_error(inode->i_mapping, rc);
+ return rc;
+ }
+ }
+ /*
+ * Some applications poll for the file length in this strange
+ * way so we must seek to end on non-oplocked files by
+ * setting the revalidate time to zero.
+ */
+ CIFS_I(inode)->time = 0;
+
+ rc = cifs_revalidate_file_attr(file);
+ if (rc < 0)
+ return (loff_t)rc;
}
- return generic_file_llseek_unlocked(file, offset, origin);
+ return generic_file_llseek(file, offset, whence);
}
static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
{
- /* note that this is called by vfs setlease with lock_flocks held
- to protect *lease from going away */
- struct inode *inode = file->f_path.dentry->d_inode;
+ /*
+ * Note that this is called by vfs setlease with i_lock held to
+ * protect *lease from going away.
+ */
+ struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
if (!(S_ISREG(inode->i_mode)))
return -EINVAL;
/* check if file is oplocked */
- if (((arg == F_RDLCK) &&
- (CIFS_I(inode)->clientCanCacheRead)) ||
- ((arg == F_WRLCK) &&
- (CIFS_I(inode)->clientCanCacheAll)))
+ if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
+ ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
return generic_setlease(file, arg, lease);
else if (tlink_tcon(cfile->tlink)->local_lease &&
- !CIFS_I(inode)->clientCanCacheRead)
- /* If the server claims to support oplock on this
- file, then we still need to check oplock even
- if the local_lease mount option is set, but there
- are servers which do not support oplock for which
- this mount option may be useful if the user
- knows that the file won't be changed on the server
- by anyone else */
+ !CIFS_CACHE_READ(CIFS_I(inode)))
+ /*
+ * If the server claims to support oplock on this file, then we
+ * still need to check oplock even if the local_lease mount
+ * option is set, but there are servers which do not support
+ * oplock for which this mount option may be useful if the user
+ * knows that the file won't be changed on the server by anyone
+ * else.
+ */
return generic_setlease(file, arg, lease);
else
return -EAGAIN;
@@ -666,11 +835,13 @@ struct file_system_type cifs_fs_type = {
.owner = THIS_MODULE,
.name = "cifs",
.mount = cifs_do_mount,
- .kill_sb = kill_anon_super,
+ .kill_sb = cifs_kill_sb,
/* .fs_flags */
};
+MODULE_ALIAS_FS("cifs");
const struct inode_operations cifs_dir_inode_ops = {
.create = cifs_create,
+ .atomic_open = cifs_atomic_open,
.lookup = cifs_lookup,
.getattr = cifs_getattr,
.unlink = cifs_unlink,
@@ -695,7 +866,6 @@ const struct inode_operations cifs_file_inode_ops = {
/* revalidate:cifs_revalidate, */
.setattr = cifs_setattr,
.getattr = cifs_getattr, /* do we need this anymore? */
- .rename = cifs_rename,
.permission = cifs_permission,
#ifdef CONFIG_CIFS_XATTR
.setxattr = cifs_setxattr,
@@ -708,7 +878,7 @@ const struct inode_operations cifs_file_inode_ops = {
const struct inode_operations cifs_symlink_inode_ops = {
.readlink = generic_readlink,
.follow_link = cifs_follow_link,
- .put_link = cifs_put_link,
+ .put_link = kfree_put_link,
.permission = cifs_permission,
/* BB add the following two eventually */
/* revalidate: cifs_revalidate,
@@ -722,10 +892,10 @@ const struct inode_operations cifs_symlink_inode_ops = {
};
const struct file_operations cifs_file_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = cifs_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_loose_read_iter,
+ .write_iter = cifs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -741,10 +911,10 @@ const struct file_operations cifs_file_ops = {
};
const struct file_operations cifs_file_strict_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_strict_readv,
- .aio_write = cifs_strict_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_strict_readv,
+ .write_iter = cifs_strict_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -760,10 +930,11 @@ const struct file_operations cifs_file_strict_ops = {
};
const struct file_operations cifs_file_direct_ops = {
- /* no aio, no readv -
- BB reevaluate whether they can be done with directio, no cache */
- .read = cifs_user_read,
- .write = cifs_user_write,
+ /* BB reevaluate whether they can be done with directio, no cache */
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_user_readv,
+ .write_iter = cifs_user_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -779,10 +950,10 @@ const struct file_operations cifs_file_direct_ops = {
};
const struct file_operations cifs_file_nobrl_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = cifs_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_loose_read_iter,
+ .write_iter = cifs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
@@ -797,10 +968,10 @@ const struct file_operations cifs_file_nobrl_ops = {
};
const struct file_operations cifs_file_strict_nobrl_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_strict_readv,
- .aio_write = cifs_strict_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_strict_readv,
+ .write_iter = cifs_strict_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_strict_fsync,
@@ -815,10 +986,11 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
};
const struct file_operations cifs_file_direct_nobrl_ops = {
- /* no mmap, no aio, no readv -
- BB reevaluate whether they can be done with directio, no cache */
- .read = cifs_user_read,
- .write = cifs_user_write,
+ /* BB reevaluate whether they can be done with directio, no cache */
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_user_readv,
+ .write_iter = cifs_user_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
@@ -833,7 +1005,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
};
const struct file_operations cifs_dir_ops = {
- .readdir = cifs_readdir,
+ .iterate = cifs_readdir,
.release = cifs_closedir,
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
@@ -846,10 +1018,10 @@ cifs_init_once(void *inode)
struct cifsInodeInfo *cifsi = inode;
inode_init_once(&cifsi->vfs_inode);
- INIT_LIST_HEAD(&cifsi->lockList);
+ init_rwsem(&cifsi->lock_sem);
}
-static int
+static int __init
cifs_init_inodecache(void)
{
cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
@@ -866,12 +1038,25 @@ cifs_init_inodecache(void)
static void
cifs_destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(cifs_inode_cachep);
}
static int
cifs_init_request_bufs(void)
{
+ size_t max_hdr_size = MAX_CIFS_HDR_SIZE;
+#ifdef CONFIG_CIFS_SMB2
+ /*
+ * The SMB2 maximum header size is bigger than the CIFS one, so allocating
+ * a few extra bytes for CIFS is harmless.
+ */
+ max_hdr_size = MAX_SMB2_HDR_SIZE;
+#endif
if (CIFSMaxBufSize < 8192) {
/* Buffer size can not be smaller than 2 * PATH_MAX since maximum
Unicode path name has to fit in any SMB/CIFS path based frames */
@@ -881,10 +1066,12 @@ cifs_init_request_bufs(void)
} else {
CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
}
-/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
+/*
+ cifs_dbg(VFS, "CIFSMaxBufSize %d 0x%x\n",
+ CIFSMaxBufSize, CIFSMaxBufSize);
+*/
cifs_req_cachep = kmem_cache_create("cifs_request",
- CIFSMaxBufSize +
- MAX_CIFS_HDR_SIZE, 0,
+ CIFSMaxBufSize + max_hdr_size, 0,
SLAB_HWCACHE_ALIGN, NULL);
if (cifs_req_cachep == NULL)
return -ENOMEM;
@@ -893,7 +1080,7 @@ cifs_init_request_bufs(void)
cifs_min_rcv = 1;
else if (cifs_min_rcv > 64) {
cifs_min_rcv = 64;
- cERROR(1, "cifs_min_rcv set to maximum (64)");
+ cifs_dbg(VFS, "cifs_min_rcv set to maximum (64)\n");
}
cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -924,7 +1111,7 @@ cifs_init_request_bufs(void)
cifs_min_small = 2;
else if (cifs_min_small > 256) {
cifs_min_small = 256;
- cFYI(1, "cifs_min_small set to maximum (256)");
+ cifs_dbg(FYI, "cifs_min_small set to maximum (256)\n");
}
cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -981,10 +1168,10 @@ init_cifs(void)
int rc = 0;
cifs_proc_init();
INIT_LIST_HEAD(&cifs_tcp_ses_list);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
INIT_LIST_HEAD(&GlobalDnotifyReqList);
INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
-#endif
+#endif /* was needed for dnotify, and will be needed for inotify once the VFS is fixed */
/*
* Initialize Global counters
*/
@@ -1011,15 +1198,22 @@ init_cifs(void)
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
- cFYI(1, "cifs_max_pending set to min of 2");
- } else if (cifs_max_pending > 256) {
- cifs_max_pending = 256;
- cFYI(1, "cifs_max_pending set to max of 256");
+ cifs_dbg(FYI, "cifs_max_pending set to min of 2\n");
+ } else if (cifs_max_pending > CIFS_MAX_REQ) {
+ cifs_max_pending = CIFS_MAX_REQ;
+ cifs_dbg(FYI, "cifs_max_pending set to max of %u\n",
+ CIFS_MAX_REQ);
+ }
+
+ cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!cifsiod_wq) {
+ rc = -ENOMEM;
+ goto out_clean_proc;
}
rc = cifs_fscache_register();
if (rc)
- goto out_clean_proc;
+ goto out_destroy_wq;
rc = cifs_init_inodecache();
if (rc)
@@ -1033,22 +1227,33 @@ init_cifs(void)
if (rc)
goto out_destroy_mids;
- rc = register_filesystem(&cifs_fs_type);
- if (rc)
- goto out_destroy_request_bufs;
#ifdef CONFIG_CIFS_UPCALL
rc = register_key_type(&cifs_spnego_key_type);
if (rc)
- goto out_unregister_filesystem;
-#endif
+ goto out_destroy_request_bufs;
+#endif /* CONFIG_CIFS_UPCALL */
+
+#ifdef CONFIG_CIFS_ACL
+ rc = init_cifs_idmap();
+ if (rc)
+ goto out_register_key_type;
+#endif /* CONFIG_CIFS_ACL */
+
+ rc = register_filesystem(&cifs_fs_type);
+ if (rc)
+ goto out_init_cifs_idmap;
return 0;
-#ifdef CONFIG_CIFS_UPCALL
-out_unregister_filesystem:
- unregister_filesystem(&cifs_fs_type);
+out_init_cifs_idmap:
+#ifdef CONFIG_CIFS_ACL
+ exit_cifs_idmap();
+out_register_key_type:
#endif
+#ifdef CONFIG_CIFS_UPCALL
+ unregister_key_type(&cifs_spnego_key_type);
out_destroy_request_bufs:
+#endif
cifs_destroy_request_bufs();
out_destroy_mids:
cifs_destroy_mids();
@@ -1056,6 +1261,8 @@ out_destroy_inodecache:
cifs_destroy_inodecache();
out_unreg_fscache:
cifs_fscache_unregister();
+out_destroy_wq:
+ destroy_workqueue(cifsiod_wq);
out_clean_proc:
cifs_proc_clean();
return rc;
@@ -1064,19 +1271,21 @@ out_clean_proc:
static void __exit
exit_cifs(void)
{
- cFYI(DBG2, "exit_cifs");
- cifs_proc_clean();
- cifs_fscache_unregister();
-#ifdef CONFIG_CIFS_DFS_UPCALL
+ cifs_dbg(NOISY, "exit_cifs\n");
+ unregister_filesystem(&cifs_fs_type);
cifs_dfs_release_automount_timer();
+#ifdef CONFIG_CIFS_ACL
+ exit_cifs_idmap();
#endif
#ifdef CONFIG_CIFS_UPCALL
unregister_key_type(&cifs_spnego_key_type);
#endif
- unregister_filesystem(&cifs_fs_type);
- cifs_destroy_inodecache();
- cifs_destroy_mids();
cifs_destroy_request_bufs();
+ cifs_destroy_mids();
+ cifs_destroy_inodecache();
+ cifs_fscache_unregister();
+ destroy_workqueue(cifsiod_wq);
+ cifs_proc_clean();
}
MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a9371b6578c..70f178a7c75 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -22,20 +22,28 @@
#ifndef _CIFSFS_H
#define _CIFSFS_H
+#include <linux/hash.h>
+
#define ROOT_I 2
/*
* ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
- * so that it will fit.
+ * so that it will fit. We use hash_64 to convert the value to 31 bits, and
+ * then add 1, to ensure that we don't end up with a 0 as the value.
*/
+#if BITS_PER_LONG == 64
static inline ino_t
cifs_uniqueid_to_ino_t(u64 fileid)
{
- ino_t ino = (ino_t) fileid;
- if (sizeof(ino_t) < sizeof(u64))
- ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8;
- return ino;
+ return (ino_t)fileid;
}
+#else
+static inline ino_t
+cifs_uniqueid_to_ino_t(u64 fileid)
+{
+ return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
+}
+#endif
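A small sanity sketch (an assumption for illustration, not part of the patch) of the 32-bit mapping above: hash_64(fileid, 31) is always below 2^31, so adding 1 yields a nonzero inode number that still fits in a 32-bit ino_t.

static inline void cifs_example_ino_check(u64 fileid)
{
	ino_t ino = cifs_uniqueid_to_ino_t(fileid);

	/* can never fire: the result is in the range [1, 2^31] */
	WARN_ON(ino == 0);
}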
extern struct file_system_type cifs_fs_type;
extern const struct address_space_operations cifs_addr_ops;
@@ -47,21 +55,28 @@ extern void cifs_sb_deactive(struct super_block *sb);
/* Functions related to inodes */
extern const struct inode_operations cifs_dir_inode_ops;
-extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
-extern int cifs_create(struct inode *, struct dentry *, int,
- struct nameidata *);
+extern struct inode *cifs_root_iget(struct super_block *);
+extern int cifs_create(struct inode *, struct dentry *, umode_t,
+ bool excl);
+extern int cifs_atomic_open(struct inode *, struct dentry *,
+ struct file *, unsigned, umode_t,
+ int *);
extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
- struct nameidata *);
+ unsigned int);
extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
-extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
-extern int cifs_mkdir(struct inode *, struct dentry *, int);
+extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
extern int cifs_rmdir(struct inode *, struct dentry *);
extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
struct dentry *);
+extern int cifs_revalidate_file_attr(struct file *filp);
+extern int cifs_revalidate_dentry_attr(struct dentry *);
extern int cifs_revalidate_file(struct file *filp);
extern int cifs_revalidate_dentry(struct dentry *);
-extern void cifs_invalidate_mapping(struct inode *inode);
+extern int cifs_invalidate_mapping(struct inode *inode);
+extern int cifs_revalidate_mapping(struct inode *inode);
+extern int cifs_zap_mapping(struct inode *inode);
extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int cifs_setattr(struct dentry *, struct iattr *);
@@ -80,23 +95,19 @@ extern const struct file_operations cifs_file_strict_nobrl_ops;
extern int cifs_open(struct inode *inode, struct file *file);
extern int cifs_close(struct inode *inode, struct file *file);
extern int cifs_closedir(struct inode *inode, struct file *file);
-extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
- size_t read_size, loff_t *poffset);
-extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
- size_t write_size, loff_t *poffset);
-extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
+extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
extern int cifs_lock(struct file *, int, struct file_lock *);
-extern int cifs_fsync(struct file *, int);
-extern int cifs_strict_fsync(struct file *, int);
+extern int cifs_fsync(struct file *, loff_t, loff_t, int);
+extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_flush(struct file *, fl_owner_t id);
extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
-extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
+extern int cifs_readdir(struct file *file, struct dir_context *ctx);
/* Functions related to dir entries */
extern const struct dentry_operations cifs_dentry_ops;
@@ -110,8 +121,6 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
/* Functions related to symlinks */
extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
-extern void cifs_put_link(struct dentry *direntry,
- struct nameidata *nd, void *);
extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
int buflen);
extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
@@ -123,9 +132,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_NFSD_EXPORT
extern const struct export_operations cifs_export_ops;
-#endif /* EXPERIMENTAL */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "1.71"
+#define CIFS_VERSION "2.03"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 17afb0fbcae..de6aed8c78e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -22,11 +22,18 @@
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/slab.h>
+#include <linux/mempool.h>
#include <linux/workqueue.h>
#include "cifs_fs_sb.h"
#include "cifsacl.h"
#include <crypto/internal/hash.h>
#include <linux/scatterlist.h>
+#include <uapi/linux/cifs/cifs_mount.h>
+#ifdef CONFIG_CIFS_SMB2
+#include "smb2pdu.h"
+#endif
+
+#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
/*
* The sizes of various internal tables and strings
@@ -35,15 +42,11 @@
#define MAX_SES_INFO 2
#define MAX_TCON_INFO 4
-#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
-#define MAX_SERVER_SIZE 15
-#define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */
-#define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null
- termination then *2 for unicode versions */
-#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
+#define MAX_TREE_SIZE (2 + CIFS_NI_MAXHOST + 1 + CIFS_MAX_SHARE_LEN + 1)
#define CIFS_MIN_RCV_POOL 4
+#define MAX_REOPEN_ATT 5 /* maximum number of attempts to reopen a file */
/*
* default attribute cache timeout (jiffies)
*/
@@ -56,14 +59,9 @@
/*
* MAX_REQ is the maximum number of requests that WE will send
- * on one socket concurrently. It also matches the most common
- * value of max multiplex returned by servers. We may
- * eventually want to use the negotiated value (in case
- * future servers can handle more) when we are more confident that
- * we will not have problems oveloading the socket with pending
- * write data.
+ * on one socket concurrently.
*/
-#define CIFS_MAX_REQ 50
+#define CIFS_MAX_REQ 32767
#define RFC1001_NAME_LEN 15
#define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1)
@@ -77,6 +75,9 @@
/* (max path length + 1 for null) * 2 for unicode */
#define MAX_NAME 514
+/* SMB echo "timeout" -- FIXME: tunable? */
+#define SMB_ECHO_INTERVAL (60 * HZ)
+
#include "cifspdu.h"
#ifndef XATTR_DOS_ATTRIB
@@ -92,24 +93,19 @@ enum statusEnum {
CifsNew = 0,
CifsGood,
CifsExiting,
- CifsNeedReconnect
+ CifsNeedReconnect,
+ CifsNeedNegotiate
};
enum securityEnum {
- LANMAN = 0, /* Legacy LANMAN auth */
+ Unspecified = 0, /* not specified */
+ LANMAN, /* Legacy LANMAN auth */
NTLM, /* Legacy NTLM012 auth with NTLM hash */
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
-/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
Kerberos, /* Kerberos via SPNEGO */
};
-enum protocolEnum {
- TCP = 0,
- SCTP
- /* Netbios frames protocol not supported at this time */
-};
-
struct session_key {
unsigned int len;
char *response;
@@ -125,12 +121,17 @@ struct sdesc {
struct cifs_secmech {
struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
struct crypto_shash *md5; /* md5 hash function */
+ struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
+ struct crypto_shash *cmacaes; /* block-cipher based MAC function */
struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
+ struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
+ struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */
};
/* per smb session structure/fields */
struct ntlmssp_auth {
+ bool sesskey_per_smbsess; /* whether session key is per smb session */
__u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */
__u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
@@ -155,12 +156,382 @@ struct cifs_cred {
*****************************************************************
*/
+/*
+ * A smb_rqst represents a complete request to be issued to a server. It's
+ * formed by a kvec array, followed by an array of pages. Page data is assumed
+ * to start at the beginning of the first page.
+ */
+struct smb_rqst {
+ struct kvec *rq_iov; /* array of kvecs */
+ unsigned int rq_nvec; /* number of kvecs in array */
+ struct page **rq_pages; /* pointer to array of page ptrs */
+ unsigned int rq_npages; /* number pages in array */
+ unsigned int rq_pagesz; /* page size to use */
+ unsigned int rq_tailsz; /* length of last page */
+};
+
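A hypothetical sketch (names and caller are assumptions, not part of the patch) of how a sender might describe a request with one header kvec followed by a single page of payload using the new structure:

static void cifs_example_fill_rqst(struct smb_rqst *rqst, struct kvec *hdr_iov,
				   struct page **pages)
{
	rqst->rq_iov = hdr_iov;		/* protocol header(s) first ... */
	rqst->rq_nvec = 1;
	rqst->rq_pages = pages;		/* ... then the page data */
	rqst->rq_npages = 1;
	rqst->rq_pagesz = PAGE_SIZE;	/* pages are full-sized ... */
	rqst->rq_tailsz = PAGE_SIZE;	/* ... including the last one */
}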
+enum smb_version {
+ Smb_1 = 1,
+ Smb_20,
+ Smb_21,
+ Smb_30,
+ Smb_302,
+};
+
+struct mid_q_entry;
+struct TCP_Server_Info;
+struct cifsFileInfo;
+struct cifs_ses;
+struct cifs_tcon;
+struct dfs_info3_param;
+struct cifs_fattr;
+struct smb_vol;
+struct cifs_fid;
+struct cifs_readdata;
+struct cifs_writedata;
+struct cifs_io_parms;
+struct cifs_search_info;
+struct cifsInodeInfo;
+struct cifs_open_parms;
+
+struct smb_version_operations {
+ int (*send_cancel)(struct TCP_Server_Info *, void *,
+ struct mid_q_entry *);
+ bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *);
+ /* setup request: allocate mid, sign message */
+ struct mid_q_entry *(*setup_request)(struct cifs_ses *,
+ struct smb_rqst *);
+ /* setup async request: allocate mid, sign message */
+ struct mid_q_entry *(*setup_async_request)(struct TCP_Server_Info *,
+ struct smb_rqst *);
+ /* check response: verify signature, map error */
+ int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
+ bool);
+ void (*add_credits)(struct TCP_Server_Info *, const unsigned int,
+ const int);
+ void (*set_credits)(struct TCP_Server_Info *, const int);
+ int * (*get_credits_field)(struct TCP_Server_Info *, const int);
+ unsigned int (*get_credits)(struct mid_q_entry *);
+ __u64 (*get_next_mid)(struct TCP_Server_Info *);
+ /* data offset from read response message */
+ unsigned int (*read_data_offset)(char *);
+ /* data length from read response message */
+ unsigned int (*read_data_length)(char *);
+ /* map smb to linux error */
+ int (*map_error)(char *, bool);
+ /* find mid corresponding to the response message */
+ struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *);
+ void (*dump_detail)(void *);
+ void (*clear_stats)(struct cifs_tcon *);
+ void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
+ void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
+ /* verify the message */
+ int (*check_message)(char *, unsigned int);
+ bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+ void (*downgrade_oplock)(struct TCP_Server_Info *,
+ struct cifsInodeInfo *, bool);
+ /* process transaction2 response */
+ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
+ char *, int);
+ /* check if we need to negotiate */
+ bool (*need_neg)(struct TCP_Server_Info *);
+ /* negotiate to the server */
+ int (*negotiate)(const unsigned int, struct cifs_ses *);
+ /* set negotiated write size */
+ unsigned int (*negotiate_wsize)(struct cifs_tcon *, struct smb_vol *);
+ /* set negotiated read size */
+ unsigned int (*negotiate_rsize)(struct cifs_tcon *, struct smb_vol *);
+ /* setup smb session */
+ int (*sess_setup)(const unsigned int, struct cifs_ses *,
+ const struct nls_table *);
+ /* close smb session */
+ int (*logoff)(const unsigned int, struct cifs_ses *);
+ /* connect to a server share */
+ int (*tree_connect)(const unsigned int, struct cifs_ses *, const char *,
+ struct cifs_tcon *, const struct nls_table *);
+ /* close tree connection */
+ int (*tree_disconnect)(const unsigned int, struct cifs_tcon *);
+ /* get DFS referrals */
+ int (*get_dfs_refer)(const unsigned int, struct cifs_ses *,
+ const char *, struct dfs_info3_param **,
+ unsigned int *, const struct nls_table *, int);
+ /* informational QFS call */
+ void (*qfs_tcon)(const unsigned int, struct cifs_tcon *);
+ /* check if a path is accessible or not */
+ int (*is_path_accessible)(const unsigned int, struct cifs_tcon *,
+ struct cifs_sb_info *, const char *);
+ /* query path data from the server */
+ int (*query_path_info)(const unsigned int, struct cifs_tcon *,
+ struct cifs_sb_info *, const char *,
+ FILE_ALL_INFO *, bool *, bool *);
+ /* query file data from the server */
+ int (*query_file_info)(const unsigned int, struct cifs_tcon *,
+ struct cifs_fid *, FILE_ALL_INFO *);
+ /* get server index number */
+ int (*get_srv_inum)(const unsigned int, struct cifs_tcon *,
+ struct cifs_sb_info *, const char *,
+ u64 *uniqueid, FILE_ALL_INFO *);
+ /* set size by path */
+ int (*set_path_size)(const unsigned int, struct cifs_tcon *,
+ const char *, __u64, struct cifs_sb_info *, bool);
+ /* set size by file handle */
+ int (*set_file_size)(const unsigned int, struct cifs_tcon *,
+ struct cifsFileInfo *, __u64, bool);
+ /* set attributes */
+ int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
+ const unsigned int);
+ int (*set_compression)(const unsigned int, struct cifs_tcon *,
+ struct cifsFileInfo *);
+ /* check if we can send an echo or not */
+ bool (*can_echo)(struct TCP_Server_Info *);
+ /* send echo request */
+ int (*echo)(struct TCP_Server_Info *);
+ /* create directory */
+ int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *,
+ struct cifs_sb_info *);
+ /* set info on created directory */
+ void (*mkdir_setinfo)(struct inode *, const char *,
+ struct cifs_sb_info *, struct cifs_tcon *,
+ const unsigned int);
+ /* remove directory */
+ int (*rmdir)(const unsigned int, struct cifs_tcon *, const char *,
+ struct cifs_sb_info *);
+ /* unlink file */
+ int (*unlink)(const unsigned int, struct cifs_tcon *, const char *,
+ struct cifs_sb_info *);
+ /* open, rename and delete file */
+ int (*rename_pending_delete)(const char *, struct dentry *,
+ const unsigned int);
+ /* send rename request */
+ int (*rename)(const unsigned int, struct cifs_tcon *, const char *,
+ const char *, struct cifs_sb_info *);
+ /* send create hardlink request */
+ int (*create_hardlink)(const unsigned int, struct cifs_tcon *,
+ const char *, const char *,
+ struct cifs_sb_info *);
+ /* query symlink target */
+ int (*query_symlink)(const unsigned int, struct cifs_tcon *,
+ const char *, char **, struct cifs_sb_info *);
+ /* open a file for non-posix mounts */
+ int (*open)(const unsigned int, struct cifs_open_parms *,
+ __u32 *, FILE_ALL_INFO *);
+ /* set fid protocol-specific info */
+ void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
+ /* close a file */
+ void (*close)(const unsigned int, struct cifs_tcon *,
+ struct cifs_fid *);
+ /* send a flush request to the server */
+ int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *);
+ /* async read from the server */
+ int (*async_readv)(struct cifs_readdata *);
+ /* async write to the server */
+ int (*async_writev)(struct cifs_writedata *,
+ void (*release)(struct kref *));
+ /* sync read from the server */
+ int (*sync_read)(const unsigned int, struct cifsFileInfo *,
+ struct cifs_io_parms *, unsigned int *, char **,
+ int *);
+ /* sync write to the server */
+ int (*sync_write)(const unsigned int, struct cifsFileInfo *,
+ struct cifs_io_parms *, unsigned int *, struct kvec *,
+ unsigned long);
+ /* open dir, start readdir */
+ int (*query_dir_first)(const unsigned int, struct cifs_tcon *,
+ const char *, struct cifs_sb_info *,
+ struct cifs_fid *, __u16,
+ struct cifs_search_info *);
+ /* continue readdir */
+ int (*query_dir_next)(const unsigned int, struct cifs_tcon *,
+ struct cifs_fid *,
+ __u16, struct cifs_search_info *srch_inf);
+ /* close dir */
+ int (*close_dir)(const unsigned int, struct cifs_tcon *,
+ struct cifs_fid *);
+ /* calculate the size of an SMB message */
+ unsigned int (*calc_smb_size)(void *);
+ /* check for STATUS_PENDING and process it in a positive case */
+ bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
+ /* send oplock break response */
+ int (*oplock_response)(struct cifs_tcon *, struct cifs_fid *,
+ struct cifsInodeInfo *);
+ /* query remote filesystem */
+ int (*queryfs)(const unsigned int, struct cifs_tcon *,
+ struct kstatfs *);
+ /* send mandatory brlock to the server */
+ int (*mand_lock)(const unsigned int, struct cifsFileInfo *, __u64,
+ __u64, __u32, int, int, bool);
+ /* unlock range of mandatory locks */
+ int (*mand_unlock_range)(struct cifsFileInfo *, struct file_lock *,
+ const unsigned int);
+ /* push brlocks from the cache to the server */
+ int (*push_mand_locks)(struct cifsFileInfo *);
+ /* get lease key of the inode */
+ void (*get_lease_key)(struct inode *, struct cifs_fid *);
+ /* set lease key of the inode */
+ void (*set_lease_key)(struct inode *, struct cifs_fid *);
+ /* generate new lease key */
+ void (*new_lease_key)(struct cifs_fid *);
+ int (*generate_signingkey)(struct cifs_ses *);
+ int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
+ int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
+ struct cifs_sb_info *, const unsigned char *,
+ char *, unsigned int *);
+ int (*create_mf_symlink)(unsigned int, struct cifs_tcon *,
+ struct cifs_sb_info *, const unsigned char *,
+ char *, unsigned int *);
+ /* if we can do cache read operations */
+ bool (*is_read_op)(__u32);
+ /* set oplock level for the inode */
+ void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int,
+ bool *);
+ /* create lease context buffer for CREATE request */
+ char * (*create_lease_buf)(u8 *, u8);
+ /* parse lease context buffer and return oplock/epoch info */
+ __u8 (*parse_lease_buf)(void *, unsigned int *);
+ int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
+ struct cifsFileInfo *target_file, u64 src_off, u64 len,
+ u64 dest_off);
+ int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
+ ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *,
+ const unsigned char *, const unsigned char *, char *,
+ size_t, const struct nls_table *, int);
+ int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
+ const char *, const void *, const __u16,
+ const struct nls_table *, int);
+ struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *,
+ const char *, u32 *);
+ struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *,
+ const struct cifs_fid *, u32 *);
+ int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
+ int);
+};
+
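The operations table is the core of the per-dialect split: callers dispatch through server->ops, as cifs_statfs() does above with ->queryfs, and each dialect provides its own table. A minimal shape-only sketch, with illustrative names that are not part of the patch:

static int example_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
			   struct kstatfs *buf)
{
	/* a real dialect would issue its own QFS call here */
	buf->f_files = 0;
	return 0;
}

static struct smb_version_operations example_ops = {
	.queryfs = example_queryfs,
};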
+struct smb_version_values {
+ char *version_string;
+ __u16 protocol_id;
+ __u32 req_capabilities;
+ __u32 large_lock_type;
+ __u32 exclusive_lock_type;
+ __u32 shared_lock_type;
+ __u32 unlock_lock_type;
+ size_t header_size;
+ size_t max_header_size;
+ size_t read_rsp_size;
+ __le16 lock_cmd;
+ unsigned int cap_unix;
+ unsigned int cap_nt_find;
+ unsigned int cap_large_files;
+ __u16 signing_enabled;
+ __u16 signing_required;
+ size_t create_lease_size;
+};
+
+#define HEADER_SIZE(server) (server->vals->header_size)
+#define MAX_HEADER_SIZE(server) (server->vals->max_header_size)
+
+struct smb_vol {
+ char *username;
+ char *password;
+ char *domainname;
+ char *UNC;
+ char *iocharset; /* local code page for mapping to and from Unicode */
+ char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
+ char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
+ kuid_t cred_uid;
+ kuid_t linux_uid;
+ kgid_t linux_gid;
+ kuid_t backupuid;
+ kgid_t backupgid;
+ umode_t file_mode;
+ umode_t dir_mode;
+ enum securityEnum sectype; /* sectype requested via mnt opts */
+ bool sign; /* was signing requested via mnt opts? */
+ bool retry:1;
+ bool intr:1;
+ bool setuids:1;
+ bool override_uid:1;
+ bool override_gid:1;
+ bool dynperm:1;
+ bool noperm:1;
+ bool no_psx_acl:1; /* set if posix acl support should be disabled */
+ bool cifs_acl:1;
+ bool backupuid_specified; /* mount option backupuid is specified */
+ bool backupgid_specified; /* mount option backupgid is specified */
+ bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
+ bool server_ino:1; /* use inode numbers from server ie UniqueId */
+ bool direct_io:1;
+ bool strict_io:1; /* strict cache behavior */
+ bool remap:1; /* set to remap seven reserved chars in filenames */
+ bool posix_paths:1; /* unset to not ask for posix pathnames. */
+ bool no_linux_ext:1;
+ bool sfu_emul:1;
+ bool nullauth:1; /* attempt to authenticate with null user */
+ bool nocase:1; /* request case insensitive filenames */
+ bool nobrl:1; /* disable sending byte range locks to srv */
+ bool mand_lock:1; /* send mandatory not posix byte range lock reqs */
+ bool seal:1; /* request transport encryption on share */
+ bool nodfs:1; /* Do not request DFS, even if available */
+ bool local_lease:1; /* check leases only on local system, not remote */
+ bool noblocksnd:1;
+ bool noautotune:1;
+ bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
+ bool fsc:1; /* enable fscache */
+ bool mfsymlinks:1; /* use Minshall+French Symlinks */
+ bool multiuser:1;
+ bool rwpidforward:1; /* pid forward for read/write operations */
+ bool nosharesock;
+ unsigned int rsize;
+ unsigned int wsize;
+ bool sockopt_tcp_nodelay:1;
+ unsigned long actimeo; /* attribute cache timeout (jiffies) */
+ struct smb_version_operations *ops;
+ struct smb_version_values *vals;
+ char *prepath;
+ struct sockaddr_storage dstaddr; /* destination address */
+ struct sockaddr_storage srcaddr; /* allow binding to a local IP */
+ struct nls_table *local_nls;
+};
+
+#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
+ CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \
+ CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \
+ CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \
+ CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \
+ CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
+ CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \
+ CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
+ CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
+ CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID)
+
+#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \
+ MS_NODEV | MS_SYNCHRONOUS)
+
+struct cifs_mnt_data {
+ struct cifs_sb_info *cifs_sb;
+ struct smb_vol *vol;
+ int flags;
+};
+
+static inline unsigned int
+get_rfc1002_length(void *buf)
+{
+ return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
+}
+
+static inline void
+inc_rfc1001_len(void *buf, int count)
+{
+ be32_add_cpu((__be32 *)buf, count);
+}
+
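For reference, a standalone userspace illustration (not part of the patch) of the framing these two helpers manipulate: the first four bytes of every message are a big-endian word whose low 24 bits carry the length of the data that follows.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

int main(void)
{
	unsigned char frame[4];
	uint32_t be, len = 72;			/* pretend SMB length */

	be = htonl(len);
	memcpy(frame, &be, 4);			/* write the length word */

	memcpy(&be, frame, 4);			/* get_rfc1002_length() */
	printf("length: %u\n", ntohl(be) & 0xffffff);

	be = htonl(ntohl(be) + 16);		/* inc_rfc1001_len(buf, 16) */
	memcpy(frame, &be, 4);
	memcpy(&be, frame, 4);
	printf("after adding 16: %u\n", ntohl(be) & 0xffffff);
	return 0;
}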
struct TCP_Server_Info {
struct list_head tcp_ses_list;
struct list_head smb_ses_list;
int srv_count; /* reference counter */
/* 15 character server name + 0x20 16th byte indicating type = srv */
char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
+ struct smb_version_operations *ops;
+ struct smb_version_values *vals;
enum statusEnum tcpStatus; /* what we think the status is */
char *hostname; /* hostname portion of UNC string */
struct socket *ssocket;
@@ -175,14 +546,23 @@ struct TCP_Server_Info {
bool noblocksnd; /* use blocking sendmsg */
bool noautotune; /* do not autotune send buf sizes */
bool tcp_nodelay;
- atomic_t inFlight; /* number of requests on the wire to server */
+ int credits; /* send no more requests at once */
+ unsigned int in_flight; /* number of requests on the wire to server */
+ spinlock_t req_lock; /* protect the two values above */
struct mutex srv_mutex;
struct task_struct *tsk;
char server_GUID[16];
- char secMode;
+ __u16 sec_mode;
+ bool sign; /* is signing enabled on this connection? */
bool session_estab; /* mark when very first sess is established */
+#ifdef CONFIG_CIFS_SMB2
+ int echo_credits; /* echo reserved slots */
+ int oplock_credits; /* oplock break reserved slots */
+ bool echoes:1; /* enable echoes */
+ __u8 client_guid[SMB2_CLIENT_GUID_SIZE]; /* Client GUID */
+#endif
u16 dialect; /* dialect index that server chose */
- enum securityEnum secType;
+ bool oplocks:1; /* enable oplocks */
unsigned int maxReq; /* Clients should submit no more */
/* than maxReq distinct unanswered SMBs to the server when using */
/* multiplexed reads or writes */
@@ -193,12 +573,9 @@ struct TCP_Server_Info {
unsigned int max_rw; /* maxRw specifies the maximum */
/* message size the server can send or receive for */
/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
- unsigned int max_vcs; /* maximum number of smb sessions, at least
- those that can be specified uniquely with
- vcnumbers */
- int capabilities; /* allow selective disabling of caps by smb sess */
+ unsigned int capabilities; /* selective disabling of caps by smb sess */
int timeAdj; /* Adjust for difference in server time zone in sec */
- __u16 CurrentMid; /* multiplex id - rotating counter */
+ __u64 CurrentMid; /* multiplex id - rotating counter */
char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
/* 16th byte of RFC1001 workstation name is always null */
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
@@ -206,21 +583,142 @@ struct TCP_Server_Info {
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
+#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */
+#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */
+#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */
+ char negflavor; /* NEGOTIATE response flavor */
/* extended security flavors that server supports */
bool sec_ntlmssp; /* supports NTLMSSP */
bool sec_kerberosu2u; /* supports U2U Kerberos */
bool sec_kerberos; /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
+ bool large_buf; /* is current buffer large? */
struct delayed_work echo; /* echo ping workqueue job */
+ struct kvec *iov; /* reusable kvec array for receives */
+ unsigned int nr_iov; /* number of kvecs in array */
+ char *smallbuf; /* pointer to current "small" buffer */
+ char *bigbuf; /* pointer to current "big" buffer */
+ unsigned int total_read; /* total amount of data read in this pass */
#ifdef CONFIG_CIFS_FSCACHE
struct fscache_cookie *fscache; /* client index cache cookie */
#endif
#ifdef CONFIG_CIFS_STATS2
- atomic_t inSend; /* requests trying to send */
+ atomic_t in_send; /* requests trying to send */
atomic_t num_waiters; /* blocked waiting to get in sendrecv */
#endif
+#ifdef CONFIG_CIFS_SMB2
+ unsigned int max_read;
+ unsigned int max_write;
+#endif /* CONFIG_CIFS_SMB2 */
};
+static inline unsigned int
+in_flight(struct TCP_Server_Info *server)
+{
+ unsigned int num;
+ spin_lock(&server->req_lock);
+ num = server->in_flight;
+ spin_unlock(&server->req_lock);
+ return num;
+}
+
+static inline bool
+has_credits(struct TCP_Server_Info *server, int *credits)
+{
+ int num;
+ spin_lock(&server->req_lock);
+ num = *credits;
+ spin_unlock(&server->req_lock);
+ return num > 0;
+}
+
+static inline void
+add_credits(struct TCP_Server_Info *server, const unsigned int add,
+ const int optype)
+{
+ server->ops->add_credits(server, add, optype);
+}
+
+static inline void
+set_credits(struct TCP_Server_Info *server, const int val)
+{
+ server->ops->set_credits(server, val);
+}
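The credits/in_flight pair above replaces the old inFlight atomic: both counters are only touched under req_lock, and the add_credits()/set_credits() wrappers let each dialect's server->ops decide how credits are handed back. A rough sketch of how a sender could consume one credit before putting a request on the wire; the helper name is hypothetical, and the real logic in transport.c also waits for credits and distinguishes operation types:

/* illustrative only: take one credit and account for an in-flight request */
static int example_take_credit(struct TCP_Server_Info *server)
{
	spin_lock(&server->req_lock);
	if (server->credits <= 0) {
		spin_unlock(&server->req_lock);
		return -EAGAIN;		/* caller would wait and retry */
	}
	server->credits--;
	server->in_flight++;
	spin_unlock(&server->req_lock);
	return 0;
}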
+
+static inline __u64
+get_next_mid64(struct TCP_Server_Info *server)
+{
+ return server->ops->get_next_mid(server);
+}
+
+static inline __le16
+get_next_mid(struct TCP_Server_Info *server)
+{
+ __u16 mid = get_next_mid64(server);
+ /*
+ * The value in the SMB header should be little endian for easy
+ * on-the-wire decoding.
+ */
+ return cpu_to_le16(mid);
+}
+
+static inline __u16
+get_mid(const struct smb_hdr *smb)
+{
+ return le16_to_cpu(smb->Mid);
+}
+
+static inline bool
+compare_mid(__u16 mid, const struct smb_hdr *smb)
+{
+ return mid == le16_to_cpu(smb->Mid);
+}
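With CurrentMid widened to 64 bits, the helpers above keep SMB1 callers working on 16-bit, little-endian mids while SMB2 can use the full width. A small sketch of how a caller might stamp an outgoing SMB1 header and match a response against it (illustrative only, not code from the patch):

/* illustrative only: stamp a request mid and match a reply against it */
static bool example_reply_matches(struct TCP_Server_Info *server,
				  struct smb_hdr *req, struct smb_hdr *rsp)
{
	req->Mid = get_next_mid(server);	/* already little endian */
	return compare_mid(get_mid(req), rsp);
}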
+
+/*
+ * When the server supports very large reads and writes via POSIX extensions,
+ * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
+ * including the RFC1001 length.
+ *
+ * Note, however, that this can make for "interesting" allocation problems
+ * during writeback, as we have to allocate an array of pointers for the
+ * pages. A 16M write means a ~32kb page array with PAGE_CACHE_SIZE == 4096.
+ *
+ * For reads, there is a similar problem as we need to allocate an array
+ * of kvecs to handle the receive, though that should only need to be done
+ * once.
+ */
+#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4)
+
+/*
+ * When the server doesn't allow large posix writes, only allow a rsize/wsize
+ * of 2^17-1 minus the size of the call header. That allows for a read or
+ * write up to the maximum size described by RFC1002.
+ */
+#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
+
+/*
+ * The default wsize is 1M. find_get_pages seems to return a maximum of 256
+ * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
+ * a single wsize request with a single call.
+ */
+#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
+
+/*
+ * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
+ * those values when posix extensions aren't in force. In practice we use
+ * 65536 to allow for writes that are a multiple of 4k; most servers seem to
+ * be ok with the extra byte even though Windows doesn't send writes that
+ * large.
+ *
+ * Citation:
+ *
+ * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
+ */
+#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
+#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
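Taken together, the constants above give a default and a hard cap for each case (posix extensions on or off). A sketch of how they might combine when sizing writes at mount time; the helper is hypothetical, and the in-tree negotiation also honors what the server advertises:

/* illustrative only: pick a wsize from the defaults and caps above */
static unsigned int example_pick_wsize(bool unix_ext, unsigned int requested)
{
	unsigned int wsize = requested ? requested :
		(unix_ext ? CIFS_DEFAULT_IOSIZE : CIFS_DEFAULT_NON_POSIX_WSIZE);
	unsigned int max = unix_ext ? CIFS_MAX_WSIZE : CIFS_MAX_RFC1002_WSIZE;

	if (wsize > max)
		wsize = max;
	return wsize;
}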
+
/*
* Macros to allow the TCP_Server_Info->net field and related code to drop out
* when CONFIG_NET_NS isn't set.
@@ -254,7 +752,7 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
/*
* Session structure. One of these for each uid session with a particular host
*/
-struct cifsSesInfo {
+struct cifs_ses {
struct list_head smb_ses_list;
struct list_head tcon_list;
struct mutex session_mutex;
@@ -263,70 +761,84 @@ struct cifsSesInfo {
enum statusEnum status;
unsigned overrideSecFlg; /* if non-zero override global sec flags */
__u16 ipc_tid; /* special tid for connection to IPC share */
- __u16 flags;
- __u16 vcnum;
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
char *serverDomain; /* security realm of server */
- int Suid; /* remote smb uid */
- uid_t linux_uid; /* overriding owner of files on the mount */
- uid_t cred_uid; /* owner of credentials */
- int capabilities;
+ __u64 Suid; /* remote smb uid */
+ kuid_t linux_uid; /* overriding owner of files on the mount */
+ kuid_t cred_uid; /* owner of credentials */
+ unsigned int capabilities;
char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
TCP names - will ipv6 and sctp addresses fit? */
- char userName[MAX_USERNAME_SIZE + 1];
+ char *user_name; /* must not be null except during sess init; filled in
+ after mount option parsing */
char *domainName;
char *password;
struct session_key auth_key;
struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
+ enum securityEnum sectype; /* what security flavor was specified? */
+ bool sign; /* is signing required? */
bool need_reconnect:1; /* connection reset, uid now invalid */
+#ifdef CONFIG_CIFS_SMB2
+ __u16 session_flags;
+ char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
+#endif /* CONFIG_CIFS_SMB2 */
};
-/* no more than one of the following three session flags may be set */
-#define CIFS_SES_NT4 1
-#define CIFS_SES_OS2 2
-#define CIFS_SES_W9X 4
-/* following flag is set for old servers such as OS2 (and Win95?)
- which do not negotiate NTLM or POSIX dialects, but instead
- negotiate one of the older LANMAN dialects */
-#define CIFS_SES_LANMAN 8
+
+static inline bool
+cap_unix(struct cifs_ses *ses)
+{
+ return ses->server->vals->cap_unix & ses->capabilities;
+}
+
/*
* there is one of these for each connection to a resource on a particular
* session
*/
-struct cifsTconInfo {
+struct cifs_tcon {
struct list_head tcon_list;
int tc_count;
struct list_head openFileList;
- struct cifsSesInfo *ses; /* pointer to session associated with */
+ struct cifs_ses *ses; /* pointer to session associated with */
char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
char *nativeFileSystem;
char *password; /* for share-level security */
- __u16 tid; /* The 2 byte tree id */
+ __u32 tid; /* The 4 byte tree id */
__u16 Flags; /* optional support bits */
enum statusEnum tidStatus;
#ifdef CONFIG_CIFS_STATS
atomic_t num_smbs_sent;
- atomic_t num_writes;
- atomic_t num_reads;
- atomic_t num_flushes;
- atomic_t num_oplock_brks;
- atomic_t num_opens;
- atomic_t num_closes;
- atomic_t num_deletes;
- atomic_t num_mkdirs;
- atomic_t num_posixopens;
- atomic_t num_posixmkdirs;
- atomic_t num_rmdirs;
- atomic_t num_renames;
- atomic_t num_t2renames;
- atomic_t num_ffirst;
- atomic_t num_fnext;
- atomic_t num_fclose;
- atomic_t num_hardlinks;
- atomic_t num_symlinks;
- atomic_t num_locks;
- atomic_t num_acl_get;
- atomic_t num_acl_set;
+ union {
+ struct {
+ atomic_t num_writes;
+ atomic_t num_reads;
+ atomic_t num_flushes;
+ atomic_t num_oplock_brks;
+ atomic_t num_opens;
+ atomic_t num_closes;
+ atomic_t num_deletes;
+ atomic_t num_mkdirs;
+ atomic_t num_posixopens;
+ atomic_t num_posixmkdirs;
+ atomic_t num_rmdirs;
+ atomic_t num_renames;
+ atomic_t num_t2renames;
+ atomic_t num_ffirst;
+ atomic_t num_fnext;
+ atomic_t num_fclose;
+ atomic_t num_hardlinks;
+ atomic_t num_symlinks;
+ atomic_t num_locks;
+ atomic_t num_acl_get;
+ atomic_t num_acl_set;
+ } cifs_stats;
+#ifdef CONFIG_CIFS_SMB2
+ struct {
+ atomic_t smb2_com_sent[NUMBER_OF_SMB2_COMMANDS];
+ atomic_t smb2_com_failed[NUMBER_OF_SMB2_COMMANDS];
+ } smb2_stats;
+#endif /* CONFIG_CIFS_SMB2 */
+ } stats;
#ifdef CONFIG_CIFS_STATS2
unsigned long long time_writes;
unsigned long long time_reads;
@@ -357,10 +869,25 @@ struct cifsTconInfo {
bool local_lease:1; /* check leases (only) on local system not remote */
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool need_reconnect:1; /* connection reset, tid now invalid */
+#ifdef CONFIG_CIFS_SMB2
+ bool print:1; /* set if connection to printer share */
+ bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
+ __le32 capabilities;
+ __u32 share_flags;
+ __u32 maximal_access;
+ __u32 vol_serial_number;
+ __le64 vol_create_time;
+ __u32 ss_flags; /* sector size flags */
+ __u32 perf_sector_size; /* best sector size for perf */
+ __u32 max_chunks;
+ __u32 max_bytes_chunk;
+ __u32 max_bytes_copy;
+#endif /* CONFIG_CIFS_SMB2 */
#ifdef CONFIG_CIFS_FSCACHE
u64 resource_id; /* server resource id */
struct fscache_cookie *fscache; /* cookie for share */
#endif
+ struct list_head pending_opens; /* list of incomplete opens */
/* BB add field for back pointer to sb struct(s)? */
};
@@ -372,19 +899,19 @@ struct cifsTconInfo {
*/
struct tcon_link {
struct rb_node tl_rbnode;
- uid_t tl_uid;
+ kuid_t tl_uid;
unsigned long tl_flags;
#define TCON_LINK_MASTER 0
#define TCON_LINK_PENDING 1
#define TCON_LINK_IN_TREE 2
unsigned long tl_time;
atomic_t tl_count;
- struct cifsTconInfo *tl_tcon;
+ struct cifs_tcon *tl_tcon;
};
extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
-static inline struct cifsTconInfo *
+static inline struct cifs_tcon *
tlink_tcon(struct tcon_link *tlink)
{
return tlink->tl_tcon;
@@ -401,7 +928,16 @@ cifs_get_tlink(struct tcon_link *tlink)
}
/* This function is always expected to succeed */
-extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
+extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
+
+#define CIFS_OPLOCK_NO_CHANGE 0xfe
+
+struct cifs_pending_open {
+ struct list_head olist;
+ struct tcon_link *tlink;
+ __u8 lease_key[16];
+ __u32 oplock;
+};
/*
* This info hangs off the cifsFileInfo structure, pointed to by llist.
@@ -409,9 +945,12 @@ extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
*/
struct cifsLockInfo {
struct list_head llist; /* pointer to next cifsLockInfo */
+ struct list_head blist; /* pointer to locks blocked on this */
+ wait_queue_head_t block_q;
__u64 offset;
__u64 length;
- __u8 type;
+ __u32 pid;
+ __u32 type;
};
/*
@@ -425,7 +964,7 @@ struct cifs_search_info {
char *ntwrk_buf_start;
char *srch_entries_start;
char *last_entry;
- char *presume_name;
+ const char *presume_name;
unsigned int resume_name_len;
bool endOfSearch:1;
bool emptyDir:1;
@@ -433,19 +972,47 @@ struct cifs_search_info {
bool smallBuf:1; /* so we know which buf_release function to call */
};
+struct cifs_open_parms {
+ struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb;
+ int disposition;
+ int desired_access;
+ int create_options;
+ const char *path;
+ struct cifs_fid *fid;
+ bool reconnect:1;
+};
+
+struct cifs_fid {
+ __u16 netfid;
+#ifdef CONFIG_CIFS_SMB2
+ __u64 persistent_fid; /* persist file id for smb2 */
+ __u64 volatile_fid; /* volatile file id for smb2 */
+ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
+#endif
+ struct cifs_pending_open *pending_open;
+ unsigned int epoch;
+ bool purge_cache;
+};
+
+struct cifs_fid_locks {
+ struct list_head llist;
+ struct cifsFileInfo *cfile; /* fid that owns locks */
+ struct list_head locks; /* locks held by fid above */
+};
+
struct cifsFileInfo {
struct list_head tlist; /* pointer to next fid owned by tcon */
struct list_head flist; /* next fid (file instance) for this inode */
- unsigned int uid; /* allows finding which FileInfo structure */
+ struct cifs_fid_locks *llist; /* brlocks held by this fid */
+ kuid_t uid; /* allows finding which FileInfo structure */
__u32 pid; /* process id who opened file */
- __u16 netfid; /* file id from remote */
+ struct cifs_fid fid; /* file id from remote */
/* BB add lock scope info here if needed */ ;
/* lock scope id (0 if none) */
struct dentry *dentry;
unsigned int f_flags;
struct tcon_link *tlink;
- struct mutex lock_mutex;
- struct list_head llist; /* list of byte range locks we have. */
bool invalidHandle:1; /* file closed via session abend */
bool oplock_break_cancelled:1;
int count; /* refcount protected by cifs_file_list_lock */
@@ -454,34 +1021,115 @@ struct cifsFileInfo {
struct work_struct oplock_break; /* work for oplock breaks */
};
+struct cifs_io_parms {
+ __u16 netfid;
+#ifdef CONFIG_CIFS_SMB2
+ __u64 persistent_fid; /* persist file id for smb2 */
+ __u64 volatile_fid; /* volatile file id for smb2 */
+#endif
+ __u32 pid;
+ __u64 offset;
+ unsigned int length;
+ struct cifs_tcon *tcon;
+};
+
+struct cifs_readdata;
+
+/* asynchronous read support */
+struct cifs_readdata {
+ struct kref refcount;
+ struct list_head list;
+ struct completion done;
+ struct cifsFileInfo *cfile;
+ struct address_space *mapping;
+ __u64 offset;
+ unsigned int bytes;
+ pid_t pid;
+ int result;
+ struct work_struct work;
+ int (*read_into_pages)(struct TCP_Server_Info *server,
+ struct cifs_readdata *rdata,
+ unsigned int len);
+ struct kvec iov;
+ unsigned int pagesz;
+ unsigned int tailsz;
+ unsigned int nr_pages;
+ struct page *pages[];
+};
+
+struct cifs_writedata;
+
+/* asynchronous write support */
+struct cifs_writedata {
+ struct kref refcount;
+ struct list_head list;
+ struct completion done;
+ enum writeback_sync_modes sync_mode;
+ struct work_struct work;
+ struct cifsFileInfo *cfile;
+ __u64 offset;
+ pid_t pid;
+ unsigned int bytes;
+ int result;
+ unsigned int pagesz;
+ unsigned int tailsz;
+ unsigned int nr_pages;
+ struct page *pages[];
+};
+
/*
* Take a reference on the file private data. Must be called with
* cifs_file_list_lock held.
*/
-static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+static inline void
+cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
{
++cifs_file->count;
}
+struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
+#define CIFS_CACHE_READ_FLG 1
+#define CIFS_CACHE_HANDLE_FLG 2
+#define CIFS_CACHE_RH_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_HANDLE_FLG)
+#define CIFS_CACHE_WRITE_FLG 4
+#define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)
+#define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG)
+
+#define CIFS_CACHE_READ(cinode) (cinode->oplock & CIFS_CACHE_READ_FLG)
+#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG)
+#define CIFS_CACHE_WRITE(cinode) (cinode->oplock & CIFS_CACHE_WRITE_FLG)
+
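These flag tests take over from the old clientCanCacheRead/clientCanCacheAll booleans on cifsInodeInfo, and the extra HANDLE bit lets lease-style handle caching be expressed as well. A hedged example of the kind of check callers end up writing:

/* illustrative only: the macro form of the old clientCanCacheAll test */
static bool example_can_cache_writes(struct cifsInodeInfo *cinode)
{
	return CIFS_CACHE_WRITE(cinode) != 0;
}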
/*
* One of these for each file inode
*/
struct cifsInodeInfo {
- struct list_head lockList;
+ bool can_cache_brlcks;
+ struct list_head llist; /* locks held by this inode */
+ struct rw_semaphore lock_sem; /* protect the fields above */
/* BB add in lists for dirty pages i.e. write caching info for oplock */
struct list_head openFileList;
__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
- bool clientCanCacheRead; /* read oplock */
- bool clientCanCacheAll; /* read and writebehind oplock */
- bool delete_pending; /* DELETE_ON_CLOSE is set */
- bool invalid_mapping; /* pagecache is invalid */
+ unsigned int oplock; /* oplock/lease level we have */
+ unsigned int epoch; /* used to track lease state changes */
+#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */
+#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */
+#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */
+#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */
+#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */
+#define CIFS_INO_LOCK (5) /* lock bit for synchronization */
+ unsigned long flags;
+ spinlock_t writers_lock;
+ unsigned int writers; /* Number of writers on this inode */
unsigned long time; /* jiffies of last update of inode */
- u64 server_eof; /* current file size on server */
+ u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
u64 createtime; /* creation time on server */
+#ifdef CONFIG_CIFS_SMB2
+ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */
+#endif
#ifdef CONFIG_CIFS_FSCACHE
struct fscache_cookie *fscache;
#endif
@@ -508,10 +1156,25 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
return '\\';
}
+static inline void
+convert_delimiter(char *path, char delim)
+{
+ char old_delim, *pos;
+
+ if (delim == '/')
+ old_delim = '\\';
+ else
+ old_delim = '/';
+
+ pos = path;
+ while ((pos = strchr(pos, old_delim)))
+ *pos = delim;
+}
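convert_delimiter() rewrites a path in place, flipping every occurrence of the "other" separator to the requested one. A short usage sketch (the buffer below is illustrative):

/* illustrative only: normalize a server-style path for posix-style callers */
static void example_convert(void)
{
	char path[] = "\\srv\\share\\dir\\file";

	convert_delimiter(path, '/');	/* path becomes "/srv/share/dir/file" */
}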
+
#ifdef CONFIG_CIFS_STATS
#define cifs_stats_inc atomic_inc
-static inline void cifs_stats_bytes_written(struct cifsTconInfo *tcon,
+static inline void cifs_stats_bytes_written(struct cifs_tcon *tcon,
unsigned int bytes)
{
if (bytes) {
@@ -521,7 +1184,7 @@ static inline void cifs_stats_bytes_written(struct cifsTconInfo *tcon,
}
}
-static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
+static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
unsigned int bytes)
{
spin_lock(&tcon->stat_lock);
@@ -536,45 +1199,103 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
#endif
-struct mid_q_entry;
/*
- * This is the prototype for the mid callback function. When creating one,
- * take special care to avoid deadlocks. Things to bear in mind:
+ * This is the prototype for the mid receive function. This function is for
+ * receiving the rest of the SMB frame, starting with the WordCount (which is
+ * just after the MID in struct smb_hdr). Note:
+ *
+ * - This will be called by cifsd, with no locks held.
+ * - The mid will still be on the pending_mid_q.
+ * - mid->resp_buf will point to the current buffer.
+ *
+ * Returns zero on a successful receive, or an error. The receive state in
+ * the TCP_Server_Info will also be updated.
+ */
+typedef int (mid_receive_t)(struct TCP_Server_Info *server,
+ struct mid_q_entry *mid);
+
+/*
+ * This is the prototype for the mid callback function. This is called once the
+ * mid has been received off of the socket. When creating one, take special
+ * care to avoid deadlocks. Things to bear in mind:
*
- * - it will be called by cifsd
- * - the GlobalMid_Lock will be held
- * - the mid will be removed from the pending_mid_q list
+ * - it will be called by cifsd, with no locks held
+ * - the mid will be removed from any lists
*/
typedef void (mid_callback_t)(struct mid_q_entry *mid);
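The comments above spell out the locking rules for both hooks: they run in cifsd context with no locks held, and by the time the completion callback fires the mid is already off the pending lists. A minimal completion callback written to those rules, assuming the submitter stashed a struct completion in callback_data (illustrative only; the in-tree handlers in transport.c do more bookkeeping):

/* illustrative only: wake whoever queued this mid */
static void example_mid_callback(struct mid_q_entry *mid)
{
	struct completion *done = mid->callback_data;

	complete(done);		/* safe: no locks are held here */
}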
/* one of these for every pending CIFS request to the server */
struct mid_q_entry {
struct list_head qhead; /* mids waiting on reply from this server */
- __u16 mid; /* multiplex id */
- __u16 pid; /* process id */
+ struct TCP_Server_Info *server; /* server corresponding to this mid */
+ __u64 mid; /* multiplex id */
+ __u32 pid; /* process id */
__u32 sequence_number; /* for CIFS signing */
unsigned long when_alloc; /* when mid was created */
#ifdef CONFIG_CIFS_STATS2
unsigned long when_sent; /* time when smb send finished */
unsigned long when_received; /* when demux complete (taken off wire) */
#endif
+ mid_receive_t *receive; /* call receive callback */
mid_callback_t *callback; /* call completion callback */
void *callback_data; /* general purpose pointer for callback */
- struct smb_hdr *resp_buf; /* response buffer */
- int midState; /* wish this were enum but can not pass to wait_event */
- __u8 command; /* smb command code */
- bool largeBuf:1; /* if valid response, is pointer to large buf */
+ void *resp_buf; /* pointer to received SMB header */
+ int mid_state; /* wish this were enum but can not pass to wait_event */
+ __le16 command; /* smb command code */
+ bool large_buf:1; /* if valid response, is pointer to large buf */
bool multiRsp:1; /* multiple trans2 responses for one request */
bool multiEnd:1; /* both received */
};
-struct oplock_q_entry {
- struct list_head qhead;
- struct inode *pinode;
- struct cifsTconInfo *tcon;
- __u16 netfid;
-};
+/* Make code in transport.c a little cleaner by moving
+ update of optional stats into function below */
+#ifdef CONFIG_CIFS_STATS2
+
+static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
+{
+ atomic_inc(&server->in_send);
+}
+
+static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
+{
+ atomic_dec(&server->in_send);
+}
+
+static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
+{
+ atomic_inc(&server->num_waiters);
+}
+
+static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
+{
+ atomic_dec(&server->num_waiters);
+}
+
+static inline void cifs_save_when_sent(struct mid_q_entry *mid)
+{
+ mid->when_sent = jiffies;
+}
+#else
+static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
+{
+}
+static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_save_when_sent(struct mid_q_entry *mid)
+{
+}
+#endif
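Because these helpers compile away to empty inlines when CONFIG_CIFS_STATS2 is off, the send path can call them unconditionally. A hedged sketch of an instrumented send; the wrapper name is hypothetical, while smb_send() itself is declared in cifsproto.h:

/* illustrative only: keep transport code free of #ifdef CONFIG_CIFS_STATS2 */
static int example_send(struct TCP_Server_Info *server, struct mid_q_entry *mid,
			struct smb_hdr *buf, unsigned int len)
{
	int rc;

	cifs_in_send_inc(server);
	rc = smb_send(server, buf, len);
	cifs_in_send_dec(server);
	cifs_save_when_sent(mid);
	return rc;
}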
/* for pending dnotify requests */
struct dir_notify_req {
@@ -608,6 +1329,7 @@ struct dfs_info3_param {
#define CIFS_FATTR_DELETE_PENDING 0x2
#define CIFS_FATTR_NEED_REVAL 0x4
#define CIFS_FATTR_INO_COLLISION 0x8
+#define CIFS_FATTR_UNKNOWN_NLINK 0x10
struct cifs_fattr {
u32 cf_flags;
@@ -616,8 +1338,8 @@ struct cifs_fattr {
u64 cf_eof;
u64 cf_bytes;
u64 cf_createtime;
- uid_t cf_uid;
- gid_t cf_gid;
+ kuid_t cf_uid;
+ kgid_t cf_gid;
umode_t cf_mode;
dev_t cf_rdev;
unsigned int cf_nlink;
@@ -655,6 +1377,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
#define MID_RESPONSE_RECEIVED 4
#define MID_RETRY_NEEDED 8 /* session closed while this request out */
#define MID_RESPONSE_MALFORMED 0x10
+#define MID_SHUTDOWN 0x20
/* Types of response buffer returned from SendReceive2 */
#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
@@ -670,6 +1393,12 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */
#define CIFS_NO_RESP 0x040 /* no response buffer required */
+/* Type of request operation */
+#define CIFS_ECHO_OP 0x080 /* echo request */
+#define CIFS_OBREAK_OP 0x0100 /* oplock break request */
+#define CIFS_NEG_OP 0x0200 /* negotiate request */
+#define CIFS_OP_MASK 0x0380 /* mask request type */
+
/* Security Flags: indicate type of session setup needed */
#define CIFSSEC_MAY_SIGN 0x00001
#define CIFSSEC_MAY_NTLM 0x00002
@@ -701,6 +1430,8 @@ require use of the stronger protocol */
#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
#endif /* UPCALL */
#else /* do not allow weak pw hash */
+#define CIFSSEC_MUST_LANMAN 0
+#define CIFSSEC_MUST_PLNTXT 0
#ifdef CONFIG_CIFS_UPCALL
#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
#else
@@ -710,7 +1441,7 @@ require use of the stronger protocol */
#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
-#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2)
+#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
/*
@@ -780,10 +1511,12 @@ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
*/
GLOBAL_EXTERN spinlock_t cifs_file_list_lock;
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
/* Outstanding dir notify requests */
GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
/* DirNotify response queue */
GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;
+#endif /* was needed for dnotify; will be needed for inotify once the VFS fix lands */
/*
* Global transaction id (XID) information
@@ -812,12 +1545,7 @@ GLOBAL_EXTERN atomic_t smBufAllocCount;
GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
-GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
- to be established on existing mount if we
- have the uid/password or Kerberos credential
- or equivalent for current user */
-GLOBAL_EXTERN unsigned int oplockEnabled;
-GLOBAL_EXTERN unsigned int experimEnabled;
+GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
GLOBAL_EXTERN unsigned int lookupCacheEnabled;
GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
@@ -828,13 +1556,38 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
-/* reconnect after this many failed echo attempts */
-GLOBAL_EXTERN unsigned short echo_retries;
+#ifdef CONFIG_CIFS_ACL
+GLOBAL_EXTERN struct rb_root uidtree;
+GLOBAL_EXTERN struct rb_root gidtree;
+GLOBAL_EXTERN spinlock_t siduidlock;
+GLOBAL_EXTERN spinlock_t sidgidlock;
+GLOBAL_EXTERN struct rb_root siduidtree;
+GLOBAL_EXTERN struct rb_root sidgidtree;
+GLOBAL_EXTERN spinlock_t uidsidlock;
+GLOBAL_EXTERN spinlock_t gidsidlock;
+#endif /* CONFIG_CIFS_ACL */
void cifs_oplock_break(struct work_struct *work);
-void cifs_oplock_break_get(struct cifsFileInfo *cfile);
-void cifs_oplock_break_put(struct cifsFileInfo *cfile);
extern const struct slow_work_ops cifs_oplock_break_ops;
-
+extern struct workqueue_struct *cifsiod_wq;
+
+extern mempool_t *cifs_mid_poolp;
+
+/* Operations for different SMB versions */
+#define SMB1_VERSION_STRING "1.0"
+extern struct smb_version_operations smb1_operations;
+extern struct smb_version_values smb1_values;
+#define SMB20_VERSION_STRING "2.0"
+extern struct smb_version_operations smb20_operations;
+extern struct smb_version_values smb20_values;
+#define SMB21_VERSION_STRING "2.1"
+extern struct smb_version_operations smb21_operations;
+extern struct smb_version_values smb21_values;
+#define SMB30_VERSION_STRING "3.0"
+extern struct smb_version_operations smb30_operations;
+extern struct smb_version_values smb30_values;
+#define SMB302_VERSION_STRING "3.02"
+/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
+extern struct smb_version_values smb302_values;
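Each dialect now exports an operations/values pair, and a TCP_Server_Info carries pointers to whichever pair was negotiated (see the ops/vals fields added above). A sketch of how a vers= mount string could be mapped onto one of these pairs; the table and its name are hypothetical, not the parser the patch adds:

/* illustrative only: map a vers= string to a dialect's ops/vals pair */
struct example_dialect {
	const char *name;
	struct smb_version_operations *ops;
	struct smb_version_values *vals;
};

static const struct example_dialect example_dialects[] = {
	{ SMB1_VERSION_STRING,  &smb1_operations,  &smb1_values  },
	{ SMB20_VERSION_STRING, &smb20_operations, &smb20_values },
	{ SMB21_VERSION_STRING, &smb21_operations, &smb21_values },
	{ SMB30_VERSION_STRING, &smb30_operations, &smb30_values },
};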
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b5c8cc5d7a7..33df36ef9d5 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -142,6 +142,11 @@
*/
#define CIFS_SESS_KEY_SIZE (16)
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE (16)
+
#define CIFS_CLIENT_CHALLENGE_SIZE (8)
#define CIFS_SERVER_CHALLENGE_SIZE (8)
#define CIFS_HMAC_MD5_HASH_SIZE (16)
@@ -277,7 +282,6 @@
#define CIFS_NO_HANDLE 0xFFFF
#define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL
-#define NO_CHANGE_32 0xFFFFFFFFUL
/* IPC$ in ASCII */
#define CIFS_IPC_RESOURCE "\x49\x50\x43\x24"
@@ -397,9 +401,9 @@
#define GETU32(var) (*((__u32 *)var)) /* BB check for endian issues */
struct smb_hdr {
- __u32 smb_buf_length; /* big endian on wire *//* BB length is only two
- or three bytes - with one or two byte type preceding it that are
- zero - we could mask the type byte off just in case BB */
+ __be32 smb_buf_length; /* BB length is only two (rarely three) bytes,
+ with one or two byte "type" preceding it that will be
+ zero - we could mask the type byte off */
__u8 Protocol[4];
__u8 Command;
union {
@@ -424,47 +428,32 @@ struct smb_hdr {
__u16 Tid;
__le16 Pid;
__u16 Uid;
- __u16 Mid;
+ __le16 Mid;
__u8 WordCount;
} __attribute__((packed));
-/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
-#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
- (2 * (smb_var)->WordCount))
+/* given a pointer to an smb_hdr, retrieve a void pointer to the ByteCount */
+static inline void *
+BCC(struct smb_hdr *smb)
+{
+ return (void *)smb + sizeof(*smb) + 2 * smb->WordCount;
+}
/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
#define pByteArea(smb_var) (BCC(smb_var) + 2)
-/* get the converted ByteCount for a SMB packet and return it */
-static inline __u16
-get_bcc(struct smb_hdr *hdr)
-{
- __u16 *bc_ptr = (__u16 *)BCC(hdr);
-
- return get_unaligned(bc_ptr);
-}
-
/* get the unconverted ByteCount for a SMB packet and return it */
static inline __u16
-get_bcc_le(struct smb_hdr *hdr)
+get_bcc(struct smb_hdr *hdr)
{
__le16 *bc_ptr = (__le16 *)BCC(hdr);
return get_unaligned_le16(bc_ptr);
}
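With BCC() now a typed inline and get_bcc() reading the field as little endian, the total length of a received SMB1 frame can be derived from the header alone. A sketch of that arithmetic, roughly what smbCalcSize() computes:

/* illustrative only: header + WordCount words + ByteCount field + byte area */
static unsigned int example_frame_len(struct smb_hdr *hdr)
{
	return sizeof(struct smb_hdr) + 2 * hdr->WordCount +
	       2 /* the ByteCount field itself */ + get_bcc(hdr);
}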
-/* set the ByteCount for a SMB packet in host-byte order */
-static inline void
-put_bcc(__u16 count, struct smb_hdr *hdr)
-{
- __u16 *bc_ptr = (__u16 *)BCC(hdr);
-
- put_unaligned(count, bc_ptr);
-}
-
/* set the ByteCount for a SMB packet in little-endian */
static inline void
-put_bcc_le(__u16 count, struct smb_hdr *hdr)
+put_bcc(__u16 count, struct smb_hdr *hdr)
{
__le16 *bc_ptr = (__le16 *)BCC(hdr);
@@ -547,7 +536,7 @@ typedef struct lanman_neg_rsp {
#define READ_RAW_ENABLE 1
#define WRITE_RAW_ENABLE 2
#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
-
+#define SMB1_CLIENT_GUID_SIZE (16)
typedef struct negotiate_rsp {
struct smb_hdr hdr; /* wct = 17 */
__le16 DialectIndex; /* 0xFFFF = no dialect acceptable */
@@ -569,7 +558,7 @@ typedef struct negotiate_rsp {
/* followed by 16 bytes of server GUID */
/* then security blob if cap_extended_security negotiated */
struct {
- unsigned char GUID[16];
+ unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
unsigned char SecurityBlob[1];
} __attribute__((packed)) extended_response;
} __attribute__((packed)) u;
@@ -708,7 +697,13 @@ struct ntlmssp2_name {
} __attribute__((packed));
struct ntlmv2_resp {
- char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+ union {
+ char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+ struct {
+ __u8 reserved[8];
+ __u8 key[CIFS_SERVER_CHALLENGE_SIZE];
+ } __attribute__((packed)) challenge;
+ } __attribute__((packed));
__le32 blob_signature;
__u32 reserved;
__le64 time;
@@ -1104,9 +1099,7 @@ typedef struct smb_com_read_rsp {
__le16 DataLengthHigh;
__u64 Reserved2;
__u16 ByteCount;
- __u8 Pad; /* BB check for whether padded to DWORD
- boundary and optimum performance here */
- char Data[1];
+ /* read response data immediately follows */
} __attribute__((packed)) READ_RSP;
typedef struct locking_andx_range {
@@ -1333,6 +1326,14 @@ typedef struct smb_com_ntransact_rsp {
/* parms and data follow */
} __attribute__((packed)) NTRANSACT_RSP;
+/* See MS-SMB 2.2.7.2.1.1 */
+struct srv_copychunk {
+ __le64 SourceOffset;
+ __le64 DestinationOffset;
+ __le32 CopyLength;
+ __u32 Reserved;
+} __packed;
+
typedef struct smb_com_transaction_ioctl_req {
struct smb_hdr hdr; /* wct = 23 */
__u8 MaxSetupCount;
@@ -1357,6 +1358,35 @@ typedef struct smb_com_transaction_ioctl_req {
__u8 Data[1];
} __attribute__((packed)) TRANSACT_IOCTL_REQ;
+typedef struct smb_com_transaction_compr_ioctl_req {
+ struct smb_hdr hdr; /* wct = 23 */
+ __u8 MaxSetupCount;
+ __u16 Reserved;
+ __le32 TotalParameterCount;
+ __le32 TotalDataCount;
+ __le32 MaxParameterCount;
+ __le32 MaxDataCount;
+ __le32 ParameterCount;
+ __le32 ParameterOffset;
+ __le32 DataCount;
+ __le32 DataOffset;
+ __u8 SetupCount; /* four setup words follow subcommand */
+ /* SNIA spec incorrectly included spurious pad here */
+ __le16 SubCommand; /* 2 = IOCTL/FSCTL */
+ __le32 FunctionCode;
+ __u16 Fid;
+ __u8 IsFsctl; /* 1 = File System Control 0 = device control (IOCTL) */
+ __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */
+ __le16 ByteCount;
+ __u8 Pad[3];
+ __le16 compression_state; /* See below for valid flags */
+} __attribute__((packed)) TRANSACT_COMPR_IOCTL_REQ;
+
+/* compression state flags */
+#define COMPRESSION_FORMAT_NONE 0x0000
+#define COMPRESSION_FORMAT_DEFAULT 0x0001
+#define COMPRESSION_FORMAT_LZNT1 0x0002
+
typedef struct smb_com_transaction_ioctl_rsp {
struct smb_hdr hdr; /* wct = 19 */
__u8 Reserved[3];
@@ -1496,15 +1526,31 @@ struct file_notify_information {
__u8 FileName[0];
} __attribute__((packed));
-struct reparse_data {
- __u32 ReparseTag;
- __u16 ReparseDataLength;
+/* For IO_REPARSE_TAG_SYMLINK */
+struct reparse_symlink_data {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
__u16 Reserved;
- __u16 AltNameOffset;
- __u16 AltNameLen;
- __u16 TargetNameOffset;
- __u16 TargetNameLen;
- char LinkNamesBuf[1];
+ __le16 SubstituteNameOffset;
+ __le16 SubstituteNameLength;
+ __le16 PrintNameOffset;
+ __le16 PrintNameLength;
+ __le32 Flags;
+ char PathBuffer[0];
+} __attribute__((packed));
+
+/* For IO_REPARSE_TAG_NFS */
+#define NFS_SPECFILE_LNK 0x00000000014B4E4C
+#define NFS_SPECFILE_CHR 0x0000000000524843
+#define NFS_SPECFILE_BLK 0x00000000004B4C42
+#define NFS_SPECFILE_FIFO 0x000000004F464946
+#define NFS_SPECFILE_SOCK 0x000000004B434F53
+struct reparse_posix_data {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le64 InodeType; /* LNK, FIFO, CHR etc. */
+ char PathBuffer[0];
} __attribute__((packed));
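The NFS_SPECFILE_* values encode the special-file type as little-endian ASCII in the 64-bit InodeType field (0x524843 reads as "CHR", for example). A hedged sketch of dispatching on them; the helper is illustrative, not the patch's own parser:

/* illustrative only: map an NFS reparse InodeType to a POSIX file type */
static umode_t example_reparse_mode(const struct reparse_posix_data *buf)
{
	switch (le64_to_cpu(buf->InodeType)) {
	case NFS_SPECFILE_LNK:	return S_IFLNK;
	case NFS_SPECFILE_CHR:	return S_IFCHR;
	case NFS_SPECFILE_BLK:	return S_IFBLK;
	case NFS_SPECFILE_FIFO:	return S_IFIFO;
	case NFS_SPECFILE_SOCK:	return S_IFSOCK;
	default:		return S_IFREG;
	}
}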
struct cifs_quota_data {
@@ -1928,6 +1974,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */
/* SETFSInfo Levels */
#define SMB_SET_CIFS_UNIX_INFO 0x200
+/* level 0x203 is defined above in list of QFS info levels */
+/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */
+
+/* Level 0x200 request structure follows */
typedef struct smb_com_transaction2_setfsi_req {
struct smb_hdr hdr; /* wct = 15 */
__le16 TotalParameterCount;
@@ -1955,13 +2005,39 @@ typedef struct smb_com_transaction2_setfsi_req {
__le64 ClientUnixCap; /* Data end */
} __attribute__((packed)) TRANSACTION2_SETFSI_REQ;
+/* level 0x203 request structure follows */
+typedef struct smb_com_transaction2_setfs_enc_req {
+ struct smb_hdr hdr; /* wct = 15 */
+ __le16 TotalParameterCount;
+ __le16 TotalDataCount;
+ __le16 MaxParameterCount;
+ __le16 MaxDataCount;
+ __u8 MaxSetupCount;
+ __u8 Reserved;
+ __le16 Flags;
+ __le32 Timeout;
+ __u16 Reserved2;
+ __le16 ParameterCount; /* 4 */
+ __le16 ParameterOffset;
+ __le16 DataCount; /* 12 */
+ __le16 DataOffset;
+ __u8 SetupCount; /* one */
+ __u8 Reserved3;
+ __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */
+ __le16 ByteCount;
+ __u8 Pad;
+ __u16 Reserved4; /* Parameters start. */
+ __le16 InformationLevel;/* Parameters end. */
+ /* NTLMSSP Blob, Data start. */
+} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ;
+
+/* response for setfsinfo levels 0x200 and 0x203 */
typedef struct smb_com_transaction2_setfsi_rsp {
struct smb_hdr hdr; /* wct = 10 */
struct trans2_resp t2;
__u16 ByteCount;
} __attribute__((packed)) TRANSACTION2_SETFSI_RSP;
-
typedef struct smb_com_transaction2_get_dfs_refer_req {
struct smb_hdr hdr; /* wct = 15 */
__le16 TotalParameterCount;
@@ -2113,13 +2189,13 @@ typedef struct {
#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and
QFS PROXY call */
#ifdef CONFIG_CIFS_POSIX
-/* Can not set pathnames cap yet until we send new posix create SMB since
- otherwise server can treat such handles opened with older ntcreatex
- (by a new client which knows how to send posix path ops)
- as non-posix handles (can affect write behavior with byte range locks.
- We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */
+/* presumably we don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
+   LockingX instead of the posix locking call on a unix session (and we do
+   not expect LockingX to use different (ie Windows) semantics than posix
+   locking on the same session); if WINE needs this later, we can add the
+   cap back in */
/* #define CIFS_UNIX_CAP_MASK 0x000000fb */
-#define CIFS_UNIX_CAP_MASK 0x000000db
+#define CIFS_UNIX_CAP_MASK 0x000003db
#else
#define CIFS_UNIX_CAP_MASK 0x00000013
#endif /* CONFIG_CIFS_POSIX */
@@ -2174,6 +2250,9 @@ typedef struct {
__le32 DeviceCharacteristics;
} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */
+/* minimum includes first three fields, and empty FS Name */
+#define MIN_FS_ATTR_INFO_SIZE 12
+
typedef struct {
__le32 Attributes;
__le32 MaxPathNameComponentLength;
@@ -2197,7 +2276,7 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */
__u8 DeletePending;
__u8 Directory;
__u16 Pad2;
- __u64 IndexNumber;
+ __le64 IndexNumber;
__le32 EASize;
__le32 AccessFlags;
__u64 IndexNumber1;
@@ -2626,26 +2705,7 @@ typedef struct file_xattr_info {
} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info
level 0x205 */
-
-/* flags for chattr command */
-#define EXT_SECURE_DELETE 0x00000001 /* EXT3_SECRM_FL */
-#define EXT_ENABLE_UNDELETE 0x00000002 /* EXT3_UNRM_FL */
-/* Reserved for compress file 0x4 */
-#define EXT_SYNCHRONOUS 0x00000008 /* EXT3_SYNC_FL */
-#define EXT_IMMUTABLE_FL 0x00000010 /* EXT3_IMMUTABLE_FL */
-#define EXT_OPEN_APPEND_ONLY 0x00000020 /* EXT3_APPEND_FL */
-#define EXT_DO_NOT_BACKUP 0x00000040 /* EXT3_NODUMP_FL */
-#define EXT_NO_UPDATE_ATIME 0x00000080 /* EXT3_NOATIME_FL */
-/* 0x100 through 0x800 reserved for compression flags and are GET-ONLY */
-#define EXT_HASH_TREE_INDEXED_DIR 0x00001000 /* GET-ONLY EXT3_INDEX_FL */
-/* 0x2000 reserved for IMAGIC_FL */
-#define EXT_JOURNAL_THIS_FILE 0x00004000 /* GET-ONLY EXT3_JOURNAL_DATA_FL */
-/* 0x8000 reserved for EXT3_NOTAIL_FL */
-#define EXT_SYNCHRONOUS_DIR 0x00010000 /* EXT3_DIRSYNC_FL */
-#define EXT_TOPDIR 0x00020000 /* EXT3_TOPDIR_FL */
-
-#define EXT_SET_MASK 0x000300FF
-#define EXT_GET_MASK 0x0003DFFF
+/* flags for the lsattr and chflags commands were removed; they now live in uapi/linux/fs.h */
typedef struct file_chattr_info {
__le64 mask; /* list of all possible attribute bits */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8096f27ad9a..ca7980a1e30 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -24,6 +24,7 @@
struct statfs;
struct smb_vol;
+struct smb_rqst;
/*
*****************************************************************
@@ -35,27 +36,33 @@ extern struct smb_hdr *cifs_buf_get(void);
extern void cifs_buf_release(void *);
extern struct smb_hdr *cifs_small_buf_get(void);
extern void cifs_small_buf_release(void *);
+extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
+ struct kvec *iov);
extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
unsigned int /* length */);
-extern unsigned int _GetXid(void);
-extern void _FreeXid(unsigned int);
-#define GetXid() \
+extern unsigned int _get_xid(void);
+extern void _free_xid(unsigned int);
+#define get_xid() \
({ \
- int __xid = (int)_GetXid(); \
- cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d", \
- __func__, __xid, current_fsuid()); \
+ unsigned int __xid = _get_xid(); \
+ cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \
+ __func__, __xid, \
+ from_kuid(&init_user_ns, current_fsuid())); \
__xid; \
})
-#define FreeXid(curr_xid) \
+#define free_xid(curr_xid) \
do { \
- _FreeXid(curr_xid); \
- cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \
- __func__, curr_xid, (int)rc); \
+ _free_xid(curr_xid); \
+ cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \
+ __func__, curr_xid, (int)rc); \
} while (0)
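get_xid()/free_xid() replace the old GetXid()/FreeXid() pair; note that the free_xid() macro logs a local variable named rc, so callers need one in scope. A typical bracketing, sketched with one of the calls declared further down in this header (illustrative only):

/* illustrative only: xid bracketing around a single SMB call */
static int example_statfs(struct cifs_tcon *tcon, struct kstatfs *buf)
{
	unsigned int xid = get_xid();
	int rc;

	rc = CIFSSMBQFSInfo(xid, tcon, buf);
	free_xid(xid);		/* logs the local 'rc' */
	return rc;
}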
+extern int init_cifs_idmap(void);
+extern void exit_cifs_idmap(void);
extern char *build_path_from_dentry(struct dentry *);
-extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
- struct cifsTconInfo *tcon);
+extern char *cifs_build_path_to_root(struct smb_vol *vol,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon);
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
extern char *cifs_compose_mount_options(const char *sb_mountdata,
const char *fullpath, const struct dfs_info3_param *ref,
@@ -64,372 +71,440 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
struct TCP_Server_Info *server);
extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
+extern void cifs_delete_mid(struct mid_q_entry *mid);
+extern void cifs_wake_up_task(struct mid_q_entry *mid);
extern int cifs_call_async(struct TCP_Server_Info *server,
- struct smb_hdr *in_buf, mid_callback_t *callback,
- void *cbdata);
-extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
+ struct smb_rqst *rqst,
+ mid_receive_t *receive, mid_callback_t *callback,
+ void *cbdata, const int flags);
+extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
struct smb_hdr * /* input */ ,
struct smb_hdr * /* out */ ,
- int * /* bytes returned */ , const int long_op);
-extern int SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
- struct smb_hdr *in_buf, int flags);
-extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
+ int * /* bytes returned */ , const int);
+extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
+ char *in_buf, int flags);
+extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *,
+ struct smb_rqst *);
+extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
+ struct smb_rqst *);
+extern int cifs_check_receive(struct mid_q_entry *mid,
+ struct TCP_Server_Info *server, bool log_error);
+extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
int * /* type of buf returned */ , const int flags);
extern int SendReceiveBlockingLock(const unsigned int xid,
- struct cifsTconInfo *ptcon,
+ struct cifs_tcon *ptcon,
struct smb_hdr *in_buf ,
struct smb_hdr *out_buf,
int *bytes_returned);
-extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
-extern bool is_valid_oplock_break(struct smb_hdr *smb,
- struct TCP_Server_Info *);
+extern int cifs_reconnect(struct TCP_Server_Info *server);
+extern int checkSMB(char *buf, unsigned int length);
+extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
+extern bool backup_cred(struct cifs_sb_info *);
extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
-extern unsigned int smbCalcSize(struct smb_hdr *ptr);
-extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
+extern unsigned int smbCalcSize(void *buf);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
-extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
-extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
- const unsigned short int port);
-extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
+extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port);
+extern int map_smb_to_linux_error(char *buf, bool logErr);
extern void header_assemble(struct smb_hdr *, char /* command */ ,
- const struct cifsTconInfo *, int /* length of
+ const struct cifs_tcon *, int /* length of
fixed section (word count) in two byte units */);
extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
- struct cifsSesInfo *ses,
+ struct cifs_ses *ses,
void **request_buf);
-extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
- const struct nls_table *nls_cp);
-extern __u16 GetNextMid(struct TCP_Server_Info *server);
+extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
+ enum securityEnum requested);
+extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_cp);
extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
int offset);
extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
+extern int cifs_get_writer(struct cifsInodeInfo *cinode);
+extern void cifs_put_writer(struct cifsInodeInfo *cinode);
+extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode);
+extern int cifs_unlock_range(struct cifsFileInfo *cfile,
+ struct file_lock *flock, const unsigned int xid);
+extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
-extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
- struct file *file, struct tcon_link *tlink,
- __u32 oplock);
-extern int cifs_posix_open(char *full_path, struct inode **pinode,
- struct super_block *sb,
- int mode, unsigned int f_flags,
- __u32 *poplock, __u16 *pnetfid, int xid);
+extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
+ struct file *file,
+ struct tcon_link *tlink,
+ __u32 oplock);
+extern int cifs_posix_open(char *full_path, struct inode **inode,
+ struct super_block *sb, int mode,
+ unsigned int f_flags, __u32 *oplock, __u16 *netfid,
+ unsigned int xid);
void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
FILE_UNIX_BASIC_INFO *info,
struct cifs_sb_info *cifs_sb);
+extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *,
+ struct cifs_sb_info *);
extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
extern struct inode *cifs_iget(struct super_block *sb,
struct cifs_fattr *fattr);
-extern int cifs_get_file_info(struct file *filp);
-extern int cifs_get_inode_info(struct inode **pinode,
- const unsigned char *search_path,
- FILE_ALL_INFO *pfile_info,
- struct super_block *sb, int xid, const __u16 *pfid);
-extern int cifs_get_file_info_unix(struct file *filp);
+extern int cifs_get_inode_info(struct inode **inode, const char *full_path,
+ FILE_ALL_INFO *data, struct super_block *sb,
+ int xid, const struct cifs_fid *fid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *search_path,
- struct super_block *sb, int xid);
+ struct super_block *sb, unsigned int xid);
+extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs,
+ unsigned int xid, char *full_path, __u32 dosattr);
+extern int cifs_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid);
extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr, struct inode *inode,
- const char *path, const __u16 *pfid);
-extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
+ const char *path, const struct cifs_fid *pfid);
+extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
+ kuid_t, kgid_t);
extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *);
+extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
+ const struct cifs_fid *, u32 *);
+extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
+ const char *, int);
+
+extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
+extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+ unsigned int to_read);
+extern int cifs_readv_from_socket(struct TCP_Server_Info *server,
+ struct kvec *iov_orig, unsigned int nr_segs,
+ unsigned int to_read);
+extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+ struct cifs_sb_info *cifs_sb);
+extern int cifs_match_super(struct super_block *, void *);
+extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info);
+extern struct smb_vol *cifs_get_volume_info(char *mount_data,
+ const char *devname);
+extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
+extern void cifs_umount(struct cifs_sb_info *);
+extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
+extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
+ __u64 length, __u8 type,
+ struct cifsLockInfo **conf_lock,
+ int rw_check);
+extern void cifs_add_pending_open(struct cifs_fid *fid,
+ struct tcon_link *tlink,
+ struct cifs_pending_open *open);
+extern void cifs_add_pending_open_locked(struct cifs_fid *fid,
+ struct tcon_link *tlink,
+ struct cifs_pending_open *open);
+extern void cifs_del_pending_open(struct cifs_pending_open *open);
-extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
- const char *);
-extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
+#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)
extern void cifs_dfs_release_automount_timer(void);
+#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+#define cifs_dfs_release_automount_timer() do { } while (0)
+#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+
void cifs_proc_init(void);
void cifs_proc_clean(void);
-extern int cifs_negotiate_protocol(unsigned int xid,
- struct cifsSesInfo *ses);
-extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
- struct nls_table *nls_info);
-extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
+extern void cifs_move_llist(struct list_head *source, struct list_head *dest);
+extern void cifs_free_llist(struct list_head *llist);
+extern void cifs_del_lock_waiters(struct cifsLockInfo *lock);
-extern int CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
- const char *tree, struct cifsTconInfo *tcon,
- const struct nls_table *);
+extern int cifs_negotiate_protocol(const unsigned int xid,
+ struct cifs_ses *ses);
+extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
+ struct nls_table *nls_info);
+extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required);
+extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses);
-extern int CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
- const char *searchName, const struct nls_table *nls_codepage,
- __u16 *searchHandle, struct cifs_search_info *psrch_inf,
- int map, const char dirsep);
+extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
+ const char *tree, struct cifs_tcon *tcon,
+ const struct nls_table *);
-extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
- __u16 searchHandle, struct cifs_search_info *psrch_inf);
+extern int CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *searchName, struct cifs_sb_info *cifs_sb,
+ __u16 *searchHandle, __u16 search_flags,
+ struct cifs_search_info *psrch_inf,
+ bool msearch);
-extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
+extern int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
+ __u16 searchHandle, __u16 search_flags,
+ struct cifs_search_info *psrch_inf);
+
+extern int CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon,
const __u16 search_handle);
-extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBQFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
u16 netfid, FILE_ALL_INFO *pFindData);
-extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
- const unsigned char *searchName,
- FILE_ALL_INFO *findData,
- int legacy /* whether to use old info level */,
- const struct nls_table *nls_codepage, int remap);
-extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
- const unsigned char *searchName,
- FILE_ALL_INFO *findData,
- const struct nls_table *nls_codepage, int remap);
+extern int CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *search_Name, FILE_ALL_INFO *data,
+ int legacy /* whether to use old info level */,
+ const struct nls_table *nls_codepage, int remap);
+extern int SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *search_name, FILE_ALL_INFO *data,
+ const struct nls_table *nls_codepage, int remap);
-extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBUnixQFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
-extern int CIFSSMBUnixQPathInfo(const int xid,
- struct cifsTconInfo *tcon,
+extern int CIFSSMBUnixQPathInfo(const unsigned int xid,
+ struct cifs_tcon *tcon,
const unsigned char *searchName,
FILE_UNIX_BASIC_INFO *pFindData,
const struct nls_table *nls_codepage, int remap);
-extern int CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
- const unsigned char *searchName,
- struct dfs_info3_param **target_nodes,
- unsigned int *number_of_nodes_in_array,
- const struct nls_table *nls_codepage, int remap);
+extern int CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
+ const char *search_name,
+ struct dfs_info3_param **target_nodes,
+ unsigned int *num_of_nodes,
+ const struct nls_table *nls_codepage, int remap);
-extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
+extern int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
const char *old_path,
const struct nls_table *nls_codepage,
- unsigned int *pnum_referrals,
- struct dfs_info3_param **preferrals,
- int remap);
-extern void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
- struct super_block *sb, struct smb_vol *vol);
-extern int CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon,
+ unsigned int *num_referrals,
+ struct dfs_info3_param **referrals, int remap);
+extern void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ struct smb_vol *vol);
+extern int CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
struct kstatfs *FSData);
-extern int SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon,
+extern int SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
struct kstatfs *FSData);
-extern int CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon,
__u64 cap);
-extern int CIFSSMBQFSAttributeInfo(const int xid,
- struct cifsTconInfo *tcon);
-extern int CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon);
-extern int CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon);
-extern int CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBQFSAttributeInfo(const unsigned int xid,
+ struct cifs_tcon *tcon);
+extern int CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon);
+extern int CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon);
+extern int CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon,
struct kstatfs *FSData);
-extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const FILE_BASIC_INFO *data,
const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
const FILE_BASIC_INFO *data, __u16 fid,
__u32 pid_of_opener);
-extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
- bool delete_file, __u16 fid, __u32 pid_of_opener);
+extern int CIFSSMBSetFileDisposition(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ bool delete_file, __u16 fid,
+ __u32 pid_of_opener);
#if 0
-extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon,
char *fileName, __u16 dos_attributes,
const struct nls_table *nls_codepage);
#endif /* possibly unneeded function */
-extern int CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon,
- const char *fileName, __u64 size,
- bool setAllocationSizeFlag,
- const struct nls_table *nls_codepage,
- int remap_special_chars);
-extern int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon,
- __u64 size, __u16 fileHandle, __u32 opener_pid,
- bool AllocSizeFlag);
+extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *file_name, __u64 size,
+ struct cifs_sb_info *cifs_sb, bool set_allocation);
+extern int CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, __u64 size,
+ bool set_allocation);
struct cifs_unix_set_info_args {
__u64 ctime;
__u64 atime;
__u64 mtime;
__u64 mode;
- __u64 uid;
- __u64 gid;
+ kuid_t uid;
+ kgid_t gid;
dev_t device;
};
-extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBUnixSetFileInfo(const unsigned int xid,
+ struct cifs_tcon *tcon,
const struct cifs_unix_set_info_args *args,
u16 fid, u32 pid_of_opener);
-extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *pTcon,
- char *fileName,
- const struct cifs_unix_set_info_args *args,
- const struct nls_table *nls_codepage,
- int remap_special_chars);
+extern int CIFSSMBUnixSetPathInfo(const unsigned int xid,
+ struct cifs_tcon *tcon, const char *file_name,
+ const struct cifs_unix_set_info_args *args,
+ const struct nls_table *nls_codepage,
+ int remap);
-extern int CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
- const char *newName,
- const struct nls_table *nls_codepage,
- int remap_special_chars);
-extern int CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon,
- const char *name, const struct nls_table *nls_codepage,
- int remap_special_chars);
-extern int CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *name, struct cifs_sb_info *cifs_sb);
+extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *name, struct cifs_sb_info *cifs_sb);
+extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
const char *name, __u16 type,
const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int CIFSSMBDelFile(const int xid, struct cifsTconInfo *tcon,
- const char *name,
- const struct nls_table *nls_codepage,
- int remap_special_chars);
-extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
- const char *fromName, const char *toName,
- const struct nls_table *nls_codepage,
- int remap_special_chars);
-extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
- int netfid, const char *target_name,
- const struct nls_table *nls_codepage,
- int remap_special_chars);
-extern int CIFSCreateHardLink(const int xid,
- struct cifsTconInfo *tcon,
- const char *fromName, const char *toName,
- const struct nls_table *nls_codepage,
- int remap_special_chars);
-extern int CIFSUnixCreateHardLink(const int xid,
- struct cifsTconInfo *tcon,
+extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *name, struct cifs_sb_info *cifs_sb);
+extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
+extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon,
+ int netfid, const char *target_name,
+ const struct nls_table *nls_codepage,
+ int remap_special_chars);
+extern int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
+extern int CIFSUnixCreateHardLink(const unsigned int xid,
+ struct cifs_tcon *tcon,
const char *fromName, const char *toName,
const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int CIFSUnixCreateSymLink(const int xid,
- struct cifsTconInfo *tcon,
+extern int CIFSUnixCreateSymLink(const unsigned int xid,
+ struct cifs_tcon *tcon,
const char *fromName, const char *toName,
const struct nls_table *nls_codepage);
-extern int CIFSSMBUnixQuerySymLink(const int xid,
- struct cifsTconInfo *tcon,
+extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
+ struct cifs_tcon *tcon,
const unsigned char *searchName, char **syminfo,
const struct nls_table *nls_codepage);
-extern int CIFSSMBQueryReparseLinkInfo(const int xid,
- struct cifsTconInfo *tcon,
- const unsigned char *searchName,
- char *symlinkinfo, const int buflen, __u16 fid,
- const struct nls_table *nls_codepage);
-
-extern int CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
+ __u16 fid, char **symlinkinfo,
+ const struct nls_table *nls_codepage);
+extern int CIFSSMB_set_compression(const unsigned int xid,
+ struct cifs_tcon *tcon, __u16 fid);
+extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms,
+ int *oplock, FILE_ALL_INFO *buf);
+extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const int disposition,
const int access_flags, const int omode,
__u16 *netfid, int *pOplock, FILE_ALL_INFO *,
const struct nls_table *nls_codepage, int remap);
-extern int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
- const char *fileName, const int disposition,
- const int access_flags, const int omode,
- __u16 *netfid, int *pOplock, FILE_ALL_INFO *,
- const struct nls_table *nls_codepage, int remap);
-extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon,
u32 posix_flags, __u64 mode, __u16 *netfid,
FILE_UNIX_BASIC_INFO *pRetData,
__u32 *pOplock, const char *name,
const struct nls_table *nls_codepage, int remap);
-extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon,
const int smb_file_id);
-extern int CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon,
const int smb_file_id);
-extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
- const int netfid, unsigned int count,
- const __u64 lseek, unsigned int *nbytes, char **buf,
+extern int CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, char **buf,
int *return_buf_type);
-extern int CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
- const int netfid, const unsigned int count,
- const __u64 lseek, unsigned int *nbytes,
- const char *buf, const char __user *ubuf,
- const int long_op);
-extern int CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
- const int netfid, const unsigned int count,
- const __u64 offset, unsigned int *nbytes,
- struct kvec *iov, const int nvec, const int long_op);
-extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
- const unsigned char *searchName, __u64 *inode_number,
- const struct nls_table *nls_codepage,
- int remap_special_chars);
-extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
- const struct nls_table *cp, int mapChars);
+extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, const char *buf,
+ const char __user *ubuf, const int long_op);
+extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, struct kvec *iov, const int nvec);
+extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *search_name, __u64 *inode_number,
+ const struct nls_table *nls_codepage,
+ int remap);
-extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
- const __u16 netfid, const __u64 len,
+extern int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u16 netfid, const __u8 lock_type,
+ const __u32 num_unlock, const __u32 num_lock,
+ LOCKING_ANDX_RANGE *buf);
+extern int CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u16 netfid, const __u32 netpid, const __u64 len,
const __u64 offset, const __u32 numUnlock,
const __u32 numLock, const __u8 lockType,
const bool waitFlag, const __u8 oplock_level);
-extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
- const __u16 smb_file_id, const int get_flag,
- const __u64 len, struct file_lock *,
- const __u16 lock_type, const bool waitFlag);
-extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
+extern int CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u16 smb_file_id, const __u32 netpid,
+ const loff_t start_offset, const __u64 len,
+ struct file_lock *, const __u16 lock_type,
+ const bool waitFlag);
+extern int CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon);
extern int CIFSSMBEcho(struct TCP_Server_Info *server);
-extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
+extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses);
-extern struct cifsSesInfo *sesInfoAlloc(void);
-extern void sesInfoFree(struct cifsSesInfo *);
-extern struct cifsTconInfo *tconInfoAlloc(void);
-extern void tconInfoFree(struct cifsTconInfo *);
+extern struct cifs_ses *sesInfoAlloc(void);
+extern void sesInfoFree(struct cifs_ses *);
+extern struct cifs_tcon *tconInfoAlloc(void);
+extern void tconInfoFree(struct cifs_tcon *);
-extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
-extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
+extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+ __u32 *pexpected_response_sequence_number);
+extern int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
__u32 *);
-extern int cifs_verify_signature(struct smb_hdr *,
+extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
+extern int cifs_verify_signature(struct smb_rqst *rqst,
struct TCP_Server_Info *server,
__u32 expected_sequence_number);
-extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
-extern int setup_ntlm_response(struct cifsSesInfo *);
-extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
-extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
+extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
+ const struct nls_table *);
+extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
+extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
-extern int calc_seckey(struct cifsSesInfo *);
+extern int calc_seckey(struct cifs_ses *);
+extern int generate_smb3signingkey(struct cifs_ses *);
#ifdef CONFIG_CIFS_WEAK_PW_HASH
-extern void calc_lanman_hash(const char *password, const char *cryptkey,
+extern int calc_lanman_hash(const char *password, const char *cryptkey,
bool encrypt, char *lnm_session_key);
#endif /* CIFS_WEAK_PW_HASH */
-extern int CIFSSMBCopy(int xid,
- struct cifsTconInfo *source_tcon,
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
+extern int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
+ const int notify_subdirs, const __u16 netfid,
+ __u32 filter, struct file *file, int multishot,
+ const struct nls_table *nls_codepage);
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
+extern int CIFSSMBCopy(unsigned int xid,
+ struct cifs_tcon *source_tcon,
const char *fromName,
const __u16 target_tid,
const char *toName, const int flags,
const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
- const int notify_subdirs, const __u16 netfid,
- __u32 filter, struct file *file, int multishot,
- const struct nls_table *nls_codepage);
-extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
+extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName,
const unsigned char *ea_name, char *EAData,
size_t bufsize, const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const char *ea_name,
const void *ea_value, const __u16 ea_value_len,
const struct nls_table *nls_codepage, int remap_special_chars);
-extern int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
-extern int CIFSSMBSetCIFSACL(const int, struct cifsTconInfo *, __u16,
- struct cifs_ntsd *, __u32);
-extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16,
+ struct cifs_ntsd *, __u32, int);
+extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName,
char *acl_inf, const int buflen, const int acl_type,
const struct nls_table *nls_codepage, int remap_special_chars);
-extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *fileName,
const char *local_acl, const int buflen, const int acl_type,
const struct nls_table *nls_codepage, int remap_special_chars);
-extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
-extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
-extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
- const unsigned char *path,
- struct cifs_sb_info *cifs_sb, int xid);
+extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr);
+extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_fattr *fattr,
+ const unsigned char *path);
extern int mdfour(unsigned char *, unsigned char *, int);
-extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
-extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
- unsigned char *p24);
-extern void E_P16(unsigned char *p14, unsigned char *p16);
-extern void E_P24(unsigned char *p21, const unsigned char *c8,
+extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
+ const struct nls_table *codepage);
+extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
unsigned char *p24);
+
+void cifs_readdata_release(struct kref *refcount);
+int cifs_async_readv(struct cifs_readdata *rdata);
+int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
+
+int cifs_async_writev(struct cifs_writedata *wdata,
+ void (*release)(struct kref *kref));
+void cifs_writev_complete(struct work_struct *work);
+struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
+ work_func_t complete);
+void cifs_writedata_release(struct kref *refcount);
+int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_read);
+int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_written);
#endif /* _CIFSPROTO_H */
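
The cifsproto.h hunks above rework many of the path-based helpers so that they take an unsigned xid and a struct cifs_sb_info * directly, instead of a separate nls_table pointer and remap flag. A minimal caller sketch follows, assuming only the CIFSSMBMkDir()/CIFSSMBRmDir() prototypes shown above and the cifs_dbg() helper used throughout this patch; the wrapper function and its name are illustrative and not part of the patch:

static int example_mkdir_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
			       const char *full_path,
			       struct cifs_sb_info *cifs_sb)
{
	int rc;

	/* helpers now derive local_nls and the remap flag from cifs_sb */
	rc = CIFSSMBMkDir(xid, tcon, full_path, cifs_sb);
	if (rc) {
		cifs_dbg(FYI, "mkdir failed rc = %d\n", rc);
		return rc;
	}
	return CIFSSMBRmDir(xid, tcon, full_path, cifs_sb);
}
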
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 904aa47e351..6ce4e0954b9 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -32,6 +32,9 @@
#include <linux/vfs.h>
#include <linux/slab.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/task_io_accounting_ops.h>
#include <asm/uaccess.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -39,6 +42,7 @@
#include "cifsproto.h"
#include "cifs_unicode.h"
#include "cifs_debug.h"
+#include "fscache.h"
#ifdef CONFIG_CIFS_POSIX
static struct {
@@ -82,32 +86,37 @@ static struct {
#endif /* CONFIG_CIFS_WEAK_PW_HASH */
#endif /* CIFS_POSIX */
-/* Mark as invalid, all open files on tree connections since they
- were closed when session to server was lost */
-static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
+/*
+ * Mark as invalid, all open files on tree connections since they
+ * were closed when session to server was lost.
+ */
+void
+cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
{
struct cifsFileInfo *open_file = NULL;
struct list_head *tmp;
struct list_head *tmp1;
-/* list all files open on tree connection and mark them invalid */
+ /* list all files open on tree connection and mark them invalid */
spin_lock(&cifs_file_list_lock);
- list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
+ list_for_each_safe(tmp, tmp1, &tcon->openFileList) {
open_file = list_entry(tmp, struct cifsFileInfo, tlist);
open_file->invalidHandle = true;
open_file->oplock_break_cancelled = true;
}
spin_unlock(&cifs_file_list_lock);
- /* BB Add call to invalidate_inodes(sb) for all superblocks mounted
- to this tcon */
+ /*
+ * BB Add call to invalidate_inodes(sb) for all superblocks mounted
+ * to this tcon.
+ */
}
/* reconnect the socket, tcon, and smb session if needed */
static int
-cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
+cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
{
- int rc = 0;
- struct cifsSesInfo *ses;
+ int rc;
+ struct cifs_ses *ses;
struct TCP_Server_Info *server;
struct nls_table *nls_codepage;
@@ -130,8 +139,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
if (smb_command != SMB_COM_WRITE_ANDX &&
smb_command != SMB_COM_OPEN_ANDX &&
smb_command != SMB_COM_TREE_DISCONNECT) {
- cFYI(1, "can not send cmd %d while umounting",
- smb_command);
+ cifs_dbg(FYI, "can not send cmd %d while umounting\n",
+ smb_command);
return -ENODEV;
}
}
@@ -142,9 +151,9 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
*/
while (server->tcpStatus == CifsNeedReconnect) {
wait_event_interruptible_timeout(server->response_q,
- (server->tcpStatus == CifsGood), 10 * HZ);
+ (server->tcpStatus != CifsNeedReconnect), 10 * HZ);
- /* is TCP session is reestablished now ?*/
+ /* are we still trying to reconnect? */
if (server->tcpStatus != CifsNeedReconnect)
break;
@@ -154,7 +163,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
* back on-line
*/
if (!tcon->retry) {
- cFYI(1, "gave up waiting on reconnect in smb_init");
+ cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
return -EHOSTDOWN;
}
}
@@ -179,10 +188,10 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
goto out;
}
- mark_open_files_invalid(tcon);
+ cifs_mark_open_files_invalid(tcon);
rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
mutex_unlock(&ses->session_mutex);
- cFYI(1, "reconnect tcon rc = %d", rc);
+ cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc)
goto out;
@@ -226,7 +235,7 @@ out:
SMB information in the SMB header. If the return code is zero, this
function must have filled in request_buf pointer */
static int
-small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+small_smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
void **request_buf)
{
int rc;
@@ -252,7 +261,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
int
small_smb_init_no_tc(const int smb_command, const int wct,
- struct cifsSesInfo *ses, void **request_buf)
+ struct cifs_ses *ses, void **request_buf)
{
int rc;
struct smb_hdr *buffer;
@@ -262,7 +271,7 @@ small_smb_init_no_tc(const int smb_command, const int wct,
return rc;
buffer = (struct smb_hdr *)*request_buf;
- buffer->Mid = GetNextMid(ses->server);
+ buffer->Mid = get_next_mid(ses->server);
if (ses->capabilities & CAP_UNICODE)
buffer->Flags2 |= SMBFLG2_UNICODE;
if (ses->capabilities & CAP_STATUS32)
@@ -278,7 +287,7 @@ small_smb_init_no_tc(const int smb_command, const int wct,
/* If the return code is zero, this function must fill in request_buf pointer */
static int
-__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+__smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
void **request_buf, void **response_buf)
{
*request_buf = cifs_buf_get();
@@ -304,7 +313,7 @@ __smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
/* If the return code is zero, this function must fill in request_buf pointer */
static int
-smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+smb_init(int smb_command, int wct, struct cifs_tcon *tcon,
void **request_buf, void **response_buf)
{
int rc;
@@ -317,7 +326,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
}
static int
-smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
+smb_init_no_reconnect(int smb_command, int wct, struct cifs_tcon *tcon,
void **request_buf, void **response_buf)
{
if (tcon->ses->need_reconnect || tcon->need_reconnect)
@@ -339,12 +348,13 @@ static int validate_t2(struct smb_t2_rsp *pSMB)
get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
goto vt2_err;
- /* check that bcc is at least as big as parms + data */
- /* check that bcc is less than negotiated smb buffer */
total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
if (total_size >= 512)
goto vt2_err;
+ /* check that bcc is at least as big as parms + data, and that it is
+ * less than negotiated smb buffer
+ */
total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
if (total_size > get_bcc(&pSMB->hdr) ||
total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
@@ -357,49 +367,210 @@ vt2_err:
return -EINVAL;
}
+static int
+decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
+{
+ int rc = 0;
+ u16 count;
+ char *guid = pSMBr->u.extended_response.GUID;
+ struct TCP_Server_Info *server = ses->server;
+
+ count = get_bcc(&pSMBr->hdr);
+ if (count < SMB1_CLIENT_GUID_SIZE)
+ return -EIO;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ if (server->srv_count > 1) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) {
+ cifs_dbg(FYI, "server UID changed\n");
+ memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+ }
+ } else {
+ spin_unlock(&cifs_tcp_ses_lock);
+ memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+ }
+
+ if (count == SMB1_CLIENT_GUID_SIZE) {
+ server->sec_ntlmssp = true;
+ } else {
+ count -= SMB1_CLIENT_GUID_SIZE;
+ rc = decode_negTokenInit(
+ pSMBr->u.extended_response.SecurityBlob, count, server);
+ if (rc != 1)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int
+cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
+{
+ bool srv_sign_required = server->sec_mode & server->vals->signing_required;
+ bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
+ bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN;
+
+ /*
+ * Is signing required by mnt options? If not then check
+ * global_secflags to see if it is there.
+ */
+ if (!mnt_sign_required)
+ mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
+ CIFSSEC_MUST_SIGN);
+
+ /*
+ * If signing is required then it's automatically enabled too,
+ * otherwise, check to see if the secflags allow it.
+ */
+ mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
+ (global_secflags & CIFSSEC_MAY_SIGN);
+
+ /* If server requires signing, does client allow it? */
+ if (srv_sign_required) {
+ if (!mnt_sign_enabled) {
+ cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!");
+ return -ENOTSUPP;
+ }
+ server->sign = true;
+ }
+
+ /* If client requires signing, does server allow it? */
+ if (mnt_sign_required) {
+ if (!srv_sign_enabled) {
+ cifs_dbg(VFS, "Server does not support signing!");
+ return -ENOTSUPP;
+ }
+ server->sign = true;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+static int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+ __s16 tmp;
+ struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
+
+ if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
+ return -EOPNOTSUPP;
+
+ server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+ server->maxReq = min_t(unsigned int,
+ le16_to_cpu(rsp->MaxMpxCount),
+ cifs_max_pending);
+ set_credits(server, server->maxReq);
+ server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
+ /* even though we do not use raw we might as well set this
+ accurately, in case we ever find a need for it */
+ if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
+ server->max_rw = 0xFF00;
+ server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
+ } else {
+ server->max_rw = 0;/* do not need to use raw anyway */
+ server->capabilities = CAP_MPX_MODE;
+ }
+ tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+ if (tmp == -1) {
+ /* OS/2 often does not set timezone therefore
+ * we must use server time to calc time zone.
+ * Could deviate slightly from the right zone.
+ * Smallest defined timezone difference is 15 minutes
+ * (i.e. Nepal). Rounding up/down is done to match
+ * this requirement.
+ */
+ int val, seconds, remain, result;
+ struct timespec ts, utc;
+ utc = CURRENT_TIME;
+ ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+ rsp->SrvTime.Time, 0);
+ cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
+ (int)ts.tv_sec, (int)utc.tv_sec,
+ (int)(utc.tv_sec - ts.tv_sec));
+ val = (int)(utc.tv_sec - ts.tv_sec);
+ seconds = abs(val);
+ result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+ remain = seconds % MIN_TZ_ADJ;
+ if (remain >= (MIN_TZ_ADJ / 2))
+ result += MIN_TZ_ADJ;
+ if (val < 0)
+ result = -result;
+ server->timeAdj = result;
+ } else {
+ server->timeAdj = (int)tmp;
+ server->timeAdj *= 60; /* also in seconds */
+ }
+ cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
+
+
+ /* BB get server time for time conversions and add
+ code to use it and timezone since this is not UTC */
+
+ if (rsp->EncryptionKeyLength ==
+ cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+ memcpy(server->cryptkey, rsp->EncryptionKey,
+ CIFS_CRYPTO_KEY_SIZE);
+ } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
+ return -EIO; /* need cryptkey unless plain text */
+ }
+
+ cifs_dbg(FYI, "LANMAN negotiated\n");
+ return 0;
+}
+#else
+static inline int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+ cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
+ return -EOPNOTSUPP;
+}
+#endif
+
+static bool
+should_set_ext_sec_flag(enum securityEnum sectype)
+{
+ switch (sectype) {
+ case RawNTLMSSP:
+ case Kerberos:
+ return true;
+ case Unspecified:
+ if (global_secflags &
+ (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP))
+ return true;
+ /* Fallthrough */
+ default:
+ return false;
+ }
+}
+
int
-CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
+CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
{
NEGOTIATE_REQ *pSMB;
NEGOTIATE_RSP *pSMBr;
int rc = 0;
int bytes_returned;
int i;
- struct TCP_Server_Info *server;
+ struct TCP_Server_Info *server = ses->server;
u16 count;
- unsigned int secFlags;
- if (ses->server)
- server = ses->server;
- else {
- rc = -EIO;
- return rc;
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
}
+
rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ ,
(void **) &pSMB, (void **) &pSMBr);
if (rc)
return rc;
- /* if any of auth flags (ie not sign or seal) are overriden use them */
- if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */
- else /* if override flags set only sign/seal OR them with global auth */
- secFlags = global_secflags | ses->overrideSecFlg;
-
- cFYI(1, "secFlags 0x%x", secFlags);
-
- pSMB->hdr.Mid = GetNextMid(server);
+ pSMB->hdr.Mid = get_next_mid(server);
pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
- if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
- pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
- else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
- cFYI(1, "Kerberos only mechanism, enable extended security");
- pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
- } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
- pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
- else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
- cFYI(1, "NTLMSSP only mechanism, enable extended security");
+ if (should_set_ext_sec_flag(ses->sectype)) {
+ cifs_dbg(FYI, "Requesting extended security.");
pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
}
@@ -409,7 +580,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
count += strlen(protocols[i].name) + 1;
/* null at end of source and target buffers anyway */
}
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
@@ -418,7 +589,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
goto neg_err_exit;
server->dialect = le16_to_cpu(pSMBr->DialectIndex);
- cFYI(1, "Dialect: %d", server->dialect);
+ cifs_dbg(FYI, "Dialect: %d\n", server->dialect);
/* Check wct = 1 error case */
if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
/* core returns wct = 1, but we do not ask for core - otherwise
@@ -426,248 +597,67 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
could not negotiate a common dialect */
rc = -EOPNOTSUPP;
goto neg_err_exit;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- } else if ((pSMBr->hdr.WordCount == 13)
- && ((server->dialect == LANMAN_PROT)
- || (server->dialect == LANMAN2_PROT))) {
- __s16 tmp;
- struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
-
- if ((secFlags & CIFSSEC_MAY_LANMAN) ||
- (secFlags & CIFSSEC_MAY_PLNTXT))
- server->secType = LANMAN;
- else {
- cERROR(1, "mount failed weak security disabled"
- " in /proc/fs/cifs/SecurityFlags");
- rc = -EOPNOTSUPP;
- goto neg_err_exit;
- }
- server->secMode = (__u8)le16_to_cpu(rsp->SecurityMode);
- server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
- server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
- (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
- server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
- /* even though we do not use raw we might as well set this
- accurately, in case we ever find a need for it */
- if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
- server->max_rw = 0xFF00;
- server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
- } else {
- server->max_rw = 0;/* do not need to use raw anyway */
- server->capabilities = CAP_MPX_MODE;
- }
- tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
- if (tmp == -1) {
- /* OS/2 often does not set timezone therefore
- * we must use server time to calc time zone.
- * Could deviate slightly from the right zone.
- * Smallest defined timezone difference is 15 minutes
- * (i.e. Nepal). Rounding up/down is done to match
- * this requirement.
- */
- int val, seconds, remain, result;
- struct timespec ts, utc;
- utc = CURRENT_TIME;
- ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
- rsp->SrvTime.Time, 0);
- cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
- (int)ts.tv_sec, (int)utc.tv_sec,
- (int)(utc.tv_sec - ts.tv_sec));
- val = (int)(utc.tv_sec - ts.tv_sec);
- seconds = abs(val);
- result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
- remain = seconds % MIN_TZ_ADJ;
- if (remain >= (MIN_TZ_ADJ / 2))
- result += MIN_TZ_ADJ;
- if (val < 0)
- result = -result;
- server->timeAdj = result;
- } else {
- server->timeAdj = (int)tmp;
- server->timeAdj *= 60; /* also in seconds */
- }
- cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
-
-
- /* BB get server time for time conversions and add
- code to use it and timezone since this is not UTC */
-
- if (rsp->EncryptionKeyLength ==
- cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
- memcpy(ses->server->cryptkey, rsp->EncryptionKey,
- CIFS_CRYPTO_KEY_SIZE);
- } else if (server->secMode & SECMODE_PW_ENCRYPT) {
- rc = -EIO; /* need cryptkey unless plain text */
- goto neg_err_exit;
- }
-
- cFYI(1, "LANMAN negotiated");
- /* we will not end up setting signing flags - as no signing
- was in LANMAN and server did not return the flags on */
- goto signing_check;
-#else /* weak security disabled */
} else if (pSMBr->hdr.WordCount == 13) {
- cERROR(1, "mount failed, cifs module not built "
- "with CIFS_WEAK_PW_HASH support");
- rc = -EOPNOTSUPP;
-#endif /* WEAK_PW_HASH */
- goto neg_err_exit;
+ server->negflavor = CIFS_NEGFLAVOR_LANMAN;
+ rc = decode_lanman_negprot_rsp(server, pSMBr);
+ goto signing_check;
} else if (pSMBr->hdr.WordCount != 17) {
/* unknown wct */
rc = -EOPNOTSUPP;
goto neg_err_exit;
}
- /* else wct == 17 NTLM */
- server->secMode = pSMBr->SecurityMode;
- if ((server->secMode & SECMODE_USER) == 0)
- cFYI(1, "share mode security");
+ /* else wct == 17, NTLM or better */
- if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
-#endif /* CIFS_WEAK_PW_HASH */
- cERROR(1, "Server requests plain text password"
- " but client support disabled");
-
- if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
- server->secType = NTLMv2;
- else if (secFlags & CIFSSEC_MAY_NTLM)
- server->secType = NTLM;
- else if (secFlags & CIFSSEC_MAY_NTLMV2)
- server->secType = NTLMv2;
- else if (secFlags & CIFSSEC_MAY_KRB5)
- server->secType = Kerberos;
- else if (secFlags & CIFSSEC_MAY_NTLMSSP)
- server->secType = RawNTLMSSP;
- else if (secFlags & CIFSSEC_MAY_LANMAN)
- server->secType = LANMAN;
-/* #ifdef CONFIG_CIFS_EXPERIMENTAL
- else if (secFlags & CIFSSEC_MAY_PLNTXT)
- server->secType = ??
-#endif */
- else {
- rc = -EOPNOTSUPP;
- cERROR(1, "Invalid security type");
- goto neg_err_exit;
- }
- /* else ... any others ...? */
+ server->sec_mode = pSMBr->SecurityMode;
+ if ((server->sec_mode & SECMODE_USER) == 0)
+ cifs_dbg(FYI, "share mode security\n");
/* one byte, so no need to convert this or EncryptionKeyLen from
little endian */
- server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
+ server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
+ cifs_max_pending);
+ set_credits(server, server->maxReq);
/* probably no need to store and check maxvcs */
- server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
- (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+ server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
- cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
+ cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
server->timeAdj *= 60;
+
if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+ server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
CIFS_CRYPTO_KEY_SIZE);
- } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
- && (pSMBr->EncryptionKeyLength == 0)) {
- /* decode security blob */
- } else if (server->secMode & SECMODE_PW_ENCRYPT) {
+ } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
+ server->capabilities & CAP_EXTENDED_SECURITY) &&
+ (pSMBr->EncryptionKeyLength == 0)) {
+ server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
+ rc = decode_ext_sec_blob(ses, pSMBr);
+ } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
rc = -EIO; /* no crypt key only if plain text pwd */
- goto neg_err_exit;
- }
-
- /* BB might be helpful to save off the domain of server here */
-
- if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) &&
- (server->capabilities & CAP_EXTENDED_SECURITY)) {
- count = pSMBr->ByteCount;
- if (count < 16) {
- rc = -EIO;
- goto neg_err_exit;
- }
- spin_lock(&cifs_tcp_ses_lock);
- if (server->srv_count > 1) {
- spin_unlock(&cifs_tcp_ses_lock);
- if (memcmp(server->server_GUID,
- pSMBr->u.extended_response.
- GUID, 16) != 0) {
- cFYI(1, "server UID changed");
- memcpy(server->server_GUID,
- pSMBr->u.extended_response.GUID,
- 16);
- }
- } else {
- spin_unlock(&cifs_tcp_ses_lock);
- memcpy(server->server_GUID,
- pSMBr->u.extended_response.GUID, 16);
- }
-
- if (count == 16) {
- server->secType = RawNTLMSSP;
- } else {
- rc = decode_negTokenInit(pSMBr->u.extended_response.
- SecurityBlob, count - 16,
- server);
- if (rc == 1)
- rc = 0;
- else
- rc = -EINVAL;
- if (server->secType == Kerberos) {
- if (!server->sec_kerberos &&
- !server->sec_mskerberos)
- rc = -EOPNOTSUPP;
- } else if (server->secType == RawNTLMSSP) {
- if (!server->sec_ntlmssp)
- rc = -EOPNOTSUPP;
- } else
- rc = -EOPNOTSUPP;
- }
- } else
- server->capabilities &= ~CAP_EXTENDED_SECURITY;
-
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-signing_check:
-#endif
- if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
- /* MUST_SIGN already includes the MAY_SIGN FLAG
- so if this is zero it means that signing is disabled */
- cFYI(1, "Signing disabled");
- if (server->secMode & SECMODE_SIGN_REQUIRED) {
- cERROR(1, "Server requires "
- "packet signing to be enabled in "
- "/proc/fs/cifs/SecurityFlags.");
- rc = -EOPNOTSUPP;
- }
- server->secMode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
- } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
- /* signing required */
- cFYI(1, "Must sign - secFlags 0x%x", secFlags);
- if ((server->secMode &
- (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
- cERROR(1, "signing required but server lacks support");
- rc = -EOPNOTSUPP;
- } else
- server->secMode |= SECMODE_SIGN_REQUIRED;
} else {
- /* signing optional ie CIFSSEC_MAY_SIGN */
- if ((server->secMode & SECMODE_SIGN_REQUIRED) == 0)
- server->secMode &=
- ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+ server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
+ server->capabilities &= ~CAP_EXTENDED_SECURITY;
}
+signing_check:
+ if (!rc)
+ rc = cifs_enable_signing(server, ses->sign);
neg_err_exit:
cifs_buf_release(pSMB);
- cFYI(1, "negprot rc %d", rc);
+ cifs_dbg(FYI, "negprot rc %d\n", rc);
return rc;
}
int
-CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
+CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon)
{
struct smb_hdr *smb_buffer;
int rc = 0;
- cFYI(1, "In tree disconnect");
+ cifs_dbg(FYI, "In tree disconnect\n");
/* BB: do we need to check this? These should never be NULL. */
if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -687,9 +677,9 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
if (rc)
return rc;
- rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0);
if (rc)
- cFYI(1, "Tree disconnect failed %d", rc);
+ cifs_dbg(FYI, "Tree disconnect failed %d\n", rc);
/* No need to return error on this operation if tid invalidated and
closed on server already e.g. due to tcp session crashing */
@@ -712,8 +702,7 @@ cifs_echo_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = mid->callback_data;
DeleteMidQEntry(mid);
- atomic_dec(&server->inFlight);
- wake_up(&server->request_q);
+ add_credits(server, 1, CIFS_ECHO_OP);
}
int
@@ -721,25 +710,30 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
{
ECHO_REQ *smb;
int rc = 0;
+ struct kvec iov;
+ struct smb_rqst rqst = { .rq_iov = &iov,
+ .rq_nvec = 1 };
- cFYI(1, "In echo request");
+ cifs_dbg(FYI, "In echo request\n");
rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
if (rc)
return rc;
/* set up echo request */
- smb->hdr.Tid = cpu_to_le16(0xffff);
+ smb->hdr.Tid = 0xffff;
smb->hdr.WordCount = 1;
put_unaligned_le16(1, &smb->EchoCount);
- put_bcc_le(1, &smb->hdr);
+ put_bcc(1, &smb->hdr);
smb->Data[0] = 'a';
- smb->hdr.smb_buf_length += 3;
+ inc_rfc1001_len(smb, 3);
+ iov.iov_base = smb;
+ iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
- rc = cifs_call_async(server, (struct smb_hdr *)smb,
- cifs_echo_callback, server);
+ rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback,
+ server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
if (rc)
- cFYI(1, "Echo request failed: %d", rc);
+ cifs_dbg(FYI, "Echo request failed: %d\n", rc);
cifs_small_buf_release(smb);
@@ -747,12 +741,12 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
}
int
-CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
+CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
{
LOGOFF_ANDX_REQ *pSMB;
int rc = 0;
- cFYI(1, "In SMBLogoff for session disconnect");
+ cifs_dbg(FYI, "In SMBLogoff for session disconnect\n");
/*
* BB: do we need to check validity of ses and server? They should
@@ -772,16 +766,15 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
return rc;
}
- pSMB->hdr.Mid = GetNextMid(ses->server);
+ pSMB->hdr.Mid = get_next_mid(ses->server);
- if (ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+ if (ses->server->sign)
+ pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
pSMB->hdr.Uid = ses->Suid;
pSMB->AndXCommand = 0xFF;
- rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, ses, (char *) pSMB, 0);
session_already_dead:
mutex_unlock(&ses->session_mutex);
@@ -794,8 +787,9 @@ session_already_dead:
}
int
-CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
- __u16 type, const struct nls_table *nls_codepage, int remap)
+CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *fileName, __u16 type,
+ const struct nls_table *nls_codepage, int remap)
{
TRANSACTION2_SPI_REQ *pSMB = NULL;
TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -805,7 +799,7 @@ CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
int bytes_returned = 0;
__u16 params, param_offset, offset, byte_count;
- cFYI(1, "In POSIX delete");
+ cifs_dbg(FYI, "In POSIX delete\n");
PsxDelete:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -814,8 +808,8 @@ PsxDelete:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB add path length overrun check */
@@ -852,15 +846,15 @@ PsxDelete:
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_UNLINK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "Posix delete returned %d", rc);
+ cifs_dbg(FYI, "Posix delete returned %d\n", rc);
cifs_buf_release(pSMB);
- cifs_stats_inc(&tcon->num_deletes);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes);
if (rc == -EAGAIN)
goto PsxDelete;
@@ -869,14 +863,15 @@ PsxDelete:
}
int
-CIFSSMBDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
- const struct nls_table *nls_codepage, int remap)
+CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+ struct cifs_sb_info *cifs_sb)
{
DELETE_FILE_REQ *pSMB = NULL;
DELETE_FILE_RSP *pSMBr = NULL;
int rc = 0;
int bytes_returned;
int name_len;
+ int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
DelFileRetry:
rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB,
@@ -885,26 +880,26 @@ DelFileRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len =
- cifsConvertToUCS((__le16 *) pSMB->fileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->fileName, name,
+ PATH_MAX, cifs_sb->local_nls,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
- name_len = strnlen(fileName, PATH_MAX);
+ name_len = strnlen(name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->fileName, fileName, name_len);
+ strncpy(pSMB->fileName, name, name_len);
}
pSMB->SearchAttributes =
cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM);
pSMB->BufferFormat = 0x04;
- pSMB->hdr.smb_buf_length += name_len + 1;
+ inc_rfc1001_len(pSMB, name_len + 1);
pSMB->ByteCount = cpu_to_le16(name_len + 1);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_deletes);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes);
if (rc)
- cFYI(1, "Error in RMFile = %d", rc);
+ cifs_dbg(FYI, "Error in RMFile = %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -914,16 +909,17 @@ DelFileRetry:
}
int
-CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
- const struct nls_table *nls_codepage, int remap)
+CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+ struct cifs_sb_info *cifs_sb)
{
DELETE_DIRECTORY_REQ *pSMB = NULL;
DELETE_DIRECTORY_RSP *pSMBr = NULL;
int rc = 0;
int bytes_returned;
int name_len;
+ int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
- cFYI(1, "In CIFSSMBRmDir");
+ cifs_dbg(FYI, "In CIFSSMBRmDir\n");
RmDirRetry:
rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -931,24 +927,25 @@ RmDirRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+ PATH_MAX, cifs_sb->local_nls,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
- name_len = strnlen(dirName, PATH_MAX);
+ name_len = strnlen(name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->DirName, dirName, name_len);
+ strncpy(pSMB->DirName, name, name_len);
}
pSMB->BufferFormat = 0x04;
- pSMB->hdr.smb_buf_length += name_len + 1;
+ inc_rfc1001_len(pSMB, name_len + 1);
pSMB->ByteCount = cpu_to_le16(name_len + 1);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_rmdirs);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_rmdirs);
if (rc)
- cFYI(1, "Error in RMDir = %d", rc);
+ cifs_dbg(FYI, "Error in RMDir = %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -957,16 +954,17 @@ RmDirRetry:
}
int
-CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
- const char *name, const struct nls_table *nls_codepage, int remap)
+CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+ struct cifs_sb_info *cifs_sb)
{
int rc = 0;
CREATE_DIRECTORY_REQ *pSMB = NULL;
CREATE_DIRECTORY_RSP *pSMBr = NULL;
int bytes_returned;
int name_len;
+ int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
- cFYI(1, "In CIFSSMBMkDir");
+ cifs_dbg(FYI, "In CIFSSMBMkDir\n");
MkDirRetry:
rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -974,8 +972,9 @@ MkDirRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+ PATH_MAX, cifs_sb->local_nls,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -985,13 +984,13 @@ MkDirRetry:
}
pSMB->BufferFormat = 0x04;
- pSMB->hdr.smb_buf_length += name_len + 1;
+ inc_rfc1001_len(pSMB, name_len + 1);
pSMB->ByteCount = cpu_to_le16(name_len + 1);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_mkdirs);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_mkdirs);
if (rc)
- cFYI(1, "Error in Mkdir = %d", rc);
+ cifs_dbg(FYI, "Error in Mkdir = %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -1000,10 +999,11 @@ MkDirRetry:
}
int
-CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
- __u64 mode, __u16 *netfid, FILE_UNIX_BASIC_INFO *pRetData,
- __u32 *pOplock, const char *name,
- const struct nls_table *nls_codepage, int remap)
+CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon,
+ __u32 posix_flags, __u64 mode, __u16 *netfid,
+ FILE_UNIX_BASIC_INFO *pRetData, __u32 *pOplock,
+ const char *name, const struct nls_table *nls_codepage,
+ int remap)
{
TRANSACTION2_SPI_REQ *pSMB = NULL;
TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -1014,7 +1014,7 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
OPEN_PSX_REQ *pdata;
OPEN_PSX_RSP *psx_rsp;
- cFYI(1, "In POSIX Create");
+ cifs_dbg(FYI, "In POSIX Create\n");
PsxCreat:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -1023,8 +1023,8 @@ PsxCreat:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, name,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, name,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -1063,19 +1063,19 @@ PsxCreat:
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Posix create returned %d", rc);
+ cifs_dbg(FYI, "Posix create returned %d\n", rc);
goto psx_create_err;
}
- cFYI(1, "copying inode info");
+ cifs_dbg(FYI, "copying inode info\n");
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
+ if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) {
rc = -EIO; /* bad smb */
goto psx_create_err;
}
@@ -1094,11 +1094,11 @@ PsxCreat:
/* check to make sure response data is there */
if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
pRetData->Type = cpu_to_le32(-1); /* unknown */
- cFYI(DBG2, "unknown type");
+ cifs_dbg(NOISY, "unknown type\n");
} else {
- if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
+ if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)
+ sizeof(FILE_UNIX_BASIC_INFO)) {
- cERROR(1, "Open response data too small");
+ cifs_dbg(VFS, "Open response data too small\n");
pRetData->Type = cpu_to_le32(-1);
goto psx_create_err;
}
@@ -1111,9 +1111,9 @@ psx_create_err:
cifs_buf_release(pSMB);
if (posix_flags & SMB_O_DIRECTORY)
- cifs_stats_inc(&tcon->num_posixmkdirs);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_posixmkdirs);
else
- cifs_stats_inc(&tcon->num_posixopens);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_posixopens);
if (rc == -EAGAIN)
goto PsxCreat;
@@ -1145,7 +1145,7 @@ static __u16 convert_disposition(int disposition)
ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
break;
default:
- cFYI(1, "unknown disposition %d", disposition);
+ cifs_dbg(FYI, "unknown disposition %d\n", disposition);
ofun = SMBOPEN_OAPPEND; /* regular open */
}
return ofun;
@@ -1166,7 +1166,7 @@ access_flags_to_smbopen_mode(const int access_flags)
}
int
-SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
+SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const int openDisposition,
const int access_flags, const int create_options, __u16 *netfid,
int *pOplock, FILE_ALL_INFO *pfile_info,
@@ -1190,8 +1190,8 @@ OldOpenRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
count = 1; /* account for one byte pad to word boundary */
name_len =
- cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
- fileName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+ fileName, PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -1228,15 +1228,15 @@ OldOpenRetry:
pSMB->Sattr = cpu_to_le16(ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY);
pSMB->OpenFunction = cpu_to_le16(convert_disposition(openDisposition));
count += name_len;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
/* long_op set to 1 to allow for oplock break timeouts */
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *)pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_opens);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
if (rc) {
- cFYI(1, "Error in Open = %d", rc);
+ cifs_dbg(FYI, "Error in Open = %d\n", rc);
} else {
/* BB verify if wct == 15 */
@@ -1273,111 +1273,377 @@ OldOpenRetry:
}
int
-CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
- const char *fileName, const int openDisposition,
- const int access_flags, const int create_options, __u16 *netfid,
- int *pOplock, FILE_ALL_INFO *pfile_info,
- const struct nls_table *nls_codepage, int remap)
+CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock,
+ FILE_ALL_INFO *buf)
{
int rc = -EACCES;
- OPEN_REQ *pSMB = NULL;
- OPEN_RSP *pSMBr = NULL;
+ OPEN_REQ *req = NULL;
+ OPEN_RSP *rsp = NULL;
int bytes_returned;
int name_len;
__u16 count;
+ struct cifs_sb_info *cifs_sb = oparms->cifs_sb;
+ struct cifs_tcon *tcon = oparms->tcon;
+ int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ const struct nls_table *nls = cifs_sb->local_nls;
+ int create_options = oparms->create_options;
+ int desired_access = oparms->desired_access;
+ int disposition = oparms->disposition;
+ const char *path = oparms->path;
openRetry:
- rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req,
+ (void **)&rsp);
if (rc)
return rc;
- pSMB->AndXCommand = 0xFF; /* none */
+ /* no commands go after this */
+ req->AndXCommand = 0xFF;
- if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- count = 1; /* account for one byte pad to word boundary */
- name_len =
- cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
- fileName, PATH_MAX, nls_codepage, remap);
- name_len++; /* trailing null */
+ if (req->hdr.Flags2 & SMBFLG2_UNICODE) {
+ /* account for one byte pad to word boundary */
+ count = 1;
+ name_len = cifsConvertToUTF16((__le16 *)(req->fileName + 1),
+ path, PATH_MAX, nls, remap);
+ /* trailing null */
+ name_len++;
name_len *= 2;
- pSMB->NameLength = cpu_to_le16(name_len);
- } else { /* BB improve check for buffer overruns BB */
- count = 0; /* no pad */
- name_len = strnlen(fileName, PATH_MAX);
- name_len++; /* trailing null */
- pSMB->NameLength = cpu_to_le16(name_len);
- strncpy(pSMB->fileName, fileName, name_len);
+ req->NameLength = cpu_to_le16(name_len);
+ } else {
+ /* BB improve check for buffer overruns BB */
+ /* no pad */
+ count = 0;
+ name_len = strnlen(path, PATH_MAX);
+ /* trailing null */
+ name_len++;
+ req->NameLength = cpu_to_le16(name_len);
+ strncpy(req->fileName, path, name_len);
}
- if (*pOplock & REQ_OPLOCK)
- pSMB->OpenFlags = cpu_to_le32(REQ_OPLOCK);
- else if (*pOplock & REQ_BATCHOPLOCK)
- pSMB->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK);
- pSMB->DesiredAccess = cpu_to_le32(access_flags);
- pSMB->AllocationSize = 0;
- /* set file as system file if special file such
- as fifo and server expecting SFU style and
- no Unix extensions */
+
+ if (*oplock & REQ_OPLOCK)
+ req->OpenFlags = cpu_to_le32(REQ_OPLOCK);
+ else if (*oplock & REQ_BATCHOPLOCK)
+ req->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK);
+
+ req->DesiredAccess = cpu_to_le32(desired_access);
+ req->AllocationSize = 0;
+
+ /*
+ * Set file as system file if special file such as fifo and server
+ * expecting SFU style and no Unix extensions.
+ */
if (create_options & CREATE_OPTION_SPECIAL)
- pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
+ req->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
else
- pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL);
+ req->FileAttributes = cpu_to_le32(ATTR_NORMAL);
- /* XP does not handle ATTR_POSIX_SEMANTICS */
- /* but it helps speed up case sensitive checks for other
- servers such as Samba */
+ /*
+ * XP does not handle ATTR_POSIX_SEMANTICS but it helps speed up case
+ * sensitive checks for other servers such as Samba.
+ */
if (tcon->ses->capabilities & CAP_UNIX)
- pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS);
+ req->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS);
if (create_options & CREATE_OPTION_READONLY)
- pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY);
+ req->FileAttributes |= cpu_to_le32(ATTR_READONLY);
+
+ req->ShareAccess = cpu_to_le32(FILE_SHARE_ALL);
+ req->CreateDisposition = cpu_to_le32(disposition);
+ req->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK);
- pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL);
- pSMB->CreateDisposition = cpu_to_le32(openDisposition);
- pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK);
/* BB Experiment with various impersonation levels and verify */
- pSMB->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION);
- pSMB->SecurityFlags =
- SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY;
+ req->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION);
+ req->SecurityFlags = SECURITY_CONTEXT_TRACKING|SECURITY_EFFECTIVE_ONLY;
count += name_len;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(req, count);
- pSMB->ByteCount = cpu_to_le16(count);
- /* long_op set to 1 to allow for oplock break timeouts */
- rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *)pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_opens);
+ req->ByteCount = cpu_to_le16(count);
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)req,
+ (struct smb_hdr *)rsp, &bytes_returned, 0);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
if (rc) {
- cFYI(1, "Error in Open = %d", rc);
- } else {
- *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
- *netfid = pSMBr->Fid; /* cifs fid stays in le */
- /* Let caller know file was created so we can set the mode. */
- /* Do we care about the CreateAction in any other cases? */
- if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction)
- *pOplock |= CIFS_CREATE_ACTION;
- if (pfile_info) {
- memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime,
- 36 /* CreationTime to Attributes */);
- /* the file_info buf is endian converted by caller */
- pfile_info->AllocationSize = pSMBr->AllocationSize;
- pfile_info->EndOfFile = pSMBr->EndOfFile;
- pfile_info->NumberOfLinks = cpu_to_le32(1);
- pfile_info->DeletePending = 0;
+ cifs_dbg(FYI, "Error in Open = %d\n", rc);
+ cifs_buf_release(req);
+ if (rc == -EAGAIN)
+ goto openRetry;
+ return rc;
+ }
+
+ /* 1 byte no need to le_to_cpu */
+ *oplock = rsp->OplockLevel;
+ /* cifs fid stays in le */
+ oparms->fid->netfid = rsp->Fid;
+
+ /* Let caller know file was created so we can set the mode. */
+ /* Do we care about the CreateAction in any other cases? */
+ if (cpu_to_le32(FILE_CREATE) == rsp->CreateAction)
+ *oplock |= CIFS_CREATE_ACTION;
+
+ if (buf) {
+ /* copy from CreationTime to Attributes */
+ memcpy((char *)buf, (char *)&rsp->CreationTime, 36);
+ /* the file_info buf is endian converted by caller */
+ buf->AllocationSize = rsp->AllocationSize;
+ buf->EndOfFile = rsp->EndOfFile;
+ buf->NumberOfLinks = cpu_to_le32(1);
+ buf->DeletePending = 0;
+ }
+
+ cifs_buf_release(req);
+ return rc;
+}
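The open path above always releases the request buffer and, on -EAGAIN, rebuilds the request from scratch by jumping back to openRetry. Below is a minimal user-space sketch of that release-then-retry pattern; alloc_req(), send_open() and release_buf() are hypothetical stand-ins, not CIFS helpers.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int attempts;

/* Hypothetical stand-ins for smb_init()/SendReceive()/cifs_buf_release(). */
static void *alloc_req(void)		{ return malloc(64); }
static void release_buf(void *req)	{ free(req); }
static int send_open(void *req)
{
	(void)req;
	return (++attempts < 3) ? -EAGAIN : 0;	/* fail twice, then succeed */
}

int main(void)
{
	int rc;
	void *req;

retry:
	req = alloc_req();		/* fresh buffer on every attempt */
	if (!req)
		return 1;
	rc = send_open(req);
	release_buf(req);		/* release before deciding to retry */
	if (rc == -EAGAIN)
		goto retry;		/* transient failure: rebuild and resend */
	printf("open finished after %d attempts, rc=%d\n", attempts, rc);
	return rc ? 1 : 0;
}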
+
+/*
+ * Discard any remaining data in the current SMB. To do this, we borrow the
+ * current bigbuf.
+ */
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ unsigned int rfclen = get_rfc1002_length(server->smallbuf);
+ int remaining = rfclen + 4 - server->total_read;
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ while (remaining > 0) {
+ int length;
+
+ length = cifs_read_from_socket(server, server->bigbuf,
+ min_t(unsigned int, remaining,
+ CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ remaining -= length;
+ }
+
+ dequeue_mid(mid, rdata->result);
+ return 0;
+}
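cifs_readv_discard() drains the rest of a frame by reading repeatedly into the scratch (big) buffer until total_read catches up with the RFC1002 length. Here is the same drain idiom as a plain POSIX C sketch; drain_remaining() and the bare read() call are illustrative only, not the driver's cifs_read_from_socket() path.

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/* Read and discard 'remaining' bytes from fd, reusing one scratch buffer. */
static int drain_remaining(int fd, size_t remaining)
{
	char scratch[4096];

	while (remaining > 0) {
		size_t chunk = remaining < sizeof(scratch) ?
					remaining : sizeof(scratch);
		ssize_t n = read(fd, scratch, chunk);

		if (n < 0)
			return -errno;	/* propagate the socket error */
		if (n == 0)
			return -EIO;	/* peer closed mid-frame */
		remaining -= (size_t)n;
	}
	return 0;
}

int main(void)
{
	/* discard up to 16 bytes from stdin, just to exercise the loop */
	printf("rc=%d\n", drain_remaining(0, 16));
	return 0;
}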
+
+int
+cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length, len;
+ unsigned int data_offset, data_len;
+ struct cifs_readdata *rdata = mid->callback_data;
+ char *buf = server->smallbuf;
+ unsigned int buflen = get_rfc1002_length(buf) + 4;
+
+ cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
+ __func__, mid->mid, rdata->offset, rdata->bytes);
+
+ /*
+ * read the rest of READ_RSP header (sans Data array), or whatever we
+ * can if there's not enough data. At this point, we've read down to
+ * the Mid.
+ */
+ len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
+ HEADER_SIZE(server) + 1;
+
+ rdata->iov.iov_base = buf + HEADER_SIZE(server) - 1;
+ rdata->iov.iov_len = len;
+
+ length = cifs_readv_from_socket(server, &rdata->iov, 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+
+ /* Was the SMB read successful? */
+ rdata->result = server->ops->map_error(buf, false);
+ if (rdata->result != 0) {
+ cifs_dbg(FYI, "%s: server returned error %d\n",
+ __func__, rdata->result);
+ return cifs_readv_discard(server, mid);
+ }
+
+ /* Is there enough to get to the rest of the READ_RSP header? */
+ if (server->total_read < server->vals->read_rsp_size) {
+ cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n",
+ __func__, server->total_read,
+ server->vals->read_rsp_size);
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ data_offset = server->ops->read_data_offset(buf) + 4;
+ if (data_offset < server->total_read) {
+ /*
+ * win2k8 sometimes sends an offset of 0 when the read
+ * is beyond the EOF. Treat it as if the data starts just after
+ * the header.
+ */
+ cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
+ __func__, data_offset);
+ data_offset = server->total_read;
+ } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
+ /* data_offset is beyond the end of smallbuf */
+ cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
+ __func__, data_offset);
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n",
+ __func__, server->total_read, data_offset);
+
+ len = data_offset - server->total_read;
+ if (len > 0) {
+ /* read any junk before data into the rest of smallbuf */
+ rdata->iov.iov_base = buf + server->total_read;
+ rdata->iov.iov_len = len;
+ length = cifs_readv_from_socket(server, &rdata->iov, 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ }
+
+ /* set up first iov for signature check */
+ rdata->iov.iov_base = buf;
+ rdata->iov.iov_len = server->total_read;
+ cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+ rdata->iov.iov_base, rdata->iov.iov_len);
+
+ /* how much data is in the response? */
+ data_len = server->ops->read_data_length(buf);
+ if (data_offset + data_len > buflen) {
+ /* data_len is corrupt -- discard frame */
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ length = rdata->read_into_pages(server, rdata, data_len);
+ if (length < 0)
+ return length;
+
+ server->total_read += length;
+ rdata->bytes = length;
+
+ cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
+ server->total_read, buflen, data_len);
+
+ /* discard anything left over */
+ if (server->total_read < buflen)
+ return cifs_readv_discard(server, mid);
+
+ dequeue_mid(mid, false);
+ return length;
+}
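Before copying anything, cifs_readv_receive() refuses to trust the offsets the server reports: a data offset pointing back inside the already-read header is clamped forward, one past the small buffer is rejected, and offset + length must stay inside the RFC1002 frame. The same checks in a standalone sketch; the buffer sizes here are illustrative, not the driver's real limits.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative limits; the real driver uses its own buffer sizes. */
#define SMALLBUF_SIZE	512u
#define FRAME_LEN	4096u	/* RFC1002 length + 4, i.e. the whole frame */

static bool read_rsp_sane(unsigned int total_read, unsigned int data_offset,
			  unsigned int data_len)
{
	if (data_offset < total_read)
		data_offset = total_read;	/* e.g. server sent offset 0 at EOF */
	else if (data_offset > SMALLBUF_SIZE)
		return false;			/* offset beyond the header buffer */
	if (data_offset + data_len > FRAME_LEN)
		return false;			/* length would run past the frame */
	return true;
}

int main(void)
{
	printf("%d\n", read_rsp_sane(68, 0, 100));	/* offset clamped, accepted */
	printf("%d\n", read_rsp_sane(68, 60, 5000));	/* rejected: too long */
	return 0;
}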
+
+static void
+cifs_readv_callback(struct mid_q_entry *mid)
+{
+ struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ struct smb_rqst rqst = { .rq_iov = &rdata->iov,
+ .rq_nvec = 1,
+ .rq_pages = rdata->pages,
+ .rq_npages = rdata->nr_pages,
+ .rq_pagesz = rdata->pagesz,
+ .rq_tailsz = rdata->tailsz };
+
+ cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+ __func__, mid->mid, mid->mid_state, rdata->result,
+ rdata->bytes);
+
+ switch (mid->mid_state) {
+ case MID_RESPONSE_RECEIVED:
+ /* result already set, check signature */
+ if (server->sign) {
+ int rc = 0;
+
+ rc = cifs_verify_signature(&rqst, server,
+ mid->sequence_number);
+ if (rc)
+ cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+ rc);
}
+ /* FIXME: should this be counted toward the initiating task? */
+ task_io_account_read(rdata->bytes);
+ cifs_stats_bytes_read(tcon, rdata->bytes);
+ break;
+ case MID_REQUEST_SUBMITTED:
+ case MID_RETRY_NEEDED:
+ rdata->result = -EAGAIN;
+ break;
+ default:
+ rdata->result = -EIO;
}
- cifs_buf_release(pSMB);
- if (rc == -EAGAIN)
- goto openRetry;
+ queue_work(cifsiod_wq, &rdata->work);
+ DeleteMidQEntry(mid);
+ add_credits(server, 1, 0);
+}
+
+/* cifs_async_readv - send an async read, and set up mid to handle result */
+int
+cifs_async_readv(struct cifs_readdata *rdata)
+{
+ int rc;
+ READ_REQ *smb = NULL;
+ int wct;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct smb_rqst rqst = { .rq_iov = &rdata->iov,
+ .rq_nvec = 1 };
+
+ cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
+ __func__, rdata->offset, rdata->bytes);
+
+ if (tcon->ses->capabilities & CAP_LARGE_FILES)
+ wct = 12;
+ else {
+ wct = 10; /* old style read */
+ if ((rdata->offset >> 32) > 0) {
+ /* can not handle this big offset for old */
+ return -EIO;
+ }
+ }
+
+ rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb);
+ if (rc)
+ return rc;
+
+ smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid);
+ smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
+
+ smb->AndXCommand = 0xFF; /* none */
+ smb->Fid = rdata->cfile->fid.netfid;
+ smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
+ if (wct == 12)
+ smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
+ smb->Remaining = 0;
+ smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF);
+ smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16);
+ if (wct == 12)
+ smb->ByteCount = 0;
+ else {
+ /* old style read */
+ struct smb_com_readx_req *smbr =
+ (struct smb_com_readx_req *)smb;
+ smbr->ByteCount = 0;
+ }
+
+	/* 4 for RFC1001 length */
+ rdata->iov.iov_base = smb;
+ rdata->iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
+
+ kref_get(&rdata->refcount);
+ rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
+ cifs_readv_callback, rdata, 0);
+
+ if (rc == 0)
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
+ else
+ kref_put(&rdata->refcount, cifs_readdata_release);
+
+ cifs_small_buf_release(smb);
return rc;
}
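Both read paths split the 64-bit file offset into OffsetLow/OffsetHigh and fall back to the 10-word legacy request, which has no OffsetHigh field, when the server lacks CAP_LARGE_FILES, refusing offsets that need the high word. A sketch of just that split and guard; split_offset() is a hypothetical helper, not a CIFS function.

#include <stdint.h>
#include <stdio.h>

/* Returns 0 on success, -1 if the offset cannot be expressed (legacy server). */
static int split_offset(uint64_t offset, int large_files_ok,
			uint32_t *low, uint32_t *high)
{
	if (!large_files_ok && (offset >> 32) > 0)
		return -1;		/* old-style read: no OffsetHigh field */
	*low  = (uint32_t)(offset & 0xFFFFFFFFu);
	*high = (uint32_t)(offset >> 32);
	return 0;
}

int main(void)
{
	uint32_t lo, hi;

	if (split_offset(0x123456789ULL, 1, &lo, &hi) == 0)
		printf("high=0x%x low=0x%x\n", hi, lo);	/* high=0x1 low=0x23456789 */
	printf("legacy rc=%d\n", split_offset(0x123456789ULL, 0, &lo, &hi));
	return 0;
}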
int
-CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
- const unsigned int count, const __u64 lseek, unsigned int *nbytes,
- char **buf, int *pbuf_type)
+CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, char **buf, int *pbuf_type)
{
int rc = -EACCES;
READ_REQ *pSMB = NULL;
@@ -1386,13 +1652,18 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
int wct;
int resp_buf_type = 0;
struct kvec iov[1];
+ __u32 pid = io_parms->pid;
+ __u16 netfid = io_parms->netfid;
+ __u64 offset = io_parms->offset;
+ struct cifs_tcon *tcon = io_parms->tcon;
+ unsigned int count = io_parms->length;
- cFYI(1, "Reading %d bytes on fid %d", count, netfid);
+ cifs_dbg(FYI, "Reading %d bytes on fid %d\n", count, netfid);
if (tcon->ses->capabilities & CAP_LARGE_FILES)
wct = 12;
else {
wct = 10; /* old style read */
- if ((lseek >> 32) > 0) {
+ if ((offset >> 32) > 0) {
/* can not handle this big offset for old */
return -EIO;
}
@@ -1403,15 +1674,18 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
if (rc)
return rc;
+ pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+ pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
+
/* tcon and ses pointer are checked in smb_init */
if (tcon->ses->server == NULL)
return -ECONNABORTED;
pSMB->AndXCommand = 0xFF; /* none */
pSMB->Fid = netfid;
- pSMB->OffsetLow = cpu_to_le32(lseek & 0xFFFFFFFF);
+ pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF);
if (wct == 12)
- pSMB->OffsetHigh = cpu_to_le32(lseek >> 32);
+ pSMB->OffsetHigh = cpu_to_le32(offset >> 32);
pSMB->Remaining = 0;
pSMB->MaxCount = cpu_to_le16(count & 0xFFFF);
@@ -1426,13 +1700,13 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
}
iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+ iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
&resp_buf_type, CIFS_LOG_ERROR);
- cifs_stats_inc(&tcon->num_reads);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
pSMBr = (READ_RSP *)iov[0].iov_base;
if (rc) {
- cERROR(1, "Send error in read = %d", rc);
+ cifs_dbg(VFS, "Send error in read = %d\n", rc);
} else {
int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
data_length = data_length << 16;
@@ -1442,7 +1716,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
/*check that DataLength would not go beyond end of SMB */
if ((data_length > CIFSMaxBufSize)
|| (data_length > count)) {
- cFYI(1, "bad length %d for count %d",
+ cifs_dbg(FYI, "bad length %d for count %d\n",
data_length, count);
rc = -EIO;
*nbytes = 0;
@@ -1450,7 +1724,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
pReadData = (char *) (&pSMBr->hdr.Protocol) +
le16_to_cpu(pSMBr->DataOffset);
/* if (rc = copy_to_user(buf, pReadData, data_length)) {
- cERROR(1, "Faulting on read rc = %d",rc);
+ cifs_dbg(VFS, "Faulting on read rc = %d\n",rc);
rc = -EFAULT;
}*/ /* can not use copy_to_user when using page cache*/
if (*buf)
@@ -1480,9 +1754,8 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
int
-CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
- const int netfid, const unsigned int count,
- const __u64 offset, unsigned int *nbytes, const char *buf,
+CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, const char *buf,
const char __user *ubuf, const int long_op)
{
int rc = -EACCES;
@@ -1491,10 +1764,15 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
int bytes_returned, wct;
__u32 bytes_sent;
__u16 byte_count;
+ __u32 pid = io_parms->pid;
+ __u16 netfid = io_parms->netfid;
+ __u64 offset = io_parms->offset;
+ struct cifs_tcon *tcon = io_parms->tcon;
+ unsigned int count = io_parms->length;
*nbytes = 0;
- /* cFYI(1, "write at %lld %d bytes", offset, count);*/
+ /* cifs_dbg(FYI, "write at %lld %d bytes\n", offset, count);*/
if (tcon->ses == NULL)
return -ECONNABORTED;
@@ -1512,6 +1790,10 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
(void **) &pSMBr);
if (rc)
return rc;
+
+ pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+ pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
+
/* tcon and ses pointer are checked in smb_init */
if (tcon->ses->server == NULL)
return -ECONNABORTED;
@@ -1560,7 +1842,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF);
pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
if (wct == 14)
pSMB->ByteCount = cpu_to_le16(byte_count);
@@ -1573,9 +1855,9 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, long_op);
- cifs_stats_inc(&tcon->num_writes);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
if (rc) {
- cFYI(1, "Send error in write = %d", rc);
+ cifs_dbg(FYI, "Send error in write = %d\n", rc);
} else {
*nbytes = le16_to_cpu(pSMBr->CountHigh);
*nbytes = (*nbytes) << 16;
@@ -1598,21 +1880,251 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
return rc;
}
+void
+cifs_writedata_release(struct kref *refcount)
+{
+ struct cifs_writedata *wdata = container_of(refcount,
+ struct cifs_writedata, refcount);
+
+ if (wdata->cfile)
+ cifsFileInfo_put(wdata->cfile);
+
+ kfree(wdata);
+}
+
+/*
+ * Write failed with a retryable error. Resend the write request. It's also
+ * possible that the page was redirtied so re-clean the page.
+ */
+static void
+cifs_writev_requeue(struct cifs_writedata *wdata)
+{
+ int i, rc;
+ struct inode *inode = wdata->cfile->dentry->d_inode;
+ struct TCP_Server_Info *server;
+
+ for (i = 0; i < wdata->nr_pages; i++) {
+ lock_page(wdata->pages[i]);
+ clear_page_dirty_for_io(wdata->pages[i]);
+ }
+
+ do {
+ server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ rc = server->ops->async_writev(wdata, cifs_writedata_release);
+ } while (rc == -EAGAIN);
+
+ for (i = 0; i < wdata->nr_pages; i++) {
+ unlock_page(wdata->pages[i]);
+ if (rc != 0) {
+ SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ page_cache_release(wdata->pages[i]);
+ }
+ }
+
+ mapping_set_error(inode->i_mapping, rc);
+ kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+void
+cifs_writev_complete(struct work_struct *work)
+{
+ struct cifs_writedata *wdata = container_of(work,
+ struct cifs_writedata, work);
+ struct inode *inode = wdata->cfile->dentry->d_inode;
+ int i = 0;
+
+ if (wdata->result == 0) {
+ spin_lock(&inode->i_lock);
+ cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
+ spin_unlock(&inode->i_lock);
+ cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
+ wdata->bytes);
+ } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
+ return cifs_writev_requeue(wdata);
+
+ for (i = 0; i < wdata->nr_pages; i++) {
+ struct page *page = wdata->pages[i];
+ if (wdata->result == -EAGAIN)
+ __set_page_dirty_nobuffers(page);
+ else if (wdata->result < 0)
+ SetPageError(page);
+ end_page_writeback(page);
+ page_cache_release(page);
+ }
+ if (wdata->result != -EAGAIN)
+ mapping_set_error(inode->i_mapping, wdata->result);
+ kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+struct cifs_writedata *
+cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
+{
+ struct cifs_writedata *wdata;
+
+ /* writedata + number of page pointers */
+ wdata = kzalloc(sizeof(*wdata) +
+ sizeof(struct page *) * nr_pages, GFP_NOFS);
+ if (wdata != NULL) {
+ kref_init(&wdata->refcount);
+ INIT_LIST_HEAD(&wdata->list);
+ init_completion(&wdata->done);
+ INIT_WORK(&wdata->work, complete);
+ }
+ return wdata;
+}
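cifs_writedata_alloc() grabs the writedata header and the trailing array of page pointers in a single zeroed allocation. A user-space analogue using a flexible array member; struct writedata below is illustrative, not the kernel's struct cifs_writedata.

#include <stdio.h>
#include <stdlib.h>

/* Illustrative analogue of cifs_writedata + trailing page pointers. */
struct writedata {
	unsigned int nr_pages;
	unsigned int bytes;
	void *pages[];			/* flexible array member */
};

static struct writedata *writedata_alloc(unsigned int nr_pages)
{
	struct writedata *wdata;

	/* one zeroed allocation: header plus nr_pages pointer slots */
	wdata = calloc(1, sizeof(*wdata) + sizeof(void *) * nr_pages);
	if (wdata)
		wdata->nr_pages = nr_pages;
	return wdata;
}

int main(void)
{
	struct writedata *wdata = writedata_alloc(16);

	if (!wdata)
		return 1;
	printf("allocated room for %u page pointers\n", wdata->nr_pages);
	free(wdata);
	return 0;
}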
+
+/*
+ * Check the mid_state and signature on received buffer (if any), and queue the
+ * workqueue completion task.
+ */
+static void
+cifs_writev_callback(struct mid_q_entry *mid)
+{
+ struct cifs_writedata *wdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ unsigned int written;
+ WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
+
+ switch (mid->mid_state) {
+ case MID_RESPONSE_RECEIVED:
+ wdata->result = cifs_check_receive(mid, tcon->ses->server, 0);
+ if (wdata->result != 0)
+ break;
+
+ written = le16_to_cpu(smb->CountHigh);
+ written <<= 16;
+ written += le16_to_cpu(smb->Count);
+ /*
+ * Mask off high 16 bits when bytes written as returned
+ * by the server is greater than bytes requested by the
+ * client. OS/2 servers are known to set incorrect
+ * CountHigh values.
+ */
+ if (written > wdata->bytes)
+ written &= 0xFFFF;
+
+ if (written < wdata->bytes)
+ wdata->result = -ENOSPC;
+ else
+ wdata->bytes = written;
+ break;
+ case MID_REQUEST_SUBMITTED:
+ case MID_RETRY_NEEDED:
+ wdata->result = -EAGAIN;
+ break;
+ default:
+ wdata->result = -EIO;
+ break;
+ }
+
+ queue_work(cifsiod_wq, &wdata->work);
+ DeleteMidQEntry(mid);
+ add_credits(tcon->ses->server, 1, 0);
+}
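The callback rebuilds the 32-bit byte count from the Count/CountHigh pair and then applies two sanity rules: a count larger than what was requested has its (known-bogus on OS/2 servers) high word masked off, and a short write is reported as -ENOSPC. The same arithmetic in isolation; check_written() is a hypothetical helper, not part of the driver.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Rebuild the written-byte count and apply the OS/2 CountHigh workaround. */
static int check_written(uint16_t count, uint16_t count_high,
			 uint32_t requested, uint32_t *written_out)
{
	uint32_t written = ((uint32_t)count_high << 16) + count;

	if (written > requested)
		written &= 0xFFFF;	/* drop the bogus high word */
	*written_out = written;
	return written < requested ? -ENOSPC : 0;
}

int main(void)
{
	uint32_t w;
	/* server claims 0x11000 bytes for a 0x1000-byte request */
	int rc = check_written(0x1000, 0x0001, 0x1000, &w);

	printf("written=0x%x rc=%d\n", w, rc);	/* written=0x1000 rc=0 */
	return 0;
}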
+
+/* cifs_async_writev - send an async write, and set up mid to handle result */
int
-CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
- const int netfid, const unsigned int count,
- const __u64 offset, unsigned int *nbytes, struct kvec *iov,
- int n_vec, const int long_op)
+cifs_async_writev(struct cifs_writedata *wdata,
+ void (*release)(struct kref *kref))
+{
+ int rc = -EACCES;
+ WRITE_REQ *smb = NULL;
+ int wct;
+ struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct kvec iov;
+ struct smb_rqst rqst = { };
+
+ if (tcon->ses->capabilities & CAP_LARGE_FILES) {
+ wct = 14;
+ } else {
+ wct = 12;
+ if (wdata->offset >> 32 > 0) {
+ /* can not handle big offset for old srv */
+ return -EIO;
+ }
+ }
+
+ rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **)&smb);
+ if (rc)
+ goto async_writev_out;
+
+ smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid);
+ smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16));
+
+ smb->AndXCommand = 0xFF; /* none */
+ smb->Fid = wdata->cfile->fid.netfid;
+ smb->OffsetLow = cpu_to_le32(wdata->offset & 0xFFFFFFFF);
+ if (wct == 14)
+ smb->OffsetHigh = cpu_to_le32(wdata->offset >> 32);
+ smb->Reserved = 0xFFFFFFFF;
+ smb->WriteMode = 0;
+ smb->Remaining = 0;
+
+ smb->DataOffset =
+ cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
+
+ /* 4 for RFC1001 length + 1 for BCC */
+ iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1;
+ iov.iov_base = smb;
+
+ rqst.rq_iov = &iov;
+ rqst.rq_nvec = 1;
+ rqst.rq_pages = wdata->pages;
+ rqst.rq_npages = wdata->nr_pages;
+ rqst.rq_pagesz = wdata->pagesz;
+ rqst.rq_tailsz = wdata->tailsz;
+
+ cifs_dbg(FYI, "async write at %llu %u bytes\n",
+ wdata->offset, wdata->bytes);
+
+ smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF);
+ smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16);
+
+ if (wct == 14) {
+ inc_rfc1001_len(&smb->hdr, wdata->bytes + 1);
+ put_bcc(wdata->bytes + 1, &smb->hdr);
+ } else {
+ /* wct == 12 */
+ struct smb_com_writex_req *smbw =
+ (struct smb_com_writex_req *)smb;
+ inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5);
+ put_bcc(wdata->bytes + 5, &smbw->hdr);
+ iov.iov_len += 4; /* pad bigger by four bytes */
+ }
+
+ kref_get(&wdata->refcount);
+ rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
+ cifs_writev_callback, wdata, 0);
+
+ if (rc == 0)
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
+ else
+ kref_put(&wdata->refcount, release);
+
+async_writev_out:
+ cifs_small_buf_release(smb);
+ return rc;
+}
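inc_rfc1001_len() bumps the big-endian 4-byte length prefix that frames every SMB on the wire; the async write path adds the data bytes plus a 1-byte pad for the wct == 14 format, or 5 bytes for the older wct == 12 layout. A toy illustration of maintaining such a prefix; struct frame and inc_frame_len() are stand-ins, not the kernel's smb_hdr and inc_rfc1001_len().

#include <arpa/inet.h>	/* htonl/ntohl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A toy frame: 4-byte big-endian length (RFC1001-style) + payload. */
struct frame {
	uint32_t be_len;	/* length of everything after these 4 bytes */
	unsigned char body[256];
};

/* Grow the advertised length, mirroring what inc_rfc1001_len() does. */
static void inc_frame_len(struct frame *f, unsigned int count)
{
	f->be_len = htonl(ntohl(f->be_len) + count);
}

int main(void)
{
	struct frame f;

	memset(&f, 0, sizeof(f));
	f.be_len = htonl(48);		/* header already accounted for */
	inc_frame_len(&f, 100 + 1);	/* data bytes + 1-byte pad (wct == 14) */
	printf("frame length now %u\n", ntohl(f.be_len));	/* 149 */
	return 0;
}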
+
+int
+CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, struct kvec *iov, int n_vec)
{
int rc = -EACCES;
WRITE_REQ *pSMB = NULL;
int wct;
int smb_hdr_len;
int resp_buf_type = 0;
+ __u32 pid = io_parms->pid;
+ __u16 netfid = io_parms->netfid;
+ __u64 offset = io_parms->offset;
+ struct cifs_tcon *tcon = io_parms->tcon;
+ unsigned int count = io_parms->length;
*nbytes = 0;
- cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
+ cifs_dbg(FYI, "write2 at %lld %d bytes\n", (long long)offset, count);
if (tcon->ses->capabilities & CAP_LARGE_FILES) {
wct = 14;
@@ -1626,6 +2138,10 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB);
if (rc)
return rc;
+
+ pSMB->hdr.Pid = cpu_to_le16((__u16)pid);
+ pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid >> 16));
+
/* tcon and ses pointer are checked in smb_init */
if (tcon->ses->server == NULL)
return -ECONNABORTED;
@@ -1644,11 +2160,12 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
pSMB->DataLengthLow = cpu_to_le16(count & 0xFFFF);
pSMB->DataLengthHigh = cpu_to_le16(count >> 16);
- smb_hdr_len = pSMB->hdr.smb_buf_length + 1; /* hdr + 1 byte pad */
+ /* header + 1 byte pad */
+ smb_hdr_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 1;
if (wct == 14)
- pSMB->hdr.smb_buf_length += count+1;
+ inc_rfc1001_len(pSMB, count + 1);
else /* wct == 12 */
- pSMB->hdr.smb_buf_length += count+5; /* smb data starts later */
+ inc_rfc1001_len(pSMB, count + 5); /* smb data starts later */
if (wct == 14)
pSMB->ByteCount = cpu_to_le16(count + 1);
else /* wct == 12 */ /* bigger pad, smaller smb hdr, keep offset ok */ {
@@ -1663,11 +2180,10 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
iov[0].iov_len = smb_hdr_len + 8;
- rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type,
- long_op);
- cifs_stats_inc(&tcon->num_writes);
+ rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
if (rc) {
- cFYI(1, "Send error Write2 = %d", rc);
+ cifs_dbg(FYI, "Send error Write2 = %d\n", rc);
} else if (resp_buf_type == 0) {
/* presumably this can not happen, but best to be safe */
rc = -EIO;
@@ -1698,10 +2214,51 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
return rc;
}
+int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u16 netfid, const __u8 lock_type, const __u32 num_unlock,
+ const __u32 num_lock, LOCKING_ANDX_RANGE *buf)
+{
+ int rc = 0;
+ LOCK_REQ *pSMB = NULL;
+ struct kvec iov[2];
+ int resp_buf_type;
+ __u16 count;
+
+ cifs_dbg(FYI, "cifs_lockv num lock %d num unlock %d\n",
+ num_lock, num_unlock);
+
+ rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
+ if (rc)
+ return rc;
+
+ pSMB->Timeout = 0;
+ pSMB->NumberOfLocks = cpu_to_le16(num_lock);
+ pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock);
+ pSMB->LockType = lock_type;
+ pSMB->AndXCommand = 0xFF; /* none */
+ pSMB->Fid = netfid; /* netfid stays le */
+
+ count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+ inc_rfc1001_len(pSMB, count);
+ pSMB->ByteCount = cpu_to_le16(count);
+
+ iov[0].iov_base = (char *)pSMB;
+ iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 -
+ (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+ iov[1].iov_base = (char *)buf;
+ iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
+
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
+ rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
+ if (rc)
+ cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc);
+
+ return rc;
+}
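cifs_lockv() sends the request as two iovecs: iov[0] covers the fixed header (the frame length minus the lock array) and iov[1] points at the caller-supplied LOCKING_ANDX_RANGE array, so the ranges never have to be copied into the SMB buffer. A user-space sketch of assembling such a split request; the structures and sizes here are illustrative only.

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* Illustrative stand-in for LOCKING_ANDX_RANGE. */
struct lock_range {
	unsigned short pid;
	unsigned int offset;
	unsigned int length;
};

int main(void)
{
	char hdr[64];			/* stand-in for the SMB header + fixed fields */
	struct lock_range ranges[3];	/* caller-supplied array, sent as-is */
	struct iovec iov[2];

	memset(hdr, 0, sizeof(hdr));
	memset(ranges, 0, sizeof(ranges));

	iov[0].iov_base = hdr;
	iov[0].iov_len  = sizeof(hdr);		/* header only, no ranges */
	iov[1].iov_base = ranges;
	iov[1].iov_len  = sizeof(ranges);	/* ranges supplied separately */

	/* writev(sockfd, iov, 2) would push both pieces in one call */
	printf("sending %zu + %zu bytes in 2 vectors\n",
	       iov[0].iov_len, iov[1].iov_len);
	return 0;
}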
int
-CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
- const __u16 smb_file_id, const __u64 len,
+CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u16 smb_file_id, const __u32 netpid, const __u64 len,
const __u64 offset, const __u32 numUnlock,
const __u32 numLock, const __u8 lockType,
const bool waitFlag, const __u8 oplock_level)
@@ -1710,20 +2267,22 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
LOCK_REQ *pSMB = NULL;
/* LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */
int bytes_returned;
- int timeout = 0;
+ int flags = 0;
__u16 count;
- cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
+ cifs_dbg(FYI, "CIFSSMBLock timeout %d numLock %d\n",
+ (int)waitFlag, numLock);
rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
if (rc)
return rc;
if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) {
- timeout = CIFS_ASYNC_OP; /* no response expected */
+ /* no response expected */
+ flags = CIFS_ASYNC_OP | CIFS_OBREAK_OP;
pSMB->Timeout = 0;
} else if (waitFlag) {
- timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */
+ flags = CIFS_BLOCKING_OP; /* blocking operation, no timeout */
pSMB->Timeout = cpu_to_le32(-1);/* blocking - do not time out */
} else {
pSMB->Timeout = 0;
@@ -1737,7 +2296,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
pSMB->Fid = smb_file_id; /* netfid stays le */
if ((numLock != 0) || (numUnlock != 0)) {
- pSMB->Locks[0].Pid = cpu_to_le16(current->tgid);
+ pSMB->Locks[0].Pid = cpu_to_le16(netpid);
/* BB where to store pid high? */
pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len);
pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32));
@@ -1748,7 +2307,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
/* oplock break */
count = 0;
}
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
if (waitFlag) {
@@ -1756,13 +2315,12 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
(struct smb_hdr *) pSMB, &bytes_returned);
cifs_small_buf_release(pSMB);
} else {
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB,
- timeout);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *)pSMB, flags);
/* SMB buffer freed by function above */
}
- cifs_stats_inc(&tcon->num_locks);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
if (rc)
- cFYI(1, "Send error in Lock = %d", rc);
+ cifs_dbg(FYI, "Send error in Lock = %d\n", rc);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@ -1770,8 +2328,9 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
}
int
-CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
- const __u16 smb_file_id, const int get_flag, const __u64 len,
+CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u16 smb_file_id, const __u32 netpid,
+ const loff_t start_offset, const __u64 len,
struct file_lock *pLockData, const __u16 lock_type,
const bool waitFlag)
{
@@ -1785,10 +2344,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
__u16 params, param_offset, offset, byte_count, count;
struct kvec iov[1];
- cFYI(1, "Posix Lock");
-
- if (pLockData == NULL)
- return -EINVAL;
+ cifs_dbg(FYI, "Posix Lock\n");
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
@@ -1810,7 +2366,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */
pSMB->SetupCount = 1;
pSMB->Reserved3 = 0;
- if (get_flag)
+ if (pLockData)
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
else
pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
@@ -1831,22 +2387,22 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
} else
pSMB->Timeout = 0;
- parm_data->pid = cpu_to_le32(current->tgid);
- parm_data->start = cpu_to_le64(pLockData->fl_start);
+ parm_data->pid = cpu_to_le32(netpid);
+ parm_data->start = cpu_to_le64(start_offset);
parm_data->length = cpu_to_le64(len); /* normalize negative numbers */
pSMB->DataOffset = cpu_to_le16(offset);
pSMB->Fid = smb_file_id;
pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
if (waitFlag) {
rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned);
} else {
iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+ iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
&resp_buf_type, timeout);
pSMB = NULL; /* request buf already freed by SendReceive2. Do
@@ -1855,14 +2411,14 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
}
if (rc) {
- cFYI(1, "Send error in Posix Lock = %d", rc);
- } else if (get_flag) {
+ cifs_dbg(FYI, "Send error in Posix Lock = %d\n", rc);
+ } else if (pLockData) {
/* lock structure can be returned on get */
__u16 data_offset;
__u16 data_count;
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(struct cifs_posix_lock))) {
+ if (rc || get_bcc(&pSMBr->hdr) < sizeof(*parm_data)) {
rc = -EIO; /* bad smb */
goto plk_err_exit;
}
@@ -1884,10 +2440,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
__constant_cpu_to_le16(CIFS_WRLCK))
pLockData->fl_type = F_WRLCK;
- pLockData->fl_start = parm_data->start;
- pLockData->fl_end = parm_data->start +
- parm_data->length - 1;
- pLockData->fl_pid = parm_data->pid;
+ pLockData->fl_start = le64_to_cpu(parm_data->start);
+ pLockData->fl_end = pLockData->fl_start +
+ le64_to_cpu(parm_data->length) - 1;
+ pLockData->fl_pid = le32_to_cpu(parm_data->pid);
}
}
@@ -1908,11 +2464,11 @@ plk_err_exit:
int
-CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
{
int rc = 0;
CLOSE_REQ *pSMB = NULL;
- cFYI(1, "In CIFSSMBClose");
+ cifs_dbg(FYI, "In CIFSSMBClose\n");
/* do not retry on dead session on close */
rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -1924,12 +2480,12 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
pSMB->FileID = (__u16) smb_file_id;
pSMB->LastWriteTime = 0xFFFFFFFF;
pSMB->ByteCount = 0;
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
- cifs_stats_inc(&tcon->num_closes);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_closes);
if (rc) {
if (rc != -EINTR) {
/* EINTR is expected when user ctl-c to kill app */
- cERROR(1, "Send error in Close = %d", rc);
+ cifs_dbg(VFS, "Send error in Close = %d\n", rc);
}
}
@@ -1941,11 +2497,11 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
}
int
-CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
{
int rc = 0;
FLUSH_REQ *pSMB = NULL;
- cFYI(1, "In CIFSSMBFlush");
+ cifs_dbg(FYI, "In CIFSSMBFlush\n");
rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
if (rc)
@@ -1953,18 +2509,18 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
pSMB->FileID = (__u16) smb_file_id;
pSMB->ByteCount = 0;
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
- cifs_stats_inc(&tcon->num_flushes);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_flushes);
if (rc)
- cERROR(1, "Send error in Flush = %d", rc);
+ cifs_dbg(VFS, "Send error in Flush = %d\n", rc);
return rc;
}
int
-CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
- const char *fromName, const char *toName,
- const struct nls_table *nls_codepage, int remap)
+CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
int rc = 0;
RENAME_REQ *pSMB = NULL;
@@ -1972,8 +2528,9 @@ CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
int bytes_returned;
int name_len, name_len2;
__u16 count;
+ int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
- cFYI(1, "In CIFSSMBRename");
+ cifs_dbg(FYI, "In CIFSSMBRename\n");
renameRetry:
rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -1986,40 +2543,41 @@ renameRetry:
ATTR_DIRECTORY);
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len =
- cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+ from_name, PATH_MAX,
+ cifs_sb->local_nls, remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->OldFileName[name_len] = 0x04; /* pad */
/* protocol requires ASCII signature byte on Unicode string */
pSMB->OldFileName[name_len + 1] = 0x00;
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ to_name, PATH_MAX, cifs_sb->local_nls,
+ remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
- name_len = strnlen(fromName, PATH_MAX);
+ name_len = strnlen(from_name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->OldFileName, fromName, name_len);
- name_len2 = strnlen(toName, PATH_MAX);
+ strncpy(pSMB->OldFileName, from_name, name_len);
+ name_len2 = strnlen(to_name, PATH_MAX);
name_len2++; /* trailing null */
pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */
- strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2);
+ strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2);
name_len2++; /* trailing null */
name_len2++; /* signature byte */
}
count = 1 /* 1st signature byte */ + name_len + name_len2;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_renames);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_renames);
if (rc)
- cFYI(1, "Send error in rename = %d", rc);
+ cifs_dbg(FYI, "Send error in rename = %d\n", rc);
cifs_buf_release(pSMB);
@@ -2029,7 +2587,7 @@ renameRetry:
return rc;
}
-int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
+int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
int netfid, const char *target_name,
const struct nls_table *nls_codepage, int remap)
{
@@ -2043,7 +2601,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
int len_of_str;
__u16 params, param_offset, offset, count, byte_count;
- cFYI(1, "Rename to File by handle");
+ cifs_dbg(FYI, "Rename to File by handle\n");
rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
(void **) &pSMBr);
if (rc)
@@ -2076,10 +2634,12 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
/* unicode only call */
if (target_name == NULL) {
sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid);
- len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+ len_of_str =
+ cifsConvertToUTF16((__le16 *)rename_info->target_name,
dummy_string, 24, nls_codepage, remap);
} else {
- len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+ len_of_str =
+ cifsConvertToUTF16((__le16 *)rename_info->target_name,
target_name, PATH_MAX, nls_codepage,
remap);
}
@@ -2092,13 +2652,14 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
pSMB->InformationLevel =
cpu_to_le16(SMB_SET_FILE_RENAME_INFORMATION);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&pTcon->num_t2renames);
+ cifs_stats_inc(&pTcon->stats.cifs_stats.num_t2renames);
if (rc)
- cFYI(1, "Send error in Rename (by file handle) = %d", rc);
+ cifs_dbg(FYI, "Send error in Rename (by file handle) = %d\n",
+ rc);
cifs_buf_release(pSMB);
@@ -2109,9 +2670,9 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
}
int
-CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
- const __u16 target_tid, const char *toName, const int flags,
- const struct nls_table *nls_codepage, int remap)
+CIFSSMBCopy(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *fromName, const __u16 target_tid, const char *toName,
+ const int flags, const struct nls_table *nls_codepage, int remap)
{
int rc = 0;
COPY_REQ *pSMB = NULL;
@@ -2120,7 +2681,7 @@ CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
int name_len, name_len2;
__u16 count;
- cFYI(1, "In CIFSSMBCopy");
+ cifs_dbg(FYI, "In CIFSSMBCopy\n");
copyRetry:
rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -2133,17 +2694,17 @@ copyRetry:
pSMB->Flags = cpu_to_le16(flags & COPY_TREE);
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName,
- fromName, PATH_MAX, nls_codepage,
- remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+ fromName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->OldFileName[name_len] = 0x04; /* pad */
/* protocol requires ASCII signature byte on Unicode string */
pSMB->OldFileName[name_len + 1] = 0x00;
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -2159,14 +2720,14 @@ copyRetry:
}
count = 1 /* 1st signature byte */ + name_len + name_len2;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in copy = %d with %d files copied",
- rc, le16_to_cpu(pSMBr->CopyCount));
+ cifs_dbg(FYI, "Send error in copy = %d with %d files copied\n",
+ rc, le16_to_cpu(pSMBr->CopyCount));
}
cifs_buf_release(pSMB);
@@ -2177,7 +2738,7 @@ copyRetry:
}
int
-CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
+CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon,
const char *fromName, const char *toName,
const struct nls_table *nls_codepage)
{
@@ -2190,7 +2751,7 @@ CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
int bytes_returned = 0;
__u16 params, param_offset, offset, byte_count;
- cFYI(1, "In Symlink Unix style");
+ cifs_dbg(FYI, "In Symlink Unix style\n");
createSymLinkRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -2199,9 +2760,9 @@ createSymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX
- /* find define for this maxpathcomponent */
- , nls_codepage);
+ cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName,
+ /* find define for this maxpathcomponent */
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
@@ -2223,9 +2784,9 @@ createSymLinkRetry:
data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
- cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX
- /* find define for this maxpathcomponent */
- , nls_codepage);
+ cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX
+ /* find define for this maxpathcomponent */
+ , nls_codepage);
name_len_target++; /* trailing null */
name_len_target *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -2249,13 +2810,14 @@ createSymLinkRetry:
pSMB->DataOffset = cpu_to_le16(offset);
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_LINK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_symlinks);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_symlinks);
if (rc)
- cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
+ cifs_dbg(FYI, "Send error in SetPathInfo create symlink = %d\n",
+ rc);
cifs_buf_release(pSMB);
@@ -2266,7 +2828,7 @@ createSymLinkRetry:
}
int
-CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
+CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
const char *fromName, const char *toName,
const struct nls_table *nls_codepage, int remap)
{
@@ -2279,7 +2841,7 @@ CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
int bytes_returned = 0;
__u16 params, param_offset, offset, byte_count;
- cFYI(1, "In Create Hard link Unix style");
+ cifs_dbg(FYI, "In Create Hard link Unix style\n");
createHardLinkRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -2287,8 +2849,8 @@ createHardLinkRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
@@ -2310,8 +2872,8 @@ createHardLinkRetry:
data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
- cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX,
- nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) data_offset, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len_target++; /* trailing null */
name_len_target *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -2335,13 +2897,14 @@ createHardLinkRetry:
pSMB->DataOffset = cpu_to_le16(offset);
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_HLINK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_hardlinks);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks);
if (rc)
- cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
+ cifs_dbg(FYI, "Send error in SetPathInfo (hard link) = %d\n",
+ rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -2351,9 +2914,9 @@ createHardLinkRetry:
}
int
-CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
- const char *fromName, const char *toName,
- const struct nls_table *nls_codepage, int remap)
+CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
int rc = 0;
NT_RENAME_REQ *pSMB = NULL;
@@ -2361,8 +2924,9 @@ CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
int bytes_returned;
int name_len, name_len2;
__u16 count;
+ int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
- cFYI(1, "In CIFSCreateHardLink");
+ cifs_dbg(FYI, "In CIFSCreateHardLink\n");
winCreateHardLinkRetry:
rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2380,8 +2944,8 @@ winCreateHardLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->OldFileName, from_name,
+ PATH_MAX, cifs_sb->local_nls, remap);
name_len++; /* trailing null */
name_len *= 2;
@@ -2389,31 +2953,32 @@ winCreateHardLinkRetry:
pSMB->OldFileName[name_len] = 0x04;
pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ to_name, PATH_MAX, cifs_sb->local_nls,
+ remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
- name_len = strnlen(fromName, PATH_MAX);
+ name_len = strnlen(from_name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->OldFileName, fromName, name_len);
- name_len2 = strnlen(toName, PATH_MAX);
+ strncpy(pSMB->OldFileName, from_name, name_len);
+ name_len2 = strnlen(to_name, PATH_MAX);
name_len2++; /* trailing null */
pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */
- strncpy(&pSMB->OldFileName[name_len + 1], toName, name_len2);
+ strncpy(&pSMB->OldFileName[name_len + 1], to_name, name_len2);
name_len2++; /* trailing null */
name_len2++; /* signature byte */
}
count = 1 /* string type byte */ + name_len + name_len2;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_hardlinks);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks);
if (rc)
- cFYI(1, "Send error in hard link (NT rename) = %d", rc);
+ cifs_dbg(FYI, "Send error in hard link (NT rename) = %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -2423,7 +2988,7 @@ winCreateHardLinkRetry:
}
int
-CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName, char **symlinkinfo,
const struct nls_table *nls_codepage)
{
@@ -2436,7 +3001,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
__u16 params, byte_count;
char *data_start;
- cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
+ cifs_dbg(FYI, "In QPathSymLinkInfo (Unix) for path %s\n", searchName);
querySymLinkRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2446,8 +3011,8 @@ querySymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage);
+ cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -2477,19 +3042,19 @@ querySymLinkRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_LINK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QuerySymLinkInfo = %d\n", rc);
} else {
/* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
/* BB also check enough total bytes returned */
- if (rc || (pSMBr->ByteCount < 2))
+ if (rc || get_bcc(&pSMBr->hdr) < 2)
rc = -EIO;
else {
bool is_unicode;
@@ -2504,8 +3069,8 @@ querySymLinkRetry:
is_unicode = false;
/* BB FIXME investigate remapping reserved chars here */
- *symlinkinfo = cifs_strndup_from_ucs(data_start, count,
- is_unicode, nls_codepage);
+ *symlinkinfo = cifs_strndup_from_utf16(data_start,
+ count, is_unicode, nls_codepage);
if (!*symlinkinfo)
rc = -ENOMEM;
}
@@ -2516,19 +3081,34 @@ querySymLinkRetry:
return rc;
}
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+/*
+ * Recent Windows versions now create symlinks more frequently
+ * and they use the "reparse point" mechanism below. We can of course
+ * do symlinks nicely to Samba and other servers which support the
+ * CIFS Unix Extensions and we can also do SFU symlinks and "client only"
+ * "MF" symlinks optionally, but for recent Windows we really need to
+ * reenable the code below and fix the cifs_symlink callers to handle this.
+ * In the interim this code has been moved to its own config option so
+ * it is not compiled in by default until callers fixed up and more tested.
+ */
int
-CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
- const unsigned char *searchName,
- char *symlinkinfo, const int buflen, __u16 fid,
- const struct nls_table *nls_codepage)
+CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
+ __u16 fid, char **symlinkinfo,
+ const struct nls_table *nls_codepage)
{
int rc = 0;
int bytes_returned;
struct smb_com_transaction_ioctl_req *pSMB;
struct smb_com_transaction_ioctl_rsp *pSMBr;
+ bool is_unicode;
+ unsigned int sub_len;
+ char *sub_start;
+ struct reparse_symlink_data *reparse_buf;
+ struct reparse_posix_data *posix_buf;
+ __u32 data_offset, data_count;
+ char *end_of_smb;
- cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
+ cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid);
rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
(void **) &pSMBr);
if (rc)
@@ -2538,8 +3118,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le32(2);
/* BB find exact data count max from sess structure BB */
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 4;
pSMB->Reserved = 0;
pSMB->ParameterOffset = 0;
@@ -2557,68 +3136,137 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
- } else { /* decode response */
- __u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
- __u32 data_count = le32_to_cpu(pSMBr->DataCount);
- if ((pSMBr->ByteCount < 2) || (data_offset > 512)) {
+ cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc);
+ goto qreparse_out;
+ }
+
+ data_offset = le32_to_cpu(pSMBr->DataOffset);
+ data_count = le32_to_cpu(pSMBr->DataCount);
+ if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
/* BB also check enough total bytes returned */
- rc = -EIO; /* bad smb */
+ rc = -EIO; /* bad smb */
+ goto qreparse_out;
+ }
+ if (!data_count || (data_count > 2048)) {
+ rc = -EIO;
+ cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
+ goto qreparse_out;
+ }
+ end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
+ reparse_buf = (struct reparse_symlink_data *)
+ ((char *)&pSMBr->hdr.Protocol + data_offset);
+ if ((char *)reparse_buf >= end_of_smb) {
+ rc = -EIO;
+ goto qreparse_out;
+ }
+ if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) {
+ cifs_dbg(FYI, "NFS style reparse tag\n");
+ posix_buf = (struct reparse_posix_data *)reparse_buf;
+
+ if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) {
+ cifs_dbg(FYI, "unsupported file type 0x%llx\n",
+ le64_to_cpu(posix_buf->InodeType));
+ rc = -EOPNOTSUPP;
goto qreparse_out;
}
- if (data_count && (data_count < 2048)) {
- char *end_of_smb = 2 /* sizeof byte count */ +
- pSMBr->ByteCount + (char *)&pSMBr->ByteCount;
-
- struct reparse_data *reparse_buf =
- (struct reparse_data *)
- ((char *)&pSMBr->hdr.Protocol
- + data_offset);
- if ((char *)reparse_buf >= end_of_smb) {
- rc = -EIO;
- goto qreparse_out;
- }
- if ((reparse_buf->LinkNamesBuf +
- reparse_buf->TargetNameOffset +
- reparse_buf->TargetNameLen) > end_of_smb) {
- cFYI(1, "reparse buf beyond SMB");
- rc = -EIO;
- goto qreparse_out;
- }
-
- if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
- cifs_from_ucs2(symlinkinfo, (__le16 *)
- (reparse_buf->LinkNamesBuf +
- reparse_buf->TargetNameOffset),
- buflen,
- reparse_buf->TargetNameLen,
- nls_codepage, 0);
- } else { /* ASCII names */
- strncpy(symlinkinfo,
- reparse_buf->LinkNamesBuf +
- reparse_buf->TargetNameOffset,
- min_t(const int, buflen,
- reparse_buf->TargetNameLen));
- }
- } else {
+ is_unicode = true;
+ sub_len = le16_to_cpu(reparse_buf->ReparseDataLength);
+ if (posix_buf->PathBuffer + sub_len > end_of_smb) {
+ cifs_dbg(FYI, "reparse buf beyond SMB\n");
rc = -EIO;
- cFYI(1, "Invalid return data count on "
- "get reparse info ioctl");
+ goto qreparse_out;
}
- symlinkinfo[buflen] = 0; /* just in case so the caller
- does not go off the end of the buffer */
- cFYI(1, "readlink result - %s", symlinkinfo);
+ *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer,
+ sub_len, is_unicode, nls_codepage);
+ goto qreparse_out;
+ } else if (reparse_buf->ReparseTag !=
+ cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) {
+ rc = -EOPNOTSUPP;
+ goto qreparse_out;
+ }
+
+ /* Reparse tag is NTFS symlink */
+ sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) +
+ reparse_buf->PathBuffer;
+ sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength);
+ if (sub_start + sub_len > end_of_smb) {
+ cifs_dbg(FYI, "reparse buf beyond SMB\n");
+ rc = -EIO;
+ goto qreparse_out;
}
+ if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+ is_unicode = true;
+ else
+ is_unicode = false;
+ /* BB FIXME investigate remapping reserved chars here */
+ *symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode,
+ nls_codepage);
+ if (!*symlinkinfo)
+ rc = -ENOMEM;
qreparse_out:
cifs_buf_release(pSMB);
- /* Note: On -EAGAIN error only caller can retry on handle based calls
- since file handle passed in no longer valid */
+ /*
+ * Note: On -EAGAIN error only caller can retry on handle based calls
+ * since file handle passed in no longer valid.
+ */
+ return rc;
+}
+
+int
+CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ __u16 fid)
+{
+ int rc = 0;
+ int bytes_returned;
+ struct smb_com_transaction_compr_ioctl_req *pSMB;
+ struct smb_com_transaction_ioctl_rsp *pSMBr;
+
+ cifs_dbg(FYI, "Set compression for %u\n", fid);
+ rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+ (void **) &pSMBr);
+ if (rc)
+ return rc;
+ pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+ pSMB->TotalParameterCount = 0;
+ pSMB->TotalDataCount = __constant_cpu_to_le32(2);
+ pSMB->MaxParameterCount = 0;
+ pSMB->MaxDataCount = 0;
+ pSMB->MaxSetupCount = 4;
+ pSMB->Reserved = 0;
+ pSMB->ParameterOffset = 0;
+ pSMB->DataCount = __constant_cpu_to_le32(2);
+ pSMB->DataOffset =
+ cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
+ compression_state) - 4); /* 84 */
+ pSMB->SetupCount = 4;
+ pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL);
+ pSMB->ParameterCount = 0;
+ pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION);
+ pSMB->IsFsctl = 1; /* FSCTL */
+ pSMB->IsRootFlag = 0;
+ pSMB->Fid = fid; /* file handle always le */
+ /* 3 byte pad, followed by 2 byte compress state */
+ pSMB->ByteCount = __constant_cpu_to_le16(5);
+ inc_rfc1001_len(pSMB, 5);
+
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+ (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+ if (rc)
+ cifs_dbg(FYI, "Send error in SetCompression = %d\n", rc);
+
+ cifs_buf_release(pSMB);
+
+ /*
+ * Note: On -EAGAIN error only caller can retry on handle based calls
+ * since file handle passed in no longer valid.
+ */
return rc;
}
-#endif /* CIFS_EXPERIMENTAL */
+
#ifdef CONFIG_CIFS_POSIX
@@ -2630,7 +3278,10 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
- /* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
+/*
+ cifs_dbg(FYI, "perm %d tag %d id %d\n",
+ ace->e_perm, ace->e_tag, ace->e_id);
+*/
return;
}
@@ -2656,8 +3307,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
size += sizeof(struct cifs_posix_ace) * count;
/* check if we would go beyond end of SMB */
if (size_of_data_area < size) {
- cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
- size_of_data_area, size);
+ cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. %d\n",
+ size_of_data_area, size);
return -EINVAL;
}
} else if (acl_type & ACL_TYPE_DEFAULT) {
@@ -2704,7 +3355,10 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
cifs_ace->cifs_uid = cpu_to_le64(-1);
} else
cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
- /*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
+/*
+ cifs_dbg(FYI, "perm %d tag %d id %d\n",
+ ace->e_perm, ace->e_tag, ace->e_id);
+*/
return rc;
}
@@ -2722,21 +3376,22 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
return 0;
count = posix_acl_xattr_count((size_t)buflen);
- cFYI(1, "setting acl with %d entries from buf of length %d and "
- "version of %d",
- count, buflen, le32_to_cpu(local_acl->a_version));
+ cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n",
+ count, buflen, le32_to_cpu(local_acl->a_version));
if (le32_to_cpu(local_acl->a_version) != 2) {
- cFYI(1, "unknown POSIX ACL version %d",
- le32_to_cpu(local_acl->a_version));
+ cifs_dbg(FYI, "unknown POSIX ACL version %d\n",
+ le32_to_cpu(local_acl->a_version));
return 0;
}
cifs_acl->version = cpu_to_le16(1);
- if (acl_type == ACL_TYPE_ACCESS)
+ if (acl_type == ACL_TYPE_ACCESS) {
cifs_acl->access_entry_count = cpu_to_le16(count);
- else if (acl_type == ACL_TYPE_DEFAULT)
+ cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF);
+ } else if (acl_type == ACL_TYPE_DEFAULT) {
cifs_acl->default_entry_count = cpu_to_le16(count);
- else {
- cFYI(1, "unknown ACL type %d", acl_type);
+ cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF);
+ } else {
+ cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
}
for (i = 0; i < count; i++) {
@@ -2756,7 +3411,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
}
int
-CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName,
char *acl_inf, const int buflen, const int acl_type,
const struct nls_table *nls_codepage, int remap)
@@ -2769,7 +3424,7 @@ CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
int name_len;
__u16 params, byte_count;
- cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
+ cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName);
queryAclRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2779,8 +3434,9 @@ queryAclRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->FileName[name_len] = 0;
@@ -2814,20 +3470,20 @@ queryAclRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_acl_get);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get);
if (rc) {
- cFYI(1, "Send error in Query POSIX ACL = %d", rc);
+ cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc);
} else {
/* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 2))
/* BB also check enough total bytes returned */
+ if (rc || get_bcc(&pSMBr->hdr) < 2)
rc = -EIO; /* bad smb */
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -2844,7 +3500,7 @@ queryAclRetry:
}
int
-CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *fileName,
const char *local_acl, const int buflen,
const int acl_type,
@@ -2858,7 +3514,7 @@ CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
int bytes_returned = 0;
__u16 params, byte_count, data_count, param_offset, offset;
- cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
+ cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName);
setAclRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -2866,8 +3522,8 @@ setAclRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -2908,12 +3564,12 @@ setAclRetry:
pSMB->ParameterCount = cpu_to_le16(params);
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "Set POSIX ACL returned %d", rc);
+ cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc);
setACLerrorExit:
cifs_buf_release(pSMB);
@@ -2924,7 +3580,7 @@ setACLerrorExit:
/* BB fix tabs in this function FIXME BB */
int
-CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
+CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask)
{
int rc = 0;
@@ -2933,7 +3589,7 @@ CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
int bytes_returned;
__u16 params, byte_count;
- cFYI(1, "In GetExtAttr");
+ cifs_dbg(FYI, "In GetExtAttr\n");
if (tcon == NULL)
return -ENODEV;
@@ -2966,18 +3622,18 @@ GetExtAttrRetry:
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_ATTR_FLAGS);
pSMB->Pad = 0;
pSMB->Fid = netfid;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->t2.ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "error %d in GetExtAttr", rc);
+ cifs_dbg(FYI, "error %d in GetExtAttr\n", rc);
} else {
/* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 2))
/* BB also check enough total bytes returned */
+ if (rc || get_bcc(&pSMBr->hdr) < 2)
/* If rc should we check for EOPNOSUPP and
disable the srvino flag? or in caller? */
rc = -EIO; /* bad smb */
@@ -2987,7 +3643,7 @@ GetExtAttrRetry:
struct file_chattr_info *pfinfo;
/* BB Do we need a cast or hash here ? */
if (count != 16) {
- cFYI(1, "Illegal size ret in GetExtAttr");
+ cifs_dbg(FYI, "Illegal size ret in GetExtAttr\n");
rc = -EIO;
goto GetExtAttrOut;
}
@@ -3017,7 +3673,7 @@ GetExtAttrOut:
*/
static int
smb_init_nttransact(const __u16 sub_command, const int setup_count,
- const int parm_len, struct cifsTconInfo *tcon,
+ const int parm_len, struct cifs_tcon *tcon,
void **ret_buf)
{
int rc;
@@ -3032,8 +3688,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count,
pSMB->Reserved = 0;
pSMB->TotalParameterCount = cpu_to_le32(parm_len);
pSMB->TotalDataCount = 0;
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->DataCount = pSMB->TotalDataCount;
temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
@@ -3052,6 +3707,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
char *end_of_smb;
__u32 data_count, data_offset, parm_count, parm_offset;
struct smb_com_ntransact_rsp *pSMBr;
+ u16 bcc;
*pdatalen = 0;
*pparmlen = 0;
@@ -3061,8 +3717,8 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
pSMBr = (struct smb_com_ntransact_rsp *)buf;
- /* ByteCount was converted from little endian in SendReceive */
- end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
+ bcc = get_bcc(&pSMBr->hdr);
+ end_of_smb = 2 /* sizeof byte count */ + bcc +
(char *)&pSMBr->ByteCount;
data_offset = le32_to_cpu(pSMBr->DataOffset);
@@ -3075,21 +3731,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
/* should we also check that parm and data areas do not overlap? */
if (*ppparm > end_of_smb) {
- cFYI(1, "parms start after end of smb");
+ cifs_dbg(FYI, "parms start after end of smb\n");
return -EINVAL;
} else if (parm_count + *ppparm > end_of_smb) {
- cFYI(1, "parm end after end of smb");
+ cifs_dbg(FYI, "parm end after end of smb\n");
return -EINVAL;
} else if (*ppdata > end_of_smb) {
- cFYI(1, "data starts after end of smb");
+ cifs_dbg(FYI, "data starts after end of smb\n");
return -EINVAL;
} else if (data_count + *ppdata > end_of_smb) {
- cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
- *ppdata, data_count, (data_count + *ppdata),
- end_of_smb, pSMBr);
+ cifs_dbg(FYI, "data %p + count %d (%p) past smb end %p start %p\n",
+ *ppdata, data_count, (data_count + *ppdata),
+ end_of_smb, pSMBr);
return -EINVAL;
- } else if (parm_count + data_count > pSMBr->ByteCount) {
- cFYI(1, "parm count and data count larger than SMB");
+ } else if (parm_count + data_count > bcc) {
+ cifs_dbg(FYI, "parm count and data count larger than SMB\n");
return -EINVAL;
}
*pdatalen = data_count;
@@ -3099,7 +3755,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
/* Get Security Descriptor (by handle) from remote server for a file or dir */
int
-CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
+CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
struct cifs_ntsd **acl_inf, __u32 *pbuflen)
{
int rc = 0;
@@ -3107,7 +3763,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
QUERY_SEC_DESC_REQ *pSMB;
struct kvec iov[1];
- cFYI(1, "GetCifsACL");
+ cifs_dbg(FYI, "GetCifsACL\n");
*pbuflen = 0;
*acl_inf = NULL;
@@ -3124,15 +3780,15 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP |
CIFS_ACL_DACL);
pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */
- pSMB->hdr.smb_buf_length += 11;
+ inc_rfc1001_len(pSMB, 11);
iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+ iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
0);
- cifs_stats_inc(&tcon->num_acl_get);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get);
if (rc) {
- cFYI(1, "Send error in QuerySecDesc = %d", rc);
+ cifs_dbg(FYI, "Send error in QuerySecDesc = %d\n", rc);
} else { /* decode response */
__le32 *parm;
__u32 parm_len;
@@ -3147,7 +3803,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
goto qsec_out;
pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
- cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
+ cifs_dbg(FYI, "smb %p parm %p data %p\n",
+ pSMBr, parm, *acl_inf);
if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
rc = -EIO; /* bad smb */
@@ -3159,8 +3816,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
acl_len = le32_to_cpu(*parm);
if (acl_len != *pbuflen) {
- cERROR(1, "acl length %d does not match %d",
- acl_len, *pbuflen);
+ cifs_dbg(VFS, "acl length %d does not match %d\n",
+ acl_len, *pbuflen);
if (*pbuflen > acl_len)
*pbuflen = acl_len;
}
@@ -3169,16 +3826,15 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
header followed by the smallest SID */
if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
(*pbuflen >= 64 * 1024)) {
- cERROR(1, "bad acl length %d", *pbuflen);
+ cifs_dbg(VFS, "bad acl length %d\n", *pbuflen);
rc = -EINVAL;
*pbuflen = 0;
} else {
- *acl_inf = kmalloc(*pbuflen, GFP_KERNEL);
+ *acl_inf = kmemdup(pdata, *pbuflen, GFP_KERNEL);
if (*acl_inf == NULL) {
*pbuflen = 0;
rc = -ENOMEM;
}
- memcpy(*acl_inf, pdata, *pbuflen);
}
}
qsec_out:
@@ -3191,20 +3847,19 @@ qsec_out:
}
int
-CIFSSMBSetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
- struct cifs_ntsd *pntsd, __u32 acllen)
+CIFSSMBSetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
+ struct cifs_ntsd *pntsd, __u32 acllen, int aclflag)
{
__u16 byte_count, param_count, data_count, param_offset, data_offset;
int rc = 0;
int bytes_returned = 0;
SET_SEC_DESC_REQ *pSMB = NULL;
- NTRANSACT_RSP *pSMBr = NULL;
+ void *pSMBr;
setCifsAclRetry:
- rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB, &pSMBr);
if (rc)
- return (rc);
+ return rc;
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
@@ -3229,23 +3884,22 @@ setCifsAclRetry:
pSMB->Fid = fid; /* file handle always le */
pSMB->Reserved2 = 0;
- pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL);
+ pSMB->AclFlags = cpu_to_le32(aclflag);
if (pntsd && acllen) {
- memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
- (char *) pntsd,
- acllen);
- pSMB->hdr.smb_buf_length += (byte_count + data_count);
-
+ memcpy((char *)pSMBr + offsetof(struct smb_hdr, Protocol) +
+ data_offset, pntsd, acllen);
+ inc_rfc1001_len(pSMB, byte_count + data_count);
} else
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
+ cifs_dbg(FYI, "SetCIFSACL bytes_returned: %d, rc: %d\n",
+ bytes_returned, rc);
if (rc)
- cFYI(1, "Set CIFS ACL returned %d", rc);
+ cifs_dbg(FYI, "Set CIFS ACL returned %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -3258,10 +3912,10 @@ setCifsAclRetry:
/* Legacy Query Path Information call for lookup to old servers such
as Win9x/WinME */
-int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
- const unsigned char *searchName,
- FILE_ALL_INFO *pFinfo,
- const struct nls_table *nls_codepage, int remap)
+int
+SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *search_name, FILE_ALL_INFO *data,
+ const struct nls_table *nls_codepage, int remap)
{
QUERY_INFORMATION_REQ *pSMB;
QUERY_INFORMATION_RSP *pSMBr;
@@ -3269,7 +3923,7 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
int bytes_returned;
int name_len;
- cFYI(1, "In SMBQPath path %s", searchName);
+ cifs_dbg(FYI, "In SMBQPath path %s\n", search_name);
QInfRetry:
rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -3278,41 +3932,42 @@ QInfRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ search_name, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else {
- name_len = strnlen(searchName, PATH_MAX);
+ name_len = strnlen(search_name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->FileName, searchName, name_len);
+ strncpy(pSMB->FileName, search_name, name_len);
}
pSMB->BufferFormat = 0x04;
name_len++; /* account for buffer type byte */
- pSMB->hdr.smb_buf_length += (__u16) name_len;
+ inc_rfc1001_len(pSMB, (__u16)name_len);
pSMB->ByteCount = cpu_to_le16(name_len);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QueryInfo = %d", rc);
- } else if (pFinfo) {
+ cifs_dbg(FYI, "Send error in QueryInfo = %d\n", rc);
+ } else if (data) {
struct timespec ts;
__u32 time = le32_to_cpu(pSMBr->last_write_time);
/* decode response */
/* BB FIXME - add time zone adjustment BB */
- memset(pFinfo, 0, sizeof(FILE_ALL_INFO));
+ memset(data, 0, sizeof(FILE_ALL_INFO));
ts.tv_nsec = 0;
ts.tv_sec = time;
/* decode time fields */
- pFinfo->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts));
- pFinfo->LastWriteTime = pFinfo->ChangeTime;
- pFinfo->LastAccessTime = 0;
- pFinfo->AllocationSize =
+ data->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts));
+ data->LastWriteTime = data->ChangeTime;
+ data->LastAccessTime = 0;
+ data->AllocationSize =
cpu_to_le64(le32_to_cpu(pSMBr->size));
- pFinfo->EndOfFile = pFinfo->AllocationSize;
- pFinfo->Attributes =
+ data->EndOfFile = data->AllocationSize;
+ data->Attributes =
cpu_to_le32(le16_to_cpu(pSMBr->attr));
} else
rc = -EIO; /* bad buffer passed in */
@@ -3326,7 +3981,7 @@ QInfRetry:
}
int
-CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
u16 netfid, FILE_ALL_INFO *pFindData)
{
struct smb_t2_qfi_req *pSMB = NULL;
@@ -3364,18 +4019,19 @@ QFileInfoRetry:
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
pSMB->Pad = 0;
pSMB->Fid = netfid;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
+ pSMB->t2.ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QPathInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QFileInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
if (rc) /* BB add auto retry on EOPNOTSUPP? */
rc = -EIO;
- else if (pSMBr->ByteCount < 40)
+ else if (get_bcc(&pSMBr->hdr) < 40)
rc = -EIO; /* bad smb */
else if (pFindData) {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3393,13 +4049,12 @@ QFileInfoRetry:
}
int
-CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
- const unsigned char *searchName,
- FILE_ALL_INFO *pFindData,
+CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *search_name, FILE_ALL_INFO *data,
int legacy /* old style infolevel */,
const struct nls_table *nls_codepage, int remap)
{
-/* level 263 SMB_QUERY_FILE_ALL_INFO */
+ /* level 263 SMB_QUERY_FILE_ALL_INFO */
TRANSACTION2_QPI_REQ *pSMB = NULL;
TRANSACTION2_QPI_RSP *pSMBr = NULL;
int rc = 0;
@@ -3407,7 +4062,7 @@ CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
int name_len;
__u16 params, byte_count;
-/* cFYI(1, "In QPathInfo path %s", searchName); */
+ /* cifs_dbg(FYI, "In QPathInfo path %s\n", search_name); */
QPathInfoRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -3416,14 +4071,14 @@ QPathInfoRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, search_name,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
- name_len = strnlen(searchName, PATH_MAX);
+ name_len = strnlen(search_name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->FileName, searchName, name_len);
+ strncpy(pSMB->FileName, search_name, name_len);
}
params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
@@ -3451,37 +4106,38 @@ QPathInfoRetry:
else
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QPathInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
if (rc) /* BB add auto retry on EOPNOTSUPP? */
rc = -EIO;
- else if (!legacy && (pSMBr->ByteCount < 40))
+ else if (!legacy && get_bcc(&pSMBr->hdr) < 40)
rc = -EIO; /* bad smb */
- else if (legacy && (pSMBr->ByteCount < 24))
+ else if (legacy && get_bcc(&pSMBr->hdr) < 24)
rc = -EIO; /* 24 or 26 expected but we do not read
last field */
- else if (pFindData) {
+ else if (data) {
int size;
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
- /* On legacy responses we do not read the last field,
- EAsize, fortunately since it varies by subdialect and
- also note it differs on Set vs. Get, ie two bytes or 4
- bytes depending but we don't care here */
+ /*
+ * On legacy responses we do not read the last field,
+ * EAsize, fortunately, since it varies by subdialect and
+ * also differs on Set vs. Get (two bytes or four bytes),
+ * but we don't care here.
+ */
if (legacy)
size = sizeof(FILE_INFO_STANDARD);
else
size = sizeof(FILE_ALL_INFO);
- memcpy((char *) pFindData,
- (char *) &pSMBr->hdr.Protocol +
+ memcpy((char *) data, (char *) &pSMBr->hdr.Protocol +
data_offset, size);
} else
rc = -ENOMEM;
@@ -3494,7 +4150,7 @@ QPathInfoRetry:
}
int
-CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixQFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
{
struct smb_t2_qfi_req *pSMB = NULL;
@@ -3532,19 +4188,18 @@ UnixQFileInfoRetry:
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
pSMB->Pad = 0;
pSMB->Fid = netfid;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
+ pSMB->t2.ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QPathInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in UnixQFileInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
- cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
- "Unix Extensions can be disabled on mount "
- "by specifying the nosfu mount option.");
+ if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
+ cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n");
rc = -EIO; /* bad smb */
} else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3563,7 +4218,7 @@ UnixQFileInfoRetry:
}
int
-CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName,
FILE_UNIX_BASIC_INFO *pFindData,
const struct nls_table *nls_codepage, int remap)
@@ -3576,7 +4231,7 @@ CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
int name_len;
__u16 params, byte_count;
- cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
+ cifs_dbg(FYI, "In QPathInfo (Unix) the path %s\n", searchName);
UnixQPathInfoRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -3585,8 +4240,8 @@ UnixQPathInfoRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3617,20 +4272,18 @@ UnixQPathInfoRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QPathInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in UnixQPathInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
- cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
- "Unix Extensions can be disabled on mount "
- "by specifying the nosfu mount option.");
+ if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
+ cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n");
rc = -EIO; /* bad smb */
} else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3649,11 +4302,10 @@ UnixQPathInfoRetry:
/* xid, tcon, searchName and codepage are input parms, rest are returned */
int
-CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
- const char *searchName,
- const struct nls_table *nls_codepage,
- __u16 *pnetfid,
- struct cifs_search_info *psrch_inf, int remap, const char dirsep)
+CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *searchName, struct cifs_sb_info *cifs_sb,
+ __u16 *pnetfid, __u16 search_flags,
+ struct cifs_search_info *psrch_inf, bool msearch)
{
/* level 257 SMB_ */
TRANSACTION2_FFIRST_REQ *pSMB = NULL;
@@ -3661,10 +4313,11 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
T2_FFIRST_RSP_PARMS *parms;
int rc = 0;
int bytes_returned = 0;
- int name_len;
+ int name_len, remap;
__u16 params, byte_count;
+ struct nls_table *nls_codepage;
- cFYI(1, "In FindFirst for %s", searchName);
+ cifs_dbg(FYI, "In FindFirst for %s\n", searchName);
findFirstRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3672,39 +4325,46 @@ findFirstRetry:
if (rc)
return rc;
+ nls_codepage = cifs_sb->local_nls;
+ remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
/* We can not add the asterik earlier in case
it got remapped to 0xF03A as if it were part of the
directory name instead of a wildcard */
name_len *= 2;
- pSMB->FileName[name_len] = dirsep;
- pSMB->FileName[name_len+1] = 0;
- pSMB->FileName[name_len+2] = '*';
- pSMB->FileName[name_len+3] = 0;
- name_len += 4; /* now the trailing null */
- pSMB->FileName[name_len] = 0; /* null terminate just in case */
- pSMB->FileName[name_len+1] = 0;
- name_len += 2;
+ if (msearch) {
+ pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb);
+ pSMB->FileName[name_len+1] = 0;
+ pSMB->FileName[name_len+2] = '*';
+ pSMB->FileName[name_len+3] = 0;
+ name_len += 4; /* now the trailing null */
+ /* null terminate just in case */
+ pSMB->FileName[name_len] = 0;
+ pSMB->FileName[name_len+1] = 0;
+ name_len += 2;
+ }
} else { /* BB add check for overrun of SMB buf BB */
name_len = strnlen(searchName, PATH_MAX);
/* BB fix here and in unicode clause above ie
if (name_len > buffersize-header)
free buffer exit; BB */
strncpy(pSMB->FileName, searchName, name_len);
- pSMB->FileName[name_len] = dirsep;
- pSMB->FileName[name_len+1] = '*';
- pSMB->FileName[name_len+2] = 0;
- name_len += 3;
+ if (msearch) {
+ pSMB->FileName[name_len] = CIFS_DIR_SEP(cifs_sb);
+ pSMB->FileName[name_len+1] = '*';
+ pSMB->FileName[name_len+2] = 0;
+ name_len += 3;
+ }
}
params = 12 + name_len /* includes null */ ;
pSMB->TotalDataCount = 0; /* no EAs */
pSMB->MaxParameterCount = cpu_to_le16(10);
- pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -3725,23 +4385,22 @@ findFirstRetry:
cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM |
ATTR_DIRECTORY);
pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO));
- pSMB->SearchFlags = cpu_to_le16(CIFS_SEARCH_CLOSE_AT_END |
- CIFS_SEARCH_RETURN_RESUME);
+ pSMB->SearchFlags = cpu_to_le16(search_flags);
pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
/* BB what should we set StorageType to? Does it matter? BB */
pSMB->SearchStorageType = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_ffirst);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_ffirst);
if (rc) {/* BB add logic to retry regular search if Unix search
rejected unexpectedly by server */
/* BB Add code to handle unsupported level rc */
- cFYI(1, "Error in FindFirst = %d", rc);
+ cifs_dbg(FYI, "Error in FindFirst = %d\n", rc);
cifs_buf_release(pSMB);
@@ -3778,9 +4437,8 @@ findFirstRetry:
psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
psrch_inf->entries_in_buffer;
lnoff = le16_to_cpu(parms->LastNameOffset);
- if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
- lnoff) {
- cERROR(1, "ignoring corrupt resume name");
+ if (CIFSMaxBufSize < lnoff) {
+ cifs_dbg(VFS, "ignoring corrupt resume name\n");
psrch_inf->last_entry = NULL;
return rc;
}
@@ -3788,7 +4446,8 @@ findFirstRetry:
psrch_inf->last_entry = psrch_inf->srch_entries_start +
lnoff;
- *pnetfid = parms->SearchHandle;
+ if (pnetfid)
+ *pnetfid = parms->SearchHandle;
} else {
cifs_buf_release(pSMB);
}
@@ -3797,18 +4456,20 @@ findFirstRetry:
return rc;
}
-int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
- __u16 searchHandle, struct cifs_search_info *psrch_inf)
+int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
+ __u16 searchHandle, __u16 search_flags,
+ struct cifs_search_info *psrch_inf)
{
TRANSACTION2_FNEXT_REQ *pSMB = NULL;
TRANSACTION2_FNEXT_RSP *pSMBr = NULL;
T2_FNEXT_RSP_PARMS *parms;
char *response_data;
int rc = 0;
- int bytes_returned, name_len;
+ int bytes_returned;
+ unsigned int name_len;
__u16 params, byte_count;
- cFYI(1, "In FindNext");
+ cifs_dbg(FYI, "In FindNext\n");
if (psrch_inf->endOfSearch)
return -ENOENT;
@@ -3822,9 +4483,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
byte_count = 0;
pSMB->TotalDataCount = 0; /* no EAs */
pSMB->MaxParameterCount = cpu_to_le16(8);
- pSMB->MaxDataCount =
- cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) &
- 0xFFFFFF00);
+ pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -3842,8 +4501,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
cpu_to_le16(CIFSMaxBufSize / sizeof(FILE_UNIX_INFO));
pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
pSMB->ResumeKey = psrch_inf->resume_key;
- pSMB->SearchFlags =
- cpu_to_le16(CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME);
+ pSMB->SearchFlags = cpu_to_le16(search_flags);
name_len = psrch_inf->resume_name_len;
params += name_len;
@@ -3860,19 +4518,19 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
byte_count = params + 1 /* pad */ ;
pSMB->TotalParameterCount = cpu_to_le16(params);
pSMB->ParameterCount = pSMB->TotalParameterCount;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
- cifs_stats_inc(&tcon->num_fnext);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_fnext);
if (rc) {
if (rc == -EBADF) {
psrch_inf->endOfSearch = true;
cifs_buf_release(pSMB);
rc = 0; /* search probably was closed at end of search*/
} else
- cFYI(1, "FindNext returned = %d", rc);
+ cifs_dbg(FYI, "FindNext returned = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3906,17 +4564,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
psrch_inf->index_of_last_entry +=
psrch_inf->entries_in_buffer;
lnoff = le16_to_cpu(parms->LastNameOffset);
- if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
- lnoff) {
- cERROR(1, "ignoring corrupt resume name");
+ if (CIFSMaxBufSize < lnoff) {
+ cifs_dbg(VFS, "ignoring corrupt resume name\n");
psrch_inf->last_entry = NULL;
return rc;
} else
psrch_inf->last_entry =
psrch_inf->srch_entries_start + lnoff;
-/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
- psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
+/* cifs_dbg(FYI, "fnxt2 entries in buf %d index_of_last %d\n",
+ psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
/* BB fixme add unlock here */
}
@@ -3935,13 +4592,13 @@ FNext2_err_exit:
}
int
-CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
+CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon,
const __u16 searchHandle)
{
int rc = 0;
FINDCLOSE_REQ *pSMB = NULL;
- cFYI(1, "In CIFSSMBFindClose");
+ cifs_dbg(FYI, "In CIFSSMBFindClose\n");
rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
/* no sense returning error if session restarted
@@ -3953,11 +4610,11 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
pSMB->FileID = searchHandle;
pSMB->ByteCount = 0;
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
- cERROR(1, "Send error in FindClose = %d", rc);
+ cifs_dbg(VFS, "Send error in FindClose = %d\n", rc);
- cifs_stats_inc(&tcon->num_fclose);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_fclose);
/* Since session is dead, search handle closed on server already */
if (rc == -EAGAIN)
@@ -3967,9 +4624,8 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
}
int
-CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
- const unsigned char *searchName,
- __u64 *inode_number,
+CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *search_name, __u64 *inode_number,
const struct nls_table *nls_codepage, int remap)
{
int rc = 0;
@@ -3978,7 +4634,7 @@ CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
int name_len, bytes_returned;
__u16 params, byte_count;
- cFYI(1, "In GetSrvInodeNum for %s", searchName);
+ cifs_dbg(FYI, "In GetSrvInodeNum for %s\n", search_name);
if (tcon == NULL)
return -ENODEV;
@@ -3990,14 +4646,15 @@ GetInodeNumberRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ search_name, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
- name_len = strnlen(searchName, PATH_MAX);
+ name_len = strnlen(search_name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->FileName, searchName, name_len);
+ strncpy(pSMB->FileName, search_name, name_len);
}
params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ;
@@ -4022,18 +4679,18 @@ GetInodeNumberRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_INTERNAL_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "error %d in QueryInternalInfo", rc);
+ cifs_dbg(FYI, "error %d in QueryInternalInfo\n", rc);
} else {
/* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 2))
/* BB also check enough total bytes returned */
+ if (rc || get_bcc(&pSMBr->hdr) < 2)
/* If rc should we check for EOPNOSUPP and
disable the srvino flag? or in caller? */
rc = -EIO; /* bad smb */
@@ -4043,7 +4700,7 @@ GetInodeNumberRetry:
struct file_internal_info *pfinfo;
/* BB Do we need a cast or hash here ? */
if (count < 8) {
- cFYI(1, "Illegal size ret in QryIntrnlInf");
+ cifs_dbg(FYI, "Illegal size ret in QryIntrnlInf\n");
rc = -EIO;
goto GetInodeNumOut;
}
@@ -4084,16 +4741,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
if (*num_of_nodes < 1) {
- cERROR(1, "num_referrals: must be at least > 0,"
- "but we get num_referrals = %d\n", *num_of_nodes);
+ cifs_dbg(VFS, "num_referrals must be greater than 0, but we get num_referrals = %d\n",
+ *num_of_nodes);
rc = -EINVAL;
goto parse_DFS_referrals_exit;
}
ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
if (ref->VersionNumber != cpu_to_le16(3)) {
- cERROR(1, "Referrals of V%d version are not supported,"
- "should be V3", le16_to_cpu(ref->VersionNumber));
+ cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n",
+ le16_to_cpu(ref->VersionNumber));
rc = -EINVAL;
goto parse_DFS_referrals_exit;
}
@@ -4102,14 +4759,12 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
data_end = (char *)(&(pSMBr->PathConsumed)) +
le16_to_cpu(pSMBr->t2.DataCount);
- cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n",
- *num_of_nodes,
- le32_to_cpu(pSMBr->DFSFlags));
+ cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n",
+ *num_of_nodes, le32_to_cpu(pSMBr->DFSFlags));
- *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
- *num_of_nodes, GFP_KERNEL);
+ *target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param),
+ GFP_KERNEL);
if (*target_nodes == NULL) {
- cERROR(1, "Failed to allocate buffer for target_nodes\n");
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
@@ -4128,9 +4783,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
- cifsConvertToUCS((__le16 *) tmp, searchName,
- PATH_MAX, nls_codepage, remap);
- node->path_consumed = cifs_ucs2_bytes(tmp,
+ cifsConvertToUTF16((__le16 *) tmp, searchName,
+ PATH_MAX, nls_codepage, remap);
+ node->path_consumed = cifs_utf16_bytes(tmp,
le16_to_cpu(pSMBr->PathConsumed),
nls_codepage);
kfree(tmp);
@@ -4143,8 +4798,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
/* copy DfsPath */
temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
max_len = data_end - temp;
- node->path_name = cifs_strndup_from_ucs(temp, max_len,
- is_unicode, nls_codepage);
+ node->path_name = cifs_strndup_from_utf16(temp, max_len,
+ is_unicode, nls_codepage);
if (!node->path_name) {
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
@@ -4153,10 +4808,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
/* copy link target UNC */
temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
max_len = data_end - temp;
- node->node_name = cifs_strndup_from_ucs(temp, max_len,
- is_unicode, nls_codepage);
- if (!node->node_name)
+ node->node_name = cifs_strndup_from_utf16(temp, max_len,
+ is_unicode, nls_codepage);
+ if (!node->node_name) {
rc = -ENOMEM;
+ goto parse_DFS_referrals_exit;
+ }
+
+ ref++;
}
parse_DFS_referrals_exit:
@@ -4169,9 +4828,8 @@ parse_DFS_referrals_exit:
}
int
-CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
- const unsigned char *searchName,
- struct dfs_info3_param **target_nodes,
+CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
+ const char *search_name, struct dfs_info3_param **target_nodes,
unsigned int *num_of_nodes,
const struct nls_table *nls_codepage, int remap)
{
@@ -4185,7 +4843,7 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
*num_of_nodes = 0;
*target_nodes = NULL;
- cFYI(1, "In GetDFSRefer the path %s", searchName);
+ cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name);
if (ses == NULL)
return -ENODEV;
getDFSRetry:
@@ -4196,7 +4854,7 @@ getDFSRetry:
/* server pointer checked in called function,
but should never be null here anyway */
- pSMB->hdr.Mid = GetNextMid(ses->server);
+ pSMB->hdr.Mid = get_next_mid(ses->server);
pSMB->hdr.Tid = ses->ipc_tid;
pSMB->hdr.Uid = ses->Suid;
if (ses->capabilities & CAP_STATUS32)
@@ -4207,21 +4865,19 @@ getDFSRetry:
if (ses->capabilities & CAP_UNICODE) {
pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
name_len =
- cifsConvertToUCS((__le16 *) pSMB->RequestFileName,
- searchName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->RequestFileName,
+ search_name, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
- name_len = strnlen(searchName, PATH_MAX);
+ name_len = strnlen(search_name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->RequestFileName, searchName, name_len);
+ strncpy(pSMB->RequestFileName, search_name, name_len);
}
- if (ses->server) {
- if (ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
- }
+ if (ses->server && ses->server->sign)
+ pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
pSMB->hdr.Uid = ses->Suid;
@@ -4246,31 +4902,30 @@ getDFSRetry:
pSMB->ParameterCount = cpu_to_le16(params);
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->MaxReferralLevel = cpu_to_le16(3);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in GetDFSRefer = %d", rc);
+ cifs_dbg(FYI, "Send error in GetDFSRefer = %d\n", rc);
goto GetDFSRefExit;
}
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
/* BB Also check if enough total bytes returned? */
- if (rc || (pSMBr->ByteCount < 17)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 17) {
rc = -EIO; /* bad smb */
goto GetDFSRefExit;
}
- cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
- pSMBr->ByteCount,
- le16_to_cpu(pSMBr->t2.DataOffset));
+ cifs_dbg(FYI, "Decoding GetDFSRefer response BCC: %d Offset %d\n",
+ get_bcc(&pSMBr->hdr), le16_to_cpu(pSMBr->t2.DataOffset));
/* parse returned result into more usable form */
rc = parse_DFS_referrals(pSMBr, num_of_nodes,
target_nodes, nls_codepage, remap,
- searchName);
+ search_name);
GetDFSRefExit:
cifs_buf_release(pSMB);
@@ -4283,7 +4938,8 @@ GetDFSRefExit:
/* Query File System Info such as free space to old servers such as Win 9x */
int
-SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
+SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
+ struct kstatfs *FSData)
{
/* level 0x01 SMB_QUERY_FILE_SYSTEM_INFO */
TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4293,7 +4949,7 @@ SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "OldQFSInfo");
+ cifs_dbg(FYI, "OldQFSInfo\n");
oldQFSInfoRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4320,22 +4976,22 @@ oldQFSInfoRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QFSInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 18))
+ if (rc || get_bcc(&pSMBr->hdr) < 18)
rc = -EIO; /* bad smb */
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
- cFYI(1, "qfsinf resp BCC: %d Offset %d",
- pSMBr->ByteCount, data_offset);
+ cifs_dbg(FYI, "qfsinf resp BCC: %d Offset %d\n",
+ get_bcc(&pSMBr->hdr), data_offset);
response_data = (FILE_SYSTEM_ALLOC_INFO *)
(((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4347,10 +5003,10 @@ oldQFSInfoRetry:
le32_to_cpu(response_data->TotalAllocationUnits);
FSData->f_bfree = FSData->f_bavail =
le32_to_cpu(response_data->FreeAllocationUnits);
- cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
- (unsigned long long)FSData->f_blocks,
- (unsigned long long)FSData->f_bfree,
- FSData->f_bsize);
+ cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n",
+ (unsigned long long)FSData->f_blocks,
+ (unsigned long long)FSData->f_bfree,
+ FSData->f_bsize);
}
}
cifs_buf_release(pSMB);
@@ -4362,7 +5018,8 @@ oldQFSInfoRetry:
}
int
-CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
+CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
+ struct kstatfs *FSData)
{
/* level 0x103 SMB_QUERY_FILE_SYSTEM_INFO */
TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4372,7 +5029,7 @@ CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSInfo");
+ cifs_dbg(FYI, "In QFSInfo\n");
QFSInfoRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4399,17 +5056,17 @@ QFSInfoRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QFSInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 24))
+ if (rc || get_bcc(&pSMBr->hdr) < 24)
rc = -EIO; /* bad smb */
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4426,10 +5083,10 @@ QFSInfoRetry:
le64_to_cpu(response_data->TotalAllocationUnits);
FSData->f_bfree = FSData->f_bavail =
le64_to_cpu(response_data->FreeAllocationUnits);
- cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
- (unsigned long long)FSData->f_blocks,
- (unsigned long long)FSData->f_bfree,
- FSData->f_bsize);
+ cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n",
+ (unsigned long long)FSData->f_blocks,
+ (unsigned long long)FSData->f_bfree,
+ FSData->f_bsize);
}
}
cifs_buf_release(pSMB);
@@ -4441,7 +5098,7 @@ QFSInfoRetry:
}
int
-CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
+CIFSSMBQFSAttributeInfo(const unsigned int xid, struct cifs_tcon *tcon)
{
/* level 0x105 SMB_QUERY_FILE_SYSTEM_INFO */
TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4451,7 +5108,7 @@ CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSAttributeInfo");
+ cifs_dbg(FYI, "In QFSAttributeInfo\n");
QFSAttributeRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4479,17 +5136,17 @@ QFSAttributeRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
+ cifs_dbg(VFS, "Send error in QFSAttributeInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 13)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 13) {
/* BB also check if enough bytes returned */
rc = -EIO; /* bad smb */
} else {
@@ -4511,7 +5168,7 @@ QFSAttributeRetry:
}
int
-CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
+CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon)
{
/* level 0x104 SMB_QUERY_FILE_SYSTEM_INFO */
TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4521,7 +5178,7 @@ CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSDeviceInfo");
+ cifs_dbg(FYI, "In QFSDeviceInfo\n");
QFSDeviceRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4550,17 +5207,18 @@ QFSDeviceRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QFSDeviceInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(FILE_SYSTEM_DEVICE_INFO)))
+ if (rc || get_bcc(&pSMBr->hdr) <
+ sizeof(FILE_SYSTEM_DEVICE_INFO))
rc = -EIO; /* bad smb */
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4581,7 +5239,7 @@ QFSDeviceRetry:
}
int
-CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
+CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon)
{
/* level 0x200 SMB_QUERY_CIFS_UNIX_INFO */
TRANSACTION2_QFSI_REQ *pSMB = NULL;
@@ -4591,7 +5249,7 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSUnixInfo");
+ cifs_dbg(FYI, "In QFSUnixInfo\n");
QFSUnixRetry:
rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
(void **) &pSMB, (void **) &pSMBr);
@@ -4619,17 +5277,17 @@ QFSUnixRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cERROR(1, "Send error in QFSUnixInfo = %d", rc);
+ cifs_dbg(VFS, "Send error in QFSUnixInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 13)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 13) {
rc = -EIO; /* bad smb */
} else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4651,7 +5309,7 @@ QFSUnixRetry:
}
int
-CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
+CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon, __u64 cap)
{
/* level 0x200 SMB_SET_CIFS_UNIX_INFO */
TRANSACTION2_SETFSI_REQ *pSMB = NULL;
@@ -4660,7 +5318,7 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
int bytes_returned = 0;
__u16 params, param_offset, offset, byte_count;
- cFYI(1, "In SETFSUnixInfo");
+ cifs_dbg(FYI, "In SETFSUnixInfo\n");
SETFSUnixRetry:
/* BB switch to small buf init to save memory */
rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
@@ -4702,13 +5360,13 @@ SETFSUnixRetry:
pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION);
pSMB->ClientUnixCap = cpu_to_le64(cap);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
+ cifs_dbg(VFS, "Send error in SETFSUnixInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
if (rc)
@@ -4725,7 +5383,7 @@ SETFSUnixRetry:
int
-CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon,
struct kstatfs *FSData)
{
/* level 0x201 SMB_QUERY_CIFS_POSIX_INFO */
@@ -4736,7 +5394,7 @@ CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
int bytes_returned = 0;
__u16 params, byte_count;
- cFYI(1, "In QFSPosixInfo");
+ cifs_dbg(FYI, "In QFSPosixInfo\n");
QFSPosixRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4764,17 +5422,17 @@ QFSPosixRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QFSUnixInfo = %d", rc);
+ cifs_dbg(FYI, "Send error in QFSUnixInfo = %d\n", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 13)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 13) {
rc = -EIO; /* bad smb */
} else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4811,16 +5469,16 @@ QFSPosixRetry:
}
-/* We can not use write of zero bytes trick to
- set file size due to need for large file support. Also note that
- this SetPathInfo is preferred to SetFileInfo based method in next
- routine which is only needed to work around a sharing violation bug
- in Samba which this routine can run into */
-
+/*
+ * We can not use write of zero bytes trick to set file size due to need for
+ * large file support. Also note that this SetPathInfo is preferred to
+ * SetFileInfo based method in next routine which is only needed to work around
+ * a sharing violation bug in Samba which this routine can run into.
+ */
int
-CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
- __u64 size, bool SetAllocation,
- const struct nls_table *nls_codepage, int remap)
+CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb,
+ bool set_allocation)
{
struct smb_com_transaction2_spi_req *pSMB = NULL;
struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
@@ -4828,9 +5486,11 @@ CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
int name_len;
int rc = 0;
int bytes_returned = 0;
+ int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+
__u16 params, byte_count, data_count, param_offset, offset;
- cFYI(1, "In SetEOF");
+ cifs_dbg(FYI, "In SetEOF\n");
SetEOFRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -4839,14 +5499,14 @@ SetEOFRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name,
+ PATH_MAX, cifs_sb->local_nls, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
- name_len = strnlen(fileName, PATH_MAX);
+ name_len = strnlen(file_name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->FileName, fileName, name_len);
+ strncpy(pSMB->FileName, file_name, name_len);
}
params = 6 + name_len;
data_count = sizeof(struct file_end_of_file_info);
@@ -4860,7 +5520,7 @@ SetEOFRetry:
param_offset = offsetof(struct smb_com_transaction2_spi_req,
InformationLevel) - 4;
offset = param_offset + params;
- if (SetAllocation) {
+ if (set_allocation) {
if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
pSMB->InformationLevel =
cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2);
@@ -4890,13 +5550,13 @@ SetEOFRetry:
pSMB->ParameterCount = cpu_to_le16(params);
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
parm_data->FileSize = cpu_to_le64(size);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "SetPathInfo (file size) returned %d", rc);
+ cifs_dbg(FYI, "SetPathInfo (file size) returned %d\n", rc);
cifs_buf_release(pSMB);
@@ -4907,23 +5567,23 @@ SetEOFRetry:
}
int
-CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
- __u16 fid, __u32 pid_of_opener, bool SetAllocation)
+CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, __u64 size, bool set_allocation)
{
struct smb_com_transaction2_sfi_req *pSMB = NULL;
struct file_end_of_file_info *parm_data;
int rc = 0;
__u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "SetFileSize (via SetFileInfo) %lld",
- (long long)size);
+ cifs_dbg(FYI, "SetFileSize (via SetFileInfo) %lld\n",
+ (long long)size);
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
if (rc)
return rc;
- pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
- pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+ pSMB->hdr.Pid = cpu_to_le16((__u16)cfile->pid);
+ pSMB->hdr.PidHigh = cpu_to_le16((__u16)(cfile->pid >> 16));
params = 6;
pSMB->MaxSetupCount = 0;
@@ -4952,8 +5612,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
+ offset);
pSMB->DataOffset = cpu_to_le16(offset);
parm_data->FileSize = cpu_to_le64(size);
- pSMB->Fid = fid;
- if (SetAllocation) {
+ pSMB->Fid = cfile->fid.netfid;
+ if (set_allocation) {
if (tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)
pSMB->InformationLevel =
cpu_to_le16(SMB_SET_FILE_ALLOCATION_INFO2);
@@ -4969,11 +5629,12 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO);
}
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc) {
- cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
+ cifs_dbg(FYI, "Send error in SetFileInfo (SetFileSize) = %d\n",
+ rc);
}
/* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -4989,7 +5650,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
time and resort to the original setpathinfo level which takes the ancient
DOS time format with 2 second granularity */
int
-CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener)
{
struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -4997,7 +5658,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
int rc = 0;
__u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "Set Times (via SetFileInfo)");
+ cifs_dbg(FYI, "Set Times (via SetFileInfo)\n");
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
if (rc)
@@ -5015,7 +5676,8 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
offset = param_offset + params;
- data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+ data_offset = (char *)pSMB +
+ offsetof(struct smb_hdr, Protocol) + offset;
count = sizeof(FILE_BASIC_INFO);
pSMB->MaxParameterCount = cpu_to_le16(2);
@@ -5037,12 +5699,13 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
else
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
- cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
+ cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
+ rc);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@ -5051,7 +5714,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
}
int
-CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon,
bool delete_file, __u16 fid, __u32 pid_of_opener)
{
struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -5059,7 +5722,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
int rc = 0;
__u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "Set File Disposition (via SetFileInfo)");
+ cifs_dbg(FYI, "Set File Disposition (via SetFileInfo)\n");
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
if (rc)
@@ -5096,18 +5759,18 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
pSMB->Fid = fid;
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
*data_offset = delete_file ? 1 : 0;
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
- cFYI(1, "Send error in SetFileDisposition = %d", rc);
+ cifs_dbg(FYI, "Send error in SetFileDisposition = %d\n", rc);
return rc;
}
int
-CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const FILE_BASIC_INFO *data,
const struct nls_table *nls_codepage, int remap)
{
@@ -5119,7 +5782,7 @@ CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
char *data_offset;
__u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "In SetTimes");
+ cifs_dbg(FYI, "In SetTimes\n");
SetTimesRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -5129,8 +5792,8 @@ SetTimesRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5169,13 +5832,13 @@ SetTimesRetry:
else
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "SetPathInfo (times) returned %d", rc);
+ cifs_dbg(FYI, "SetPathInfo (times) returned %d\n", rc);
cifs_buf_release(pSMB);
@@ -5191,7 +5854,7 @@ SetTimesRetry:
handling it anyway and NT4 was what we thought it would be needed for
Do not delete it until we prove whether needed for Win9x though */
int
-CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
+CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName,
__u16 dos_attrs, const struct nls_table *nls_codepage)
{
SETATTR_REQ *pSMB = NULL;
@@ -5200,7 +5863,7 @@ CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
int bytes_returned;
int name_len;
- cFYI(1, "In SetAttrLegacy");
+ cifs_dbg(FYI, "In SetAttrLegacy\n");
SetAttrLgcyRetry:
rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5210,8 +5873,8 @@ SetAttrLgcyRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- ConvertToUCS((__le16 *) pSMB->fileName, fileName,
- PATH_MAX, nls_codepage);
+ ConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5221,12 +5884,12 @@ SetAttrLgcyRetry:
}
pSMB->attr = cpu_to_le16(dos_attrs);
pSMB->BufferFormat = 0x04;
- pSMB->hdr.smb_buf_length += name_len + 1;
+ inc_rfc1001_len(pSMB, name_len + 1);
pSMB->ByteCount = cpu_to_le16(name_len + 1);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "Error in LegacySetAttr = %d", rc);
+ cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc);
cifs_buf_release(pSMB);
@@ -5241,13 +5904,19 @@ static void
cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
const struct cifs_unix_set_info_args *args)
{
+ u64 uid = NO_CHANGE_64, gid = NO_CHANGE_64;
u64 mode = args->mode;
+ if (uid_valid(args->uid))
+ uid = from_kuid(&init_user_ns, args->uid);
+ if (gid_valid(args->gid))
+ gid = from_kgid(&init_user_ns, args->gid);
+
/*
* Samba server ignores set of file size to zero due to bugs in some
* older clients, but we should be precise - we use SetFileSize to
* set file size and do not want to truncate file size to zero
- * accidently as happened on one Samba server beta by putting
+ * accidentally as happened on one Samba server beta by putting
* zero instead of -1 here
*/
data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
@@ -5255,8 +5924,8 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
data_offset->LastStatusChange = cpu_to_le64(args->ctime);
data_offset->LastAccessTime = cpu_to_le64(args->atime);
data_offset->LastModificationTime = cpu_to_le64(args->mtime);
- data_offset->Uid = cpu_to_le64(args->uid);
- data_offset->Gid = cpu_to_le64(args->gid);
+ data_offset->Uid = cpu_to_le64(uid);
+ data_offset->Gid = cpu_to_le64(gid);
/* better to leave device as zero when it is */
data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
@@ -5279,16 +5948,16 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
}
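The new lines above map the kernel's namespace-aware kuid_t/kgid_t values onto the wire format with from_kuid()/from_kgid(), falling back to NO_CHANGE_64 when an id is invalid. The read path needs the inverse mapping; a minimal sketch of that direction, assuming <linux/uidgid.h> and a hypothetical helper name:

	/* Sketch only: wire_uid_to_kuid() is not a function in this patch. */
	static kuid_t wire_uid_to_kuid(u64 wire_uid, kuid_t fallback)
	{
		kuid_t kuid = make_kuid(&init_user_ns, (uid_t)wire_uid);

		/* ids that do not map into the namespace fall back to a default */
		if (!uid_valid(kuid))
			kuid = fallback;
		return kuid;
	}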
int
-CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
const struct cifs_unix_set_info_args *args,
u16 fid, u32 pid_of_opener)
{
struct smb_com_transaction2_sfi_req *pSMB = NULL;
- FILE_UNIX_BASIC_INFO *data_offset;
+ char *data_offset;
int rc = 0;
u16 params, param_offset, offset, byte_count, count;
- cFYI(1, "Set Unix Info (via SetFileInfo)");
+ cifs_dbg(FYI, "Set Unix Info (via SetFileInfo)\n");
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
if (rc)
@@ -5306,8 +5975,9 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
offset = param_offset + params;
- data_offset = (FILE_UNIX_BASIC_INFO *)
- ((char *)(&pSMB->hdr.Protocol) + offset);
+ data_offset = (char *)pSMB +
+ offsetof(struct smb_hdr, Protocol) + offset;
+
count = sizeof(FILE_UNIX_BASIC_INFO);
pSMB->MaxParameterCount = cpu_to_le16(2);
@@ -5326,14 +5996,15 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
pSMB->Fid = fid;
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
- cifs_fill_unix_set_info(data_offset, args);
+ cifs_fill_unix_set_info((FILE_UNIX_BASIC_INFO *)data_offset, args);
- rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
if (rc)
- cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
+ cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
+ rc);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@ -5342,7 +6013,8 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
}
int
-CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
+CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *file_name,
const struct cifs_unix_set_info_args *args,
const struct nls_table *nls_codepage, int remap)
{
@@ -5354,7 +6026,7 @@ CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
FILE_UNIX_BASIC_INFO *data_offset;
__u16 params, param_offset, offset, count, byte_count;
- cFYI(1, "In SetUID/GID/Mode");
+ cifs_dbg(FYI, "In SetUID/GID/Mode\n");
setPermsRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -5363,14 +6035,14 @@ setPermsRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
- name_len = strnlen(fileName, PATH_MAX);
+ name_len = strnlen(file_name, PATH_MAX);
name_len++; /* trailing null */
- strncpy(pSMB->FileName, fileName, name_len);
+ strncpy(pSMB->FileName, file_name, name_len);
}
params = 6 + name_len;
@@ -5402,7 +6074,7 @@ setPermsRetry:
pSMB->TotalDataCount = pSMB->DataCount;
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
cifs_fill_unix_set_info(data_offset, args);
@@ -5410,7 +6082,7 @@ setPermsRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "SetPathInfo (perms) returned %d", rc);
+ cifs_dbg(FYI, "SetPathInfo (perms) returned %d\n", rc);
cifs_buf_release(pSMB);
if (rc == -EAGAIN)
@@ -5418,79 +6090,6 @@ setPermsRetry:
return rc;
}
-int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
- const int notify_subdirs, const __u16 netfid,
- __u32 filter, struct file *pfile, int multishot,
- const struct nls_table *nls_codepage)
-{
- int rc = 0;
- struct smb_com_transaction_change_notify_req *pSMB = NULL;
- struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
- struct dir_notify_req *dnotify_req;
- int bytes_returned;
-
- cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
- rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
- (void **) &pSMBr);
- if (rc)
- return rc;
-
- pSMB->TotalParameterCount = 0 ;
- pSMB->TotalDataCount = 0;
- pSMB->MaxParameterCount = cpu_to_le32(2);
- /* BB find exact data count max from sess structure BB */
- pSMB->MaxDataCount = 0; /* same in little endian or be */
-/* BB VERIFY verify which is correct for above BB */
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
-
- pSMB->MaxSetupCount = 4;
- pSMB->Reserved = 0;
- pSMB->ParameterOffset = 0;
- pSMB->DataCount = 0;
- pSMB->DataOffset = 0;
- pSMB->SetupCount = 4; /* single byte does not need le conversion */
- pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
- pSMB->ParameterCount = pSMB->TotalParameterCount;
- if (notify_subdirs)
- pSMB->WatchTree = 1; /* one byte - no le conversion needed */
- pSMB->Reserved2 = 0;
- pSMB->CompletionFilter = cpu_to_le32(filter);
- pSMB->Fid = netfid; /* file handle always le */
- pSMB->ByteCount = 0;
-
- rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *)pSMBr, &bytes_returned,
- CIFS_ASYNC_OP);
- if (rc) {
- cFYI(1, "Error in Notify = %d", rc);
- } else {
- /* Add file to outstanding requests */
- /* BB change to kmem cache alloc */
- dnotify_req = kmalloc(
- sizeof(struct dir_notify_req),
- GFP_KERNEL);
- if (dnotify_req) {
- dnotify_req->Pid = pSMB->hdr.Pid;
- dnotify_req->PidHigh = pSMB->hdr.PidHigh;
- dnotify_req->Mid = pSMB->hdr.Mid;
- dnotify_req->Tid = pSMB->hdr.Tid;
- dnotify_req->Uid = pSMB->hdr.Uid;
- dnotify_req->netfid = netfid;
- dnotify_req->pfile = pfile;
- dnotify_req->filter = filter;
- dnotify_req->multishot = multishot;
- spin_lock(&GlobalMid_Lock);
- list_add_tail(&dnotify_req->lhead,
- &GlobalDnotifyReqList);
- spin_unlock(&GlobalMid_Lock);
- } else
- rc = -ENOMEM;
- }
- cifs_buf_release(pSMB);
- return rc;
-}
-
#ifdef CONFIG_CIFS_XATTR
/*
* Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
@@ -5502,7 +6101,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
* the data isn't copied to it, but the length is returned.
*/
ssize_t
-CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
+CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName, const unsigned char *ea_name,
char *EAData, size_t buf_size,
const struct nls_table *nls_codepage, int remap)
@@ -5518,8 +6117,9 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
char *temp_ptr;
char *end_of_smb;
__u16 params, byte_count, data_offset;
+ unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0;
- cFYI(1, "In Query All EAs path %s", searchName);
+ cifs_dbg(FYI, "In Query All EAs path %s\n", searchName);
QAllEAsRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -5528,8 +6128,8 @@ QAllEAsRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
list_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
list_len++; /* trailing null */
list_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5560,13 +6160,13 @@ QAllEAsRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cFYI(1, "Send error in QueryAllEAs = %d", rc);
+ cifs_dbg(FYI, "Send error in QueryAllEAs = %d\n", rc);
goto QAllEAsOut;
}
@@ -5576,7 +6176,7 @@ QAllEAsRetry:
of these trans2 responses */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 4)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 4) {
rc = -EIO; /* bad smb */
goto QAllEAsOut;
}
@@ -5594,16 +6194,19 @@ QAllEAsRetry:
(((char *) &pSMBr->hdr.Protocol) + data_offset);
list_len = le32_to_cpu(ea_response_data->list_len);
- cFYI(1, "ea length %d", list_len);
+ cifs_dbg(FYI, "ea length %d\n", list_len);
if (list_len <= 8) {
- cFYI(1, "empty EA list returned from server");
+ cifs_dbg(FYI, "empty EA list returned from server\n");
+ /* didn't find the named attribute */
+ if (ea_name)
+ rc = -ENODATA;
goto QAllEAsOut;
}
/* make sure list_len doesn't go past end of SMB */
end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
if ((char *)ea_response_data + list_len > end_of_smb) {
- cFYI(1, "EA list appears to go beyond SMB");
+ cifs_dbg(FYI, "EA list appears to go beyond SMB\n");
rc = -EIO;
goto QAllEAsOut;
}
@@ -5620,7 +6223,7 @@ QAllEAsRetry:
temp_ptr += 4;
/* make sure we can read name_len and value_len */
if (list_len < 0) {
- cFYI(1, "EA entry goes beyond length of list");
+ cifs_dbg(FYI, "EA entry goes beyond length of list\n");
rc = -EIO;
goto QAllEAsOut;
}
@@ -5629,13 +6232,14 @@ QAllEAsRetry:
value_len = le16_to_cpu(temp_fea->value_len);
list_len -= name_len + 1 + value_len;
if (list_len < 0) {
- cFYI(1, "EA entry goes beyond length of list");
+ cifs_dbg(FYI, "EA entry goes beyond length of list\n");
rc = -EIO;
goto QAllEAsOut;
}
if (ea_name) {
- if (strncmp(ea_name, temp_ptr, name_len) == 0) {
+ if (ea_name_len == name_len &&
+ memcmp(ea_name, temp_ptr, name_len) == 0) {
temp_ptr += name_len + 1;
rc = value_len;
if (buf_size == 0)
@@ -5683,8 +6287,8 @@ QAllEAsOut:
}
int
-CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
- const char *ea_name, const void *ea_value,
+CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *fileName, const char *ea_name, const void *ea_value,
const __u16 ea_value_len, const struct nls_table *nls_codepage,
int remap)
{
@@ -5696,7 +6300,7 @@ CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
int bytes_returned = 0;
__u16 params, param_offset, byte_count, offset, count;
- cFYI(1, "In SetEA");
+ cifs_dbg(FYI, "In SetEA\n");
SetEARetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
(void **) &pSMBr);
@@ -5705,8 +6309,8 @@ SetEARetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5773,12 +6377,12 @@ SetEARetry:
pSMB->ParameterCount = cpu_to_le16(params);
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc)
- cFYI(1, "SetPathInfo (EA) returned %d", rc);
+ cifs_dbg(FYI, "SetPathInfo (EA) returned %d\n", rc);
cifs_buf_release(pSMB);
@@ -5787,5 +6391,94 @@ SetEARetry:
return rc;
}
-
#endif
+
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */
+/*
+ * Years ago the kernel added a "dnotify" function for Samba server,
+ * to allow network clients (such as Windows) to display updated
+ * lists of files in directory listings automatically when
+ * files are added by one user when another user has the
+ * same directory open on their desktop. The Linux cifs kernel
+ * client hooked into the kernel side of this interface for
+ * the same reason, but ironically when the VFS moved from
+ * "dnotify" to "inotify" it became harder to plug in Linux
+ * network file system clients (the most obvious use case
+ * for notify interfaces is when multiple users can update
+ * the contents of the same directory - exactly what network
+ * file systems can do) although the server (Samba) could
+ * still use it. For the short term we leave the worker
+ * function ifdeffed out (below) until inotify is fixed
+ * in the VFS to make it easier to plug in network file
+ * system clients. If inotify turns out to be permanently
+ * incompatible for network fs clients, we could instead simply
+ * expose this config flag by adding a future cifs (and smb2) notify ioctl.
+ */
+int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
+ const int notify_subdirs, const __u16 netfid,
+ __u32 filter, struct file *pfile, int multishot,
+ const struct nls_table *nls_codepage)
+{
+ int rc = 0;
+ struct smb_com_transaction_change_notify_req *pSMB = NULL;
+ struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
+ struct dir_notify_req *dnotify_req;
+ int bytes_returned;
+
+ cifs_dbg(FYI, "In CIFSSMBNotify for file handle %d\n", (int)netfid);
+ rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+ (void **) &pSMBr);
+ if (rc)
+ return rc;
+
+ pSMB->TotalParameterCount = 0 ;
+ pSMB->TotalDataCount = 0;
+ pSMB->MaxParameterCount = cpu_to_le32(2);
+ pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
+ pSMB->MaxSetupCount = 4;
+ pSMB->Reserved = 0;
+ pSMB->ParameterOffset = 0;
+ pSMB->DataCount = 0;
+ pSMB->DataOffset = 0;
+ pSMB->SetupCount = 4; /* single byte does not need le conversion */
+ pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
+ pSMB->ParameterCount = pSMB->TotalParameterCount;
+ if (notify_subdirs)
+ pSMB->WatchTree = 1; /* one byte - no le conversion needed */
+ pSMB->Reserved2 = 0;
+ pSMB->CompletionFilter = cpu_to_le32(filter);
+ pSMB->Fid = netfid; /* file handle always le */
+ pSMB->ByteCount = 0;
+
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+ (struct smb_hdr *)pSMBr, &bytes_returned,
+ CIFS_ASYNC_OP);
+ if (rc) {
+ cifs_dbg(FYI, "Error in Notify = %d\n", rc);
+ } else {
+ /* Add file to outstanding requests */
+ /* BB change to kmem cache alloc */
+ dnotify_req = kmalloc(
+ sizeof(struct dir_notify_req),
+ GFP_KERNEL);
+ if (dnotify_req) {
+ dnotify_req->Pid = pSMB->hdr.Pid;
+ dnotify_req->PidHigh = pSMB->hdr.PidHigh;
+ dnotify_req->Mid = pSMB->hdr.Mid;
+ dnotify_req->Tid = pSMB->hdr.Tid;
+ dnotify_req->Uid = pSMB->hdr.Uid;
+ dnotify_req->netfid = netfid;
+ dnotify_req->pfile = pfile;
+ dnotify_req->filter = filter;
+ dnotify_req->multishot = multishot;
+ spin_lock(&GlobalMid_Lock);
+ list_add_tail(&dnotify_req->lhead,
+ &GlobalDnotifyReqList);
+ spin_unlock(&GlobalMid_Lock);
+ } else
+ rc = -ENOMEM;
+ }
+ cifs_buf_release(pSMB);
+ return rc;
+}
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
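A change repeated throughout the cifssmb.c hunks above is replacing the open-coded pSMB->hdr.smb_buf_length += byte_count with inc_rfc1001_len(). The helper itself is defined outside this file; a minimal sketch of what it has to do, given that the RFC 1001 length prefix is a single big-endian 32-bit word at the start of the frame (the in-tree definition may differ in detail):

	/* Sketch, assuming be32_add_cpu() from the kernel byteorder helpers. */
	static inline void inc_rfc1001_len_sketch(void *buf, int count)
	{
		/* bump the big-endian length word at the head of the SMB frame */
		be32_add_cpu((__be32 *)buf, count);
	}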
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8d6c17ab593..20d75b8ddb2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,7 +1,7 @@
/*
* fs/cifs/connect.c
*
- * Copyright (C) International Business Machines Corp., 2002,2009
+ * Copyright (C) International Business Machines Corp., 2002,2011
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -37,7 +37,11 @@
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <linux/inet.h>
+#include <linux/module.h>
+#include <keys/user-type.h>
#include <net/ipv6.h>
+#include <linux/parser.h>
+
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -52,74 +56,235 @@
#define CIFS_PORT 445
#define RFC1001_PORT 139
-/* SMB echo "timeout" -- FIXME: tunable? */
-#define SMB_ECHO_INTERVAL (60 * HZ)
-
extern mempool_t *cifs_req_poolp;
-struct smb_vol {
- char *username;
- char *password;
- char *domainname;
- char *UNC;
- char *UNCip;
- char *iocharset; /* local code page for mapping to and from Unicode */
- char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
- char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
- uid_t cred_uid;
- uid_t linux_uid;
- gid_t linux_gid;
- mode_t file_mode;
- mode_t dir_mode;
- unsigned secFlg;
- bool retry:1;
- bool intr:1;
- bool setuids:1;
- bool override_uid:1;
- bool override_gid:1;
- bool dynperm:1;
- bool noperm:1;
- bool no_psx_acl:1; /* set if posix acl support should be disabled */
- bool cifs_acl:1;
- bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
- bool server_ino:1; /* use inode numbers from server ie UniqueId */
- bool direct_io:1;
- bool strict_io:1; /* strict cache behavior */
- bool remap:1; /* set to remap seven reserved chars in filenames */
- bool posix_paths:1; /* unset to not ask for posix pathnames. */
- bool no_linux_ext:1;
- bool sfu_emul:1;
- bool nullauth:1; /* attempt to authenticate with null user */
- bool nocase:1; /* request case insensitive filenames */
- bool nobrl:1; /* disable sending byte range locks to srv */
- bool mand_lock:1; /* send mandatory not posix byte range lock reqs */
- bool seal:1; /* request transport encryption on share */
- bool nodfs:1; /* Do not request DFS, even if available */
- bool local_lease:1; /* check leases only on local system, not remote */
- bool noblocksnd:1;
- bool noautotune:1;
- bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
- bool fsc:1; /* enable fscache */
- bool mfsymlinks:1; /* use Minshall+French Symlinks */
- bool multiuser:1;
- unsigned int rsize;
- unsigned int wsize;
- bool sockopt_tcp_nodelay:1;
- unsigned short int port;
- unsigned long actimeo; /* attribute cache timeout (jiffies) */
- char *prepath;
- struct sockaddr_storage srcaddr; /* allow binding to a local IP */
- struct nls_table *local_nls;
-};
-
/* FIXME: should these be tunable? */
#define TLINK_ERROR_EXPIRE (1 * HZ)
#define TLINK_IDLE_EXPIRE (600 * HZ)
+enum {
+
+ /* Mount options that take no arguments */
+ Opt_user_xattr, Opt_nouser_xattr,
+ Opt_forceuid, Opt_noforceuid,
+ Opt_forcegid, Opt_noforcegid,
+ Opt_noblocksend, Opt_noautotune,
+ Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
+ Opt_mapchars, Opt_nomapchars, Opt_sfu,
+ Opt_nosfu, Opt_nodfs, Opt_posixpaths,
+ Opt_noposixpaths, Opt_nounix,
+ Opt_nocase,
+ Opt_brl, Opt_nobrl,
+ Opt_forcemandatorylock, Opt_setuids,
+ Opt_nosetuids, Opt_dynperm, Opt_nodynperm,
+ Opt_nohard, Opt_nosoft,
+ Opt_nointr, Opt_intr,
+ Opt_nostrictsync, Opt_strictsync,
+ Opt_serverino, Opt_noserverino,
+ Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl,
+ Opt_acl, Opt_noacl, Opt_locallease,
+ Opt_sign, Opt_seal, Opt_noac,
+ Opt_fsc, Opt_mfsymlinks,
+ Opt_multiuser, Opt_sloppy, Opt_nosharesock,
+
+ /* Mount options which take numeric value */
+ Opt_backupuid, Opt_backupgid, Opt_uid,
+ Opt_cruid, Opt_gid, Opt_file_mode,
+ Opt_dirmode, Opt_port,
+ Opt_rsize, Opt_wsize, Opt_actimeo,
+
+ /* Mount options which take string value */
+ Opt_user, Opt_pass, Opt_ip,
+ Opt_domain, Opt_srcaddr, Opt_iocharset,
+ Opt_netbiosname, Opt_servern,
+ Opt_ver, Opt_vers, Opt_sec, Opt_cache,
+
+ /* Mount options to be ignored */
+ Opt_ignore,
+
+ /* Options which could be blank */
+ Opt_blank_pass,
+ Opt_blank_user,
+ Opt_blank_ip,
+
+ Opt_err
+};
+
+static const match_table_t cifs_mount_option_tokens = {
+
+ { Opt_user_xattr, "user_xattr" },
+ { Opt_nouser_xattr, "nouser_xattr" },
+ { Opt_forceuid, "forceuid" },
+ { Opt_noforceuid, "noforceuid" },
+ { Opt_forcegid, "forcegid" },
+ { Opt_noforcegid, "noforcegid" },
+ { Opt_noblocksend, "noblocksend" },
+ { Opt_noautotune, "noautotune" },
+ { Opt_hard, "hard" },
+ { Opt_soft, "soft" },
+ { Opt_perm, "perm" },
+ { Opt_noperm, "noperm" },
+ { Opt_mapchars, "mapchars" },
+ { Opt_nomapchars, "nomapchars" },
+ { Opt_sfu, "sfu" },
+ { Opt_nosfu, "nosfu" },
+ { Opt_nodfs, "nodfs" },
+ { Opt_posixpaths, "posixpaths" },
+ { Opt_noposixpaths, "noposixpaths" },
+ { Opt_nounix, "nounix" },
+ { Opt_nounix, "nolinux" },
+ { Opt_nocase, "nocase" },
+ { Opt_nocase, "ignorecase" },
+ { Opt_brl, "brl" },
+ { Opt_nobrl, "nobrl" },
+ { Opt_nobrl, "nolock" },
+ { Opt_forcemandatorylock, "forcemandatorylock" },
+ { Opt_forcemandatorylock, "forcemand" },
+ { Opt_setuids, "setuids" },
+ { Opt_nosetuids, "nosetuids" },
+ { Opt_dynperm, "dynperm" },
+ { Opt_nodynperm, "nodynperm" },
+ { Opt_nohard, "nohard" },
+ { Opt_nosoft, "nosoft" },
+ { Opt_nointr, "nointr" },
+ { Opt_intr, "intr" },
+ { Opt_nostrictsync, "nostrictsync" },
+ { Opt_strictsync, "strictsync" },
+ { Opt_serverino, "serverino" },
+ { Opt_noserverino, "noserverino" },
+ { Opt_rwpidforward, "rwpidforward" },
+ { Opt_cifsacl, "cifsacl" },
+ { Opt_nocifsacl, "nocifsacl" },
+ { Opt_acl, "acl" },
+ { Opt_noacl, "noacl" },
+ { Opt_locallease, "locallease" },
+ { Opt_sign, "sign" },
+ { Opt_seal, "seal" },
+ { Opt_noac, "noac" },
+ { Opt_fsc, "fsc" },
+ { Opt_mfsymlinks, "mfsymlinks" },
+ { Opt_multiuser, "multiuser" },
+ { Opt_sloppy, "sloppy" },
+ { Opt_nosharesock, "nosharesock" },
+
+ { Opt_backupuid, "backupuid=%s" },
+ { Opt_backupgid, "backupgid=%s" },
+ { Opt_uid, "uid=%s" },
+ { Opt_cruid, "cruid=%s" },
+ { Opt_gid, "gid=%s" },
+ { Opt_file_mode, "file_mode=%s" },
+ { Opt_dirmode, "dirmode=%s" },
+ { Opt_dirmode, "dir_mode=%s" },
+ { Opt_port, "port=%s" },
+ { Opt_rsize, "rsize=%s" },
+ { Opt_wsize, "wsize=%s" },
+ { Opt_actimeo, "actimeo=%s" },
+
+ { Opt_blank_user, "user=" },
+ { Opt_blank_user, "username=" },
+ { Opt_user, "user=%s" },
+ { Opt_user, "username=%s" },
+ { Opt_blank_pass, "pass=" },
+ { Opt_blank_pass, "password=" },
+ { Opt_pass, "pass=%s" },
+ { Opt_pass, "password=%s" },
+ { Opt_blank_ip, "ip=" },
+ { Opt_blank_ip, "addr=" },
+ { Opt_ip, "ip=%s" },
+ { Opt_ip, "addr=%s" },
+ { Opt_ignore, "unc=%s" },
+ { Opt_ignore, "target=%s" },
+ { Opt_ignore, "path=%s" },
+ { Opt_domain, "dom=%s" },
+ { Opt_domain, "domain=%s" },
+ { Opt_domain, "workgroup=%s" },
+ { Opt_srcaddr, "srcaddr=%s" },
+ { Opt_ignore, "prefixpath=%s" },
+ { Opt_iocharset, "iocharset=%s" },
+ { Opt_netbiosname, "netbiosname=%s" },
+ { Opt_servern, "servern=%s" },
+ { Opt_ver, "ver=%s" },
+ { Opt_vers, "vers=%s" },
+ { Opt_sec, "sec=%s" },
+ { Opt_cache, "cache=%s" },
+
+ { Opt_ignore, "cred" },
+ { Opt_ignore, "credentials" },
+ { Opt_ignore, "cred=%s" },
+ { Opt_ignore, "credentials=%s" },
+ { Opt_ignore, "guest" },
+ { Opt_ignore, "rw" },
+ { Opt_ignore, "ro" },
+ { Opt_ignore, "suid" },
+ { Opt_ignore, "nosuid" },
+ { Opt_ignore, "exec" },
+ { Opt_ignore, "noexec" },
+ { Opt_ignore, "nodev" },
+ { Opt_ignore, "noauto" },
+ { Opt_ignore, "dev" },
+ { Opt_ignore, "mand" },
+ { Opt_ignore, "nomand" },
+ { Opt_ignore, "_netdev" },
+
+ { Opt_err, NULL }
+};
+
+enum {
+ Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p,
+ Opt_sec_ntlmsspi, Opt_sec_ntlmssp,
+ Opt_ntlm, Opt_sec_ntlmi, Opt_sec_ntlmv2,
+ Opt_sec_ntlmv2i, Opt_sec_lanman,
+ Opt_sec_none,
+
+ Opt_sec_err
+};
+
+static const match_table_t cifs_secflavor_tokens = {
+ { Opt_sec_krb5, "krb5" },
+ { Opt_sec_krb5i, "krb5i" },
+ { Opt_sec_krb5p, "krb5p" },
+ { Opt_sec_ntlmsspi, "ntlmsspi" },
+ { Opt_sec_ntlmssp, "ntlmssp" },
+ { Opt_ntlm, "ntlm" },
+ { Opt_sec_ntlmi, "ntlmi" },
+ { Opt_sec_ntlmv2, "nontlm" },
+ { Opt_sec_ntlmv2, "ntlmv2" },
+ { Opt_sec_ntlmv2i, "ntlmv2i" },
+ { Opt_sec_lanman, "lanman" },
+ { Opt_sec_none, "none" },
+
+ { Opt_sec_err, NULL }
+};
+
+/* cache flavors */
+enum {
+ Opt_cache_loose,
+ Opt_cache_strict,
+ Opt_cache_none,
+ Opt_cache_err
+};
+
+static const match_table_t cifs_cacheflavor_tokens = {
+ { Opt_cache_loose, "loose" },
+ { Opt_cache_strict, "strict" },
+ { Opt_cache_none, "none" },
+ { Opt_cache_err, NULL }
+};
+
+static const match_table_t cifs_smb_version_tokens = {
+ { Smb_1, SMB1_VERSION_STRING },
+ { Smb_20, SMB20_VERSION_STRING},
+ { Smb_21, SMB21_VERSION_STRING },
+ { Smb_30, SMB30_VERSION_STRING },
+ { Smb_302, SMB302_VERSION_STRING },
+};
+
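The arrays above are match_table_t tables for the generic token parser in <linux/parser.h>, which this diff adds to the include list. The parser that walks the comma-separated mount string is outside this hunk; a minimal sketch of the usual idiom, with a hypothetical helper name and only a couple of tokens handled:

	/* Sketch only: the real option parser handles every token listed above. */
	static void parse_mount_opts_sketch(char *options, struct smb_vol *vol)
	{
		substring_t args[MAX_OPT_ARGS];
		char *p;

		while ((p = strsep(&options, ",")) != NULL) {
			if (!*p)
				continue;	/* skip empty fields from ",," */

			switch (match_token(p, cifs_mount_option_tokens, args)) {
			case Opt_noperm:
				vol->noperm = 1;
				break;
			case Opt_nobrl:
				vol->nobrl = 1;
				break;
			default:
				break;
			}
		}
	}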
static int ip_connect(struct TCP_Server_Info *server);
static int generic_ip_connect(struct TCP_Server_Info *server);
static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
static void cifs_prune_tlinks(struct work_struct *work);
+static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
+ const char *devname);
/*
* cifs tcp session reconnection
@@ -129,14 +294,15 @@ static void cifs_prune_tlinks(struct work_struct *work);
* reconnect tcp session
* wake up waiters on reconnection? - (not needed currently)
*/
-static int
+int
cifs_reconnect(struct TCP_Server_Info *server)
{
int rc = 0;
struct list_head *tmp, *tmp2;
- struct cifsSesInfo *ses;
- struct cifsTconInfo *tcon;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
struct mid_q_entry *mid_entry;
+ struct list_head retry_list;
spin_lock(&GlobalMid_Lock);
if (server->tcpStatus == CifsExiting) {
@@ -148,34 +314,37 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->tcpStatus = CifsNeedReconnect;
spin_unlock(&GlobalMid_Lock);
server->maxBuf = 0;
+#ifdef CONFIG_CIFS_SMB2
+ server->max_read = 0;
+#endif
- cFYI(1, "Reconnecting tcp session");
+ cifs_dbg(FYI, "Reconnecting tcp session\n");
/* before reconnecting the tcp session, mark the smb session (uid)
and the tid bad so they are not used until reconnected */
- cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
+ cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n",
+ __func__);
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+ ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
ses->need_reconnect = true;
ses->ipc_tid = 0;
list_for_each(tmp2, &ses->tcon_list) {
- tcon = list_entry(tmp2, struct cifsTconInfo, tcon_list);
+ tcon = list_entry(tmp2, struct cifs_tcon, tcon_list);
tcon->need_reconnect = true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
/* do not want to be sending data on a socket we are freeing */
- cFYI(1, "%s: tearing down socket", __func__);
+ cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
mutex_lock(&server->srv_mutex);
if (server->ssocket) {
- cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
- server->ssocket->flags);
+ cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n",
+ server->ssocket->state, server->ssocket->flags);
kernel_sock_shutdown(server->ssocket, SHUT_WR);
- cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
- server->ssocket->state,
- server->ssocket->flags);
+ cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n",
+ server->ssocket->state, server->ssocket->flags);
sock_release(server->ssocket);
server->ssocket = NULL;
}
@@ -188,507 +357,338 @@ cifs_reconnect(struct TCP_Server_Info *server)
mutex_unlock(&server->srv_mutex);
/* mark submitted MIDs for retry and issue callback */
- cFYI(1, "%s: issuing mid callbacks", __func__);
+ INIT_LIST_HEAD(&retry_list);
+ cifs_dbg(FYI, "%s: moving mids to private list\n", __func__);
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- if (mid_entry->midState == MID_REQUEST_SUBMITTED)
- mid_entry->midState = MID_RETRY_NEEDED;
+ if (mid_entry->mid_state == MID_REQUEST_SUBMITTED)
+ mid_entry->mid_state = MID_RETRY_NEEDED;
+ list_move(&mid_entry->qhead, &retry_list);
+ }
+ spin_unlock(&GlobalMid_Lock);
+
+ cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
+ list_for_each_safe(tmp, tmp2, &retry_list) {
+ mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
}
- spin_unlock(&GlobalMid_Lock);
- while ((server->tcpStatus != CifsExiting) &&
- (server->tcpStatus != CifsGood)) {
+ do {
try_to_freeze();
/* we should try only the port we connected to before */
+ mutex_lock(&server->srv_mutex);
rc = generic_ip_connect(server);
if (rc) {
- cFYI(1, "reconnect error %d", rc);
+ cifs_dbg(FYI, "reconnect error %d\n", rc);
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
spin_lock(&GlobalMid_Lock);
if (server->tcpStatus != CifsExiting)
- server->tcpStatus = CifsGood;
+ server->tcpStatus = CifsNeedNegotiate;
spin_unlock(&GlobalMid_Lock);
}
- }
+ mutex_unlock(&server->srv_mutex);
+ } while (server->tcpStatus == CifsNeedReconnect);
return rc;
}
-/*
- return codes:
- 0 not a transact2, or all data present
- >0 transact2 with that much data missing
- -EINVAL = invalid transact2
-
- */
-static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
+static void
+cifs_echo_request(struct work_struct *work)
{
- struct smb_t2_rsp *pSMBt;
- int remaining;
- __u16 total_data_size, data_in_this_rsp;
-
- if (pSMB->Command != SMB_COM_TRANSACTION2)
- return 0;
+ int rc;
+ struct TCP_Server_Info *server = container_of(work,
+ struct TCP_Server_Info, echo.work);
- /* check for plausible wct, bcc and t2 data and parm sizes */
- /* check for parm and data offset going beyond end of smb */
- if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
- cFYI(1, "invalid transact2 word count");
- return -EINVAL;
- }
+ /*
+ * We cannot send an echo if it is disabled or until the
+ * NEGOTIATE_PROTOCOL request is done, which is indicated by
+ * server->ops->need_neg() == true. Also, no need to ping if
+ * we got a response recently.
+ */
+ if (!server->ops->need_neg || server->ops->need_neg(server) ||
+ (server->ops->can_echo && !server->ops->can_echo(server)) ||
+ time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+ goto requeue_echo;
- pSMBt = (struct smb_t2_rsp *)pSMB;
+ rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS;
+ if (rc)
+ cifs_dbg(FYI, "Unable to send echo request to server: %s\n",
+ server->hostname);
- total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
- data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+requeue_echo:
+ queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL);
+}
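cifs_echo_request() re-queues itself on cifsiod_wq at the end of every pass; the first arming of server->echo happens where the TCP server structure is created, which is not part of this hunk. A sketch of that setup pairing, using the same workqueue and interval as above:

	/* Sketch only: done once per server when TCP_Server_Info is set up. */
	static void arm_echo_sketch(struct TCP_Server_Info *server)
	{
		INIT_DELAYED_WORK(&server->echo, cifs_echo_request);
		queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL);
	}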
- remaining = total_data_size - data_in_this_rsp;
+static bool
+allocate_buffers(struct TCP_Server_Info *server)
+{
+ if (!server->bigbuf) {
+ server->bigbuf = (char *)cifs_buf_get();
+ if (!server->bigbuf) {
+ cifs_dbg(VFS, "No memory for large SMB response\n");
+ msleep(3000);
+ /* retry will check if exiting */
+ return false;
+ }
+ } else if (server->large_buf) {
+ /* we are reusing a dirty large buf, clear its start */
+ memset(server->bigbuf, 0, HEADER_SIZE(server));
+ }
- if (remaining == 0)
- return 0;
- else if (remaining < 0) {
- cFYI(1, "total data %d smaller than data in frame %d",
- total_data_size, data_in_this_rsp);
- return -EINVAL;
- } else {
- cFYI(1, "missing %d bytes from transact2, check next response",
- remaining);
- if (total_data_size > maxBufSize) {
- cERROR(1, "TotalDataSize %d is over maximum buffer %d",
- total_data_size, maxBufSize);
- return -EINVAL;
+ if (!server->smallbuf) {
+ server->smallbuf = (char *)cifs_small_buf_get();
+ if (!server->smallbuf) {
+ cifs_dbg(VFS, "No memory for SMB response\n");
+ msleep(1000);
+ /* retry will check if exiting */
+ return false;
}
- return remaining;
+ /* beginning of smb buffer is cleared in our buf_get */
+ } else {
+ /* if existing small buf clear beginning */
+ memset(server->smallbuf, 0, HEADER_SIZE(server));
}
+
+ return true;
}
-static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
+static bool
+server_unresponsive(struct TCP_Server_Info *server)
{
- struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
- struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
- char *data_area_of_target;
- char *data_area_of_buf2;
- int remaining;
- __u16 byte_count, total_data_size, total_in_buf, total_in_buf2;
-
- total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
-
- if (total_data_size !=
- get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
- cFYI(1, "total data size of primary and secondary t2 differ");
-
- total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
-
- remaining = total_data_size - total_in_buf;
-
- if (remaining < 0)
- return -EINVAL;
-
- if (remaining == 0) /* nothing to do, ignore */
- return 0;
-
- total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
- if (remaining < total_in_buf2) {
- cFYI(1, "transact2 2nd response contains too much data");
+ /*
+ * We need to wait 2 echo intervals to make sure we handle such
+ * situations right:
+ * 1s client sends a normal SMB request
+ * 2s client gets a response
+ * 30s echo workqueue job pops, and decides we got a response recently
+ * and don't need to send another
+ * ...
+ * 65s kernel_recvmsg times out, and we see that we haven't gotten
+ * a response in >60s.
+ */
+ if (server->tcpStatus == CifsGood &&
+ time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
+ cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n",
+ server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ);
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ return true;
}
- /* find end of first SMB data area */
- data_area_of_target = (char *)&pSMBt->hdr.Protocol +
- get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
- /* validate target area */
-
- data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
- get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
-
- data_area_of_target += total_in_buf;
-
- /* copy second buffer into end of first buffer */
- memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
- total_in_buf += total_in_buf2;
- put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
- byte_count = get_bcc_le(pTargetSMB);
- byte_count += total_in_buf2;
- put_bcc_le(byte_count, pTargetSMB);
-
- byte_count = pTargetSMB->smb_buf_length;
- byte_count += total_in_buf2;
+ return false;
+}
- /* BB also add check that we are not beyond maximum buffer size */
+/*
+ * kvec_array_init - clone a kvec array, and advance into it
+ * @new: pointer to memory for cloned array
+ * @iov: pointer to original array
+ * @nr_segs: number of members in original array
+ * @bytes: number of bytes to advance into the cloned array
+ *
+ * This function will copy the array provided in iov to a section of memory
+ * and advance the specified number of bytes into the new array. It returns
+ * the number of segments in the new array. "new" must be at least as big as
+ * the original iov array.
+ */
+static unsigned int
+kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs,
+ size_t bytes)
+{
+ size_t base = 0;
- pTargetSMB->smb_buf_length = byte_count;
+ while (bytes || !iov->iov_len) {
+ int copy = min(bytes, iov->iov_len);
- if (remaining == total_in_buf2) {
- cFYI(1, "found the last secondary response");
- return 0; /* we are done */
- } else /* more responses to go */
- return 1;
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ memcpy(new, iov, sizeof(*iov) * nr_segs);
+ new->iov_base += base;
+ new->iov_len -= base;
+ return nr_segs;
}
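A worked example of the resume semantics described in the comment block above, with hypothetical buffer names:

	/*
	 * Suppose the caller's original array was
	 *	iov[0] = { .iov_base = hdr,  .iov_len = 4    };
	 *	iov[1] = { .iov_base = body, .iov_len = 1024 };
	 * and 6 bytes have already arrived. Then
	 *	segs = kvec_array_init(new, iov, 2, 6);
	 * returns segs == 1 with new[0] == { body + 2, 1022 }, so the next
	 * kernel_recvmsg() call resumes exactly where the previous one stopped.
	 */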
-static void
-cifs_echo_request(struct work_struct *work)
+static struct kvec *
+get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs)
{
- int rc;
- struct TCP_Server_Info *server = container_of(work,
- struct TCP_Server_Info, echo.work);
+ struct kvec *new_iov;
- /*
- * We cannot send an echo until the NEGOTIATE_PROTOCOL request is
- * done, which is indicated by maxBuf != 0. Also, no need to ping if
- * we got a response recently
- */
- if (server->maxBuf == 0 ||
- time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
- goto requeue_echo;
-
- rc = CIFSSMBEcho(server);
- if (rc)
- cFYI(1, "Unable to send echo request to server: %s",
- server->hostname);
+ if (server->iov && nr_segs <= server->nr_iov)
+ return server->iov;
-requeue_echo:
- queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
+ /* not big enough -- allocate a new one and release the old */
+ new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS);
+ if (new_iov) {
+ kfree(server->iov);
+ server->iov = new_iov;
+ server->nr_iov = nr_segs;
+ }
+ return new_iov;
}
-static int
-cifs_demultiplex_thread(struct TCP_Server_Info *server)
+int
+cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
+ unsigned int nr_segs, unsigned int to_read)
{
- int length;
- unsigned int pdu_length, total_read;
- struct smb_hdr *smb_buffer = NULL;
- struct smb_hdr *bigbuf = NULL;
- struct smb_hdr *smallbuf = NULL;
+ int length = 0;
+ int total_read;
+ unsigned int segs;
struct msghdr smb_msg;
- struct kvec iov;
- struct socket *csocket = server->ssocket;
- struct list_head *tmp, *tmp2;
- struct task_struct *task_to_wake = NULL;
- struct mid_q_entry *mid_entry;
- char temp;
- bool isLargeBuf = false;
- bool isMultiRsp;
- int reconnect;
+ struct kvec *iov;
- current->flags |= PF_MEMALLOC;
- cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
-
- length = atomic_inc_return(&tcpSesAllocCount);
- if (length > 1)
- mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
- GFP_KERNEL);
+ iov = get_server_iovec(server, nr_segs);
+ if (!iov)
+ return -ENOMEM;
- set_freezable();
- while (server->tcpStatus != CifsExiting) {
- if (try_to_freeze())
- continue;
- if (bigbuf == NULL) {
- bigbuf = cifs_buf_get();
- if (!bigbuf) {
- cERROR(1, "No memory for large SMB response");
- msleep(3000);
- /* retry will check if exiting */
- continue;
- }
- } else if (isLargeBuf) {
- /* we are reusing a dirty large buf, clear its start */
- memset(bigbuf, 0, sizeof(struct smb_hdr));
- }
+ smb_msg.msg_control = NULL;
+ smb_msg.msg_controllen = 0;
- if (smallbuf == NULL) {
- smallbuf = cifs_small_buf_get();
- if (!smallbuf) {
- cERROR(1, "No memory for SMB response");
- msleep(1000);
- /* retry will check if exiting */
- continue;
- }
- /* beginning of smb buffer is cleared in our buf_get */
- } else /* if existing small buf clear beginning */
- memset(smallbuf, 0, sizeof(struct smb_hdr));
-
- isLargeBuf = false;
- isMultiRsp = false;
- smb_buffer = smallbuf;
- iov.iov_base = smb_buffer;
- iov.iov_len = 4;
- smb_msg.msg_control = NULL;
- smb_msg.msg_controllen = 0;
- pdu_length = 4; /* enough to get RFC1001 header */
+ for (total_read = 0; to_read; total_read += length, to_read -= length) {
+ try_to_freeze();
-incomplete_rcv:
- if (echo_retries > 0 &&
- time_after(jiffies, server->lstrp +
- (echo_retries * SMB_ECHO_INTERVAL))) {
- cERROR(1, "Server %s has not responded in %d seconds. "
- "Reconnecting...", server->hostname,
- (echo_retries * SMB_ECHO_INTERVAL / HZ));
- cifs_reconnect(server);
- csocket = server->ssocket;
- wake_up(&server->response_q);
- continue;
+ if (server_unresponsive(server)) {
+ total_read = -EAGAIN;
+ break;
}
- length =
- kernel_recvmsg(csocket, &smb_msg,
- &iov, 1, pdu_length, 0 /* BB other flags? */);
+ segs = kvec_array_init(iov, iov_orig, nr_segs, total_read);
+
+ length = kernel_recvmsg(server->ssocket, &smb_msg,
+ iov, segs, to_read, 0);
if (server->tcpStatus == CifsExiting) {
+ total_read = -ESHUTDOWN;
break;
} else if (server->tcpStatus == CifsNeedReconnect) {
- cFYI(1, "Reconnect after server stopped responding");
cifs_reconnect(server);
- cFYI(1, "call to reconnect done");
- csocket = server->ssocket;
- continue;
+ total_read = -EAGAIN;
+ break;
} else if (length == -ERESTARTSYS ||
length == -EAGAIN ||
length == -EINTR) {
- msleep(1); /* minimum sleep to prevent looping
- allowing socket to clear and app threads to set
- tcpStatus CifsNeedReconnect if server hung */
- if (pdu_length < 4) {
- iov.iov_base = (4 - pdu_length) +
- (char *)smb_buffer;
- iov.iov_len = pdu_length;
- smb_msg.msg_control = NULL;
- smb_msg.msg_controllen = 0;
- goto incomplete_rcv;
- } else
- continue;
- } else if (length <= 0) {
- cFYI(1, "Reconnect after unexpected peek error %d",
- length);
- cifs_reconnect(server);
- csocket = server->ssocket;
- wake_up(&server->response_q);
- continue;
- } else if (length < pdu_length) {
- cFYI(1, "requested %d bytes but only got %d bytes",
- pdu_length, length);
- pdu_length -= length;
- msleep(1);
- goto incomplete_rcv;
- }
-
- /* The right amount was read from socket - 4 bytes */
- /* so we can now interpret the length field */
-
- /* the first byte big endian of the length field,
- is actually not part of the length but the type
- with the most common, zero, as regular data */
- temp = *((char *) smb_buffer);
-
- /* Note that FC 1001 length is big endian on the wire,
- but we convert it here so it is always manipulated
- as host byte order */
- pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
- smb_buffer->smb_buf_length = pdu_length;
-
- cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
-
- if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
- continue;
- } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
- cFYI(1, "Good RFC 1002 session rsp");
- continue;
- } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
- /* we get this from Windows 98 instead of
- an error on SMB negprot response */
- cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
- pdu_length);
- /* give server a second to clean up */
- msleep(1000);
- /* always try 445 first on reconnect since we get NACK
- * on some if we ever connected to port 139 (the NACK
- * is since we do not begin with RFC1001 session
- * initialize frame)
+ /*
+ * Minimum sleep to prevent looping, allowing socket
+ * to clear and app threads to set tcpStatus
+ * CifsNeedReconnect if server hung.
*/
- cifs_set_port((struct sockaddr *)
- &server->dstaddr, CIFS_PORT);
- cifs_reconnect(server);
- csocket = server->ssocket;
- wake_up(&server->response_q);
- continue;
- } else if (temp != (char) 0) {
- cERROR(1, "Unknown RFC 1002 frame");
- cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
- length);
- cifs_reconnect(server);
- csocket = server->ssocket;
+ usleep_range(1000, 2000);
+ length = 0;
continue;
- }
-
- /* else we have an SMB response */
- if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
- (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
- cERROR(1, "Invalid size SMB length %d pdu_length %d",
- length, pdu_length+4);
+ } else if (length <= 0) {
+ cifs_dbg(FYI, "Received no data or error: expecting %d, got %d\n",
+ to_read, length);
cifs_reconnect(server);
- csocket = server->ssocket;
- wake_up(&server->response_q);
- continue;
+ total_read = -EAGAIN;
+ break;
}
+ }
+ return total_read;
+}
- /* else length ok */
- reconnect = 0;
-
- if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
- isLargeBuf = true;
- memcpy(bigbuf, smallbuf, 4);
- smb_buffer = bigbuf;
- }
- length = 0;
- iov.iov_base = 4 + (char *)smb_buffer;
- iov.iov_len = pdu_length;
- for (total_read = 0; total_read < pdu_length;
- total_read += length) {
- length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
- pdu_length - total_read, 0);
- if (server->tcpStatus == CifsExiting) {
- /* then will exit */
- reconnect = 2;
- break;
- } else if (server->tcpStatus == CifsNeedReconnect) {
- cifs_reconnect(server);
- csocket = server->ssocket;
- /* Reconnect wakes up rspns q */
- /* Now we will reread sock */
- reconnect = 1;
- break;
- } else if (length == -ERESTARTSYS ||
- length == -EAGAIN ||
- length == -EINTR) {
- msleep(1); /* minimum sleep to prevent looping,
- allowing socket to clear and app
- threads to set tcpStatus
- CifsNeedReconnect if server hung*/
- length = 0;
- continue;
- } else if (length <= 0) {
- cERROR(1, "Received no data, expecting %d",
- pdu_length - total_read);
- cifs_reconnect(server);
- csocket = server->ssocket;
- reconnect = 1;
- break;
- }
- }
- if (reconnect == 2)
- break;
- else if (reconnect == 1)
- continue;
+int
+cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+ unsigned int to_read)
+{
+ struct kvec iov;
- total_read += 4; /* account for rfc1002 hdr */
+ iov.iov_base = buf;
+ iov.iov_len = to_read;
- dump_smb(smb_buffer, total_read);
+ return cifs_readv_from_socket(server, &iov, 1, to_read);
+}
+static bool
+is_smb_response(struct TCP_Server_Info *server, unsigned char type)
+{
+ /*
+ * The first byte big endian of the length field,
+ * is actually not part of the length but the type
+ * with the most common, zero, as regular data.
+ */
+ switch (type) {
+ case RFC1002_SESSION_MESSAGE:
+ /* Regular SMB response */
+ return true;
+ case RFC1002_SESSION_KEEP_ALIVE:
+ cifs_dbg(FYI, "RFC 1002 session keep alive\n");
+ break;
+ case RFC1002_POSITIVE_SESSION_RESPONSE:
+ cifs_dbg(FYI, "RFC 1002 positive session response\n");
+ break;
+ case RFC1002_NEGATIVE_SESSION_RESPONSE:
/*
- * We know that we received enough to get to the MID as we
- * checked the pdu_length earlier. Now check to see
- * if the rest of the header is OK. We borrow the length
- * var for the rest of the loop to avoid a new stack var.
- *
- * 48 bytes is enough to display the header and a little bit
- * into the payload for debugging purposes.
+ * We get this from Windows 98 instead of an error on
+ * SMB negprot response.
*/
- length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
- if (length != 0)
- cifs_dump_mem("Bad SMB: ", smb_buffer,
- min_t(unsigned int, total_read, 48));
-
- mid_entry = NULL;
- server->lstrp = jiffies;
+ cifs_dbg(FYI, "RFC 1002 negative session response\n");
+ /* give server a second to clean up */
+ msleep(1000);
+ /*
+ * Always try 445 first on reconnect since we get NACK
+ * on some if we ever connected to port 139 (the NACK
+ * is since we do not begin with RFC1001 session
+ * initialize frame).
+ */
+ cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ break;
+ default:
+ cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type);
+ cifs_reconnect(server);
+ }
- spin_lock(&GlobalMid_Lock);
- list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
- mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+ return false;
+}
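is_smb_response() only examines the type byte; the rest of the 4-byte RFC 1002 prefix carries the frame length, which the read loop extracts separately. A sketch of that extraction as this transport treats the header (type in the top byte, length in the low 24 bits):

	/* Sketch only: buf points at the first byte of the received frame. */
	static unsigned int rfc1002_length_sketch(void *buf)
	{
		/* top byte is the frame type checked above; mask it off */
		return be32_to_cpu(*(__be32 *)buf) & 0x00FFFFFF;
	}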
- if ((mid_entry->mid == smb_buffer->Mid) &&
- (mid_entry->midState == MID_REQUEST_SUBMITTED) &&
- (mid_entry->command == smb_buffer->Command)) {
- if (length == 0 &&
- check2ndT2(smb_buffer, server->maxBuf) > 0) {
- /* We have a multipart transact2 resp */
- isMultiRsp = true;
- if (mid_entry->resp_buf) {
- /* merge response - fix up 1st*/
- if (coalesce_t2(smb_buffer,
- mid_entry->resp_buf)) {
- mid_entry->multiRsp =
- true;
- break;
- } else {
- /* all parts received */
- mid_entry->multiEnd =
- true;
- goto multi_t2_fnd;
- }
- } else {
- if (!isLargeBuf) {
- cERROR(1, "1st trans2 resp needs bigbuf");
- /* BB maybe we can fix this up, switch
- to already allocated large buffer? */
- } else {
- /* Have first buffer */
- mid_entry->resp_buf =
- smb_buffer;
- mid_entry->largeBuf =
- true;
- bigbuf = NULL;
- }
- }
- break;
- }
- mid_entry->resp_buf = smb_buffer;
- mid_entry->largeBuf = isLargeBuf;
-multi_t2_fnd:
- if (length == 0)
- mid_entry->midState =
- MID_RESPONSE_RECEIVED;
- else
- mid_entry->midState =
- MID_RESPONSE_MALFORMED;
+void
+dequeue_mid(struct mid_q_entry *mid, bool malformed)
+{
#ifdef CONFIG_CIFS_STATS2
- mid_entry->when_received = jiffies;
+ mid->when_received = jiffies;
#endif
- list_del_init(&mid_entry->qhead);
- mid_entry->callback(mid_entry);
- break;
- }
- mid_entry = NULL;
- }
- spin_unlock(&GlobalMid_Lock);
+ spin_lock(&GlobalMid_Lock);
+ if (!malformed)
+ mid->mid_state = MID_RESPONSE_RECEIVED;
+ else
+ mid->mid_state = MID_RESPONSE_MALFORMED;
+ list_del_init(&mid->qhead);
+ spin_unlock(&GlobalMid_Lock);
+}
- if (mid_entry != NULL) {
- /* Was previous buf put in mpx struct for multi-rsp? */
- if (!isMultiRsp) {
- /* smb buffer will be freed by user thread */
- if (isLargeBuf)
- bigbuf = NULL;
- else
- smallbuf = NULL;
- }
- } else if (length != 0) {
- /* response sanity checks failed */
- continue;
- } else if (!is_valid_oplock_break(smb_buffer, server) &&
- !isMultiRsp) {
- cERROR(1, "No task to wake, unknown frame received! "
- "NumMids %d", atomic_read(&midCount));
- cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
- sizeof(struct smb_hdr));
-#ifdef CONFIG_CIFS_DEBUG2
- cifs_dump_detail(smb_buffer);
- cifs_dump_mids(server);
-#endif /* CIFS_DEBUG2 */
+static void
+handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+ char *buf, int malformed)
+{
+ if (server->ops->check_trans2 &&
+ server->ops->check_trans2(mid, server, buf, malformed))
+ return;
+ mid->resp_buf = buf;
+ mid->large_buf = server->large_buf;
+ /* Was previous buf put in mpx struct for multi-rsp? */
+ if (!mid->multiRsp) {
+ /* smb buffer will be freed by user thread */
+ if (server->large_buf)
+ server->bigbuf = NULL;
+ else
+ server->smallbuf = NULL;
+ }
+ dequeue_mid(mid, malformed);
+}
- }
- } /* end while !EXITING */
+static void clean_demultiplex_info(struct TCP_Server_Info *server)
+{
+ int length;
/* take it off the list, if it's not already */
spin_lock(&cifs_tcp_ses_lock);
@@ -701,67 +701,235 @@ multi_t2_fnd:
wake_up_all(&server->response_q);
/* check if we have blocked requests that need to free */
- /* Note that cifs_max_pending is normally 50, but
- can be set at module install time to as little as two */
- spin_lock(&GlobalMid_Lock);
- if (atomic_read(&server->inFlight) >= cifs_max_pending)
- atomic_set(&server->inFlight, cifs_max_pending - 1);
- /* We do not want to set the max_pending too low or we
- could end up with the counter going negative */
- spin_unlock(&GlobalMid_Lock);
- /* Although there should not be any requests blocked on
- this queue it can not hurt to be paranoid and try to wake up requests
- that may haven been blocked when more than 50 at time were on the wire
- to the same server - they now will see the session is in exit state
- and get out of SendReceive. */
+ spin_lock(&server->req_lock);
+ if (server->credits <= 0)
+ server->credits = 1;
+ spin_unlock(&server->req_lock);
+ /*
+	 * Although there should not be any requests blocked on this queue, it
+	 * cannot hurt to be paranoid and try to wake up requests that may
+	 * have been blocked when more than 50 at a time were on the wire to the
+ * same server - they now will see the session is in exit state and get
+ * out of SendReceive.
+ */
wake_up_all(&server->request_q);
/* give those requests time to exit */
msleep(125);
if (server->ssocket) {
- sock_release(csocket);
+ sock_release(server->ssocket);
server->ssocket = NULL;
}
- /* buffer usuallly freed in free_mid - need to free it here on exit */
- cifs_buf_release(bigbuf);
- if (smallbuf) /* no sense logging a debug message if NULL */
- cifs_small_buf_release(smallbuf);
if (!list_empty(&server->pending_mid_q)) {
+ struct list_head dispose_list;
+ struct mid_q_entry *mid_entry;
+ struct list_head *tmp, *tmp2;
+
+ INIT_LIST_HEAD(&dispose_list);
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cFYI(1, "Clearing Mid 0x%x - issuing callback",
- mid_entry->mid);
+ cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid);
+ mid_entry->mid_state = MID_SHUTDOWN;
+ list_move(&mid_entry->qhead, &dispose_list);
+ }
+ spin_unlock(&GlobalMid_Lock);
+
+ /* now walk dispose list and issue callbacks */
+ list_for_each_safe(tmp, tmp2, &dispose_list) {
+ mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+ cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
}
- spin_unlock(&GlobalMid_Lock);
/* 1/8th of sec is more than enough time for them to exit */
msleep(125);
}
if (!list_empty(&server->pending_mid_q)) {
- /* mpx threads have not exited yet give them
- at least the smb send timeout time for long ops */
- /* due to delays on oplock break requests, we need
- to wait at least 45 seconds before giving up
- on a request getting a response and going ahead
- and killing cifsd */
- cFYI(1, "Wait for exit from demultiplex thread");
+ /*
+		 * mpx threads have not exited yet; give them at least the smb
+ * send timeout time for long ops.
+ *
+ * Due to delays on oplock break requests, we need to wait at
+ * least 45 seconds before giving up on a request getting a
+ * response and going ahead and killing cifsd.
+ */
+ cifs_dbg(FYI, "Wait for exit from demultiplex thread\n");
msleep(46000);
- /* if threads still have not exited they are probably never
- coming home not much else we can do but free the memory */
+ /*
+ * If threads still have not exited they are probably never
+	 * coming home; not much else we can do but free the memory.
+ */
}
kfree(server->hostname);
- task_to_wake = xchg(&server->tsk, NULL);
+ kfree(server->iov);
kfree(server);
length = atomic_dec_return(&tcpSesAllocCount);
- if (length > 0)
+ if (length > 0)
mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
GFP_KERNEL);
+}
+
+static int
+standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length;
+ char *buf = server->smallbuf;
+ unsigned int pdu_length = get_rfc1002_length(buf);
+
+ /* make sure this will fit in a large buffer */
+ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) {
+ cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ return -EAGAIN;
+ }
+
+ /* switch to large buffer if too big for a small one */
+ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
+ server->large_buf = true;
+ memcpy(server->bigbuf, buf, server->total_read);
+ buf = server->bigbuf;
+ }
+
+ /* now read the rest */
+ length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
+ pdu_length - HEADER_SIZE(server) + 1 + 4);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+
+ dump_smb(buf, server->total_read);
+
+ /*
+ * We know that we received enough to get to the MID as we
+ * checked the pdu_length earlier. Now check to see
+ * if the rest of the header is OK. We borrow the length
+	 * var for the rest of the function to avoid a new stack var.
+ *
+ * 48 bytes is enough to display the header and a little bit
+ * into the payload for debugging purposes.
+ */
+ length = server->ops->check_message(buf, server->total_read);
+ if (length != 0)
+ cifs_dump_mem("Bad SMB: ", buf,
+ min_t(unsigned int, server->total_read, 48));
+
+ if (server->ops->is_status_pending &&
+ server->ops->is_status_pending(buf, server, length))
+ return -1;
+
+ if (!mid)
+ return length;
+
+ handle_mid(mid, server, buf, length);
+ return 0;
+}
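/*
 * Editor's sketch (not part of the patch): a minimal mid->receive() hook has
 * the same shape as standard_receive3() above -- finish reading the PDU that
 * the demultiplex thread started, then dequeue the mid.  The name
 * example_receive and the buffer choice are hypothetical; a real handler must
 * also verify that the PDU actually fits the buffer it reads into.
 */
static int
example_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
	int length;
	char *buf = server->smallbuf;
	unsigned int pdu_length = get_rfc1002_length(buf);

	/* read whatever follows the bytes already consumed by the caller */
	length = cifs_read_from_socket(server, buf + server->total_read,
				       pdu_length + 4 - server->total_read);
	if (length < 0)
		return length;
	server->total_read += length;

	/*
	 * a real handler would also take ownership of the buffer here,
	 * as handle_mid() does, before dequeueing the mid
	 */
	mid->resp_buf = buf;
	dequeue_mid(mid, false);
	return 0;
}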
+
+static int
+cifs_demultiplex_thread(void *p)
+{
+ int length;
+ struct TCP_Server_Info *server = p;
+ unsigned int pdu_length;
+ char *buf = NULL;
+ struct task_struct *task_to_wake = NULL;
+ struct mid_q_entry *mid_entry;
+
+ current->flags |= PF_MEMALLOC;
+ cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
+
+ length = atomic_inc_return(&tcpSesAllocCount);
+ if (length > 1)
+ mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
+ GFP_KERNEL);
+
+ set_freezable();
+ while (server->tcpStatus != CifsExiting) {
+ if (try_to_freeze())
+ continue;
+
+ if (!allocate_buffers(server))
+ continue;
+
+ server->large_buf = false;
+ buf = server->smallbuf;
+ pdu_length = 4; /* enough to get RFC1001 header */
+
+ length = cifs_read_from_socket(server, buf, pdu_length);
+ if (length < 0)
+ continue;
+ server->total_read = length;
+
+ /*
+		 * The right amount (4 bytes) was read from the socket, so we
+		 * can now interpret the length field.
+ */
+ pdu_length = get_rfc1002_length(buf);
+
+ cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
+ if (!is_smb_response(server, buf[0]))
+ continue;
+
+ /* make sure we have enough to get to the MID */
+ if (pdu_length < HEADER_SIZE(server) - 1 - 4) {
+ cifs_dbg(VFS, "SMB response too short (%u bytes)\n",
+ pdu_length);
+ cifs_reconnect(server);
+ wake_up(&server->response_q);
+ continue;
+ }
+
+ /* read down to the MID */
+ length = cifs_read_from_socket(server, buf + 4,
+ HEADER_SIZE(server) - 1 - 4);
+ if (length < 0)
+ continue;
+ server->total_read += length;
+
+ mid_entry = server->ops->find_mid(server, buf);
+
+ if (!mid_entry || !mid_entry->receive)
+ length = standard_receive3(server, mid_entry);
+ else
+ length = mid_entry->receive(server, mid_entry);
+
+ if (length < 0)
+ continue;
+
+ if (server->large_buf)
+ buf = server->bigbuf;
+
+ server->lstrp = jiffies;
+ if (mid_entry != NULL) {
+ if (!mid_entry->multiRsp || mid_entry->multiEnd)
+ mid_entry->callback(mid_entry);
+ } else if (!server->ops->is_oplock_break ||
+ !server->ops->is_oplock_break(buf, server)) {
+ cifs_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n",
+ atomic_read(&midCount));
+ cifs_dump_mem("Received Data is: ", buf,
+ HEADER_SIZE(server));
+#ifdef CONFIG_CIFS_DEBUG2
+ if (server->ops->dump_detail)
+ server->ops->dump_detail(buf);
+ cifs_dump_mids(server);
+#endif /* CIFS_DEBUG2 */
+
+ }
+ } /* end while !EXITING */
+
+ /* buffer usually freed in free_mid - need to free it here on exit */
+ cifs_buf_release(server->bigbuf);
+ if (server->smallbuf) /* no sense logging a debug message if NULL */
+ cifs_small_buf_release(server->smallbuf);
+
+ task_to_wake = xchg(&server->tsk, NULL);
+ clean_demultiplex_info(server);
/* if server->tsk was NULL then wait for a signal before exiting */
if (!task_to_wake) {
@@ -804,22 +972,247 @@ extract_hostname(const char *unc)
return dst;
}
+static int get_option_ul(substring_t args[], unsigned long *option)
+{
+ int rc;
+ char *string;
+
+ string = match_strdup(args);
+ if (string == NULL)
+ return -ENOMEM;
+ rc = kstrtoul(string, 0, option);
+ kfree(string);
+
+ return rc;
+}
+
+static int get_option_uid(substring_t args[], kuid_t *result)
+{
+ unsigned long value;
+ kuid_t uid;
+ int rc;
+
+ rc = get_option_ul(args, &value);
+ if (rc)
+ return rc;
+
+ uid = make_kuid(current_user_ns(), value);
+ if (!uid_valid(uid))
+ return -EINVAL;
+
+ *result = uid;
+ return 0;
+}
+
+static int get_option_gid(substring_t args[], kgid_t *result)
+{
+ unsigned long value;
+ kgid_t gid;
+ int rc;
+
+ rc = get_option_ul(args, &value);
+ if (rc)
+ return rc;
+
+ gid = make_kgid(current_user_ns(), value);
+ if (!gid_valid(gid))
+ return -EINVAL;
+
+ *result = gid;
+ return 0;
+}
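/*
 * Worked example (editor's note, not part of the patch): for "uid=1000" the
 * matched substring "1000" goes through get_option_ul()/kstrtoul() and then
 * make_kuid(current_user_ns(), 1000).  In the init user namespace this yields
 * kuid 1000; in a user namespace where uid 1000 is not mapped, uid_valid()
 * fails, get_option_uid() returns -EINVAL and the mount option parse fails.
 */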
+
+static int cifs_parse_security_flavors(char *value,
+ struct smb_vol *vol)
+{
+
+ substring_t args[MAX_OPT_ARGS];
+
+ /*
+ * With mount options, the last one should win. Reset any existing
+ * settings back to default.
+ */
+ vol->sectype = Unspecified;
+ vol->sign = false;
+
+ switch (match_token(value, cifs_secflavor_tokens, args)) {
+ case Opt_sec_krb5p:
+ cifs_dbg(VFS, "sec=krb5p is not supported!\n");
+ return 1;
+ case Opt_sec_krb5i:
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_sec_krb5:
+ vol->sectype = Kerberos;
+ break;
+ case Opt_sec_ntlmsspi:
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_sec_ntlmssp:
+ vol->sectype = RawNTLMSSP;
+ break;
+ case Opt_sec_ntlmi:
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_ntlm:
+ vol->sectype = NTLM;
+ break;
+ case Opt_sec_ntlmv2i:
+ vol->sign = true;
+ /* Fallthrough */
+ case Opt_sec_ntlmv2:
+ vol->sectype = NTLMv2;
+ break;
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+ case Opt_sec_lanman:
+ vol->sectype = LANMAN;
+ break;
+#endif
+ case Opt_sec_none:
+ vol->nullauth = 1;
+ break;
+ default:
+ cifs_dbg(VFS, "bad security option: %s\n", value);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int
+cifs_parse_cache_flavor(char *value, struct smb_vol *vol)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, cifs_cacheflavor_tokens, args)) {
+ case Opt_cache_loose:
+ vol->direct_io = false;
+ vol->strict_io = false;
+ break;
+ case Opt_cache_strict:
+ vol->direct_io = false;
+ vol->strict_io = true;
+ break;
+ case Opt_cache_none:
+ vol->direct_io = true;
+ vol->strict_io = false;
+ break;
+ default:
+ cifs_dbg(VFS, "bad cache= option: %s\n", value);
+ return 1;
+ }
+ return 0;
+}
+
+static int
+cifs_parse_smb_version(char *value, struct smb_vol *vol)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, cifs_smb_version_tokens, args)) {
+ case Smb_1:
+ vol->ops = &smb1_operations;
+ vol->vals = &smb1_values;
+ break;
+#ifdef CONFIG_CIFS_SMB2
+ case Smb_20:
+ vol->ops = &smb20_operations;
+ vol->vals = &smb20_values;
+ break;
+ case Smb_21:
+ vol->ops = &smb21_operations;
+ vol->vals = &smb21_values;
+ break;
+ case Smb_30:
+ vol->ops = &smb30_operations;
+ vol->vals = &smb30_values;
+ break;
+ case Smb_302:
+ vol->ops = &smb30_operations; /* currently identical with 3.0 */
+ vol->vals = &smb302_values;
+ break;
+#endif
+ default:
+ cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
+ return 1;
+ }
+ return 0;
+}
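/*
 * Example (editor's note, not part of the patch): assuming the usual vers=
 * spellings and CONFIG_CIFS_SMB2, a mount such as
 *
 *	mount -t cifs //srv/share /mnt -o vers=2.1,sec=ntlmsspi,cache=strict
 *
 * exercises the three helpers above: cifs_parse_smb_version() picks
 * smb21_operations/smb21_values, cifs_parse_security_flavors() sets
 * sectype = RawNTLMSSP with sign = true, and cifs_parse_cache_flavor()
 * leaves direct_io off and strict_io on.
 */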
+
+/*
+ * Parse a devname into substrings and populate the vol->UNC and vol->prepath
+ * fields with the result. Returns 0 on success and an error otherwise.
+ */
+static int
+cifs_parse_devname(const char *devname, struct smb_vol *vol)
+{
+ char *pos;
+ const char *delims = "/\\";
+ size_t len;
+
+ /* make sure we have a valid UNC double delimiter prefix */
+ len = strspn(devname, delims);
+ if (len != 2)
+ return -EINVAL;
+
+ /* find delimiter between host and sharename */
+ pos = strpbrk(devname + 2, delims);
+ if (!pos)
+ return -EINVAL;
+
+ /* skip past delimiter */
+ ++pos;
+
+ /* now go until next delimiter or end of string */
+ len = strcspn(pos, delims);
+
+ /* move "pos" up to delimiter or NULL */
+ pos += len;
+ vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL);
+ if (!vol->UNC)
+ return -ENOMEM;
+
+ convert_delimiter(vol->UNC, '\\');
+
+	/* If pos is at end of string or a bogus trailing delimiter, no prepath */
+ if (!*pos++ || !*pos)
+ return 0;
+
+ vol->prepath = kstrdup(pos, GFP_KERNEL);
+ if (!vol->prepath)
+ return -ENOMEM;
+
+ return 0;
+}
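/*
 * Worked example (editor's note, not part of the patch): for the devname
 * "//srv/share/dir/sub" the code above copies "//srv/share" into vol->UNC,
 * converts it to "\\srv\share", and stores "dir/sub" in vol->prepath.
 * "//srv/share" alone leaves vol->prepath NULL, while "srv/share" (no leading
 * double delimiter) fails with -EINVAL.
 */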
+
static int
-cifs_parse_mount_options(char *options, const char *devname,
+cifs_parse_mount_options(const char *mountdata, const char *devname,
struct smb_vol *vol)
{
- char *value;
- char *data;
+ char *data, *end;
+ char *mountdata_copy = NULL, *options;
unsigned int temp_len, i, j;
char separator[2];
short int override_uid = -1;
short int override_gid = -1;
bool uid_specified = false;
bool gid_specified = false;
+ bool sloppy = false;
+ char *invalid = NULL;
char *nodename = utsname()->nodename;
+ char *string = NULL;
+ char *tmp_end, *value;
+ char delim;
+ bool got_ip = false;
+ unsigned short port = 0;
+ struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr;
separator[0] = ',';
separator[1] = 0;
+ delim = separator[0];
+
+ /* ensure we always start with zeroed-out smb_vol */
+ memset(vol, 0, sizeof(*vol));
/*
* does not have to be perfect mapping since field is
@@ -847,581 +1240,635 @@ cifs_parse_mount_options(char *options, const char *devname,
/* default to using server inode numbers where available */
vol->server_ino = 1;
+ /* default is to use strict cifs caching semantics */
+ vol->strict_io = true;
+
vol->actimeo = CIFS_DEF_ACTIMEO;
- if (!options)
- return 1;
+ /* FIXME: add autonegotiation -- for now, SMB1 is default */
+ vol->ops = &smb1_operations;
+ vol->vals = &smb1_values;
+
+ if (!mountdata)
+ goto cifs_parse_mount_err;
+
+ mountdata_copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL);
+ if (!mountdata_copy)
+ goto cifs_parse_mount_err;
+
+ options = mountdata_copy;
+ end = options + strlen(options);
if (strncmp(options, "sep=", 4) == 0) {
if (options[4] != 0) {
separator[0] = options[4];
options += 5;
} else {
- cFYI(1, "Null separator not allowed");
+ cifs_dbg(FYI, "Null separator not allowed\n");
}
}
+ vol->backupuid_specified = false; /* no backup intent for a user */
+ vol->backupgid_specified = false; /* no backup intent for a group */
+
+ switch (cifs_parse_devname(devname, vol)) {
+ case 0:
+ break;
+ case -ENOMEM:
+ cifs_dbg(VFS, "Unable to allocate memory for devname.\n");
+ goto cifs_parse_mount_err;
+ case -EINVAL:
+ cifs_dbg(VFS, "Malformed UNC in devname.\n");
+ goto cifs_parse_mount_err;
+ default:
+ cifs_dbg(VFS, "Unknown error parsing devname.\n");
+ goto cifs_parse_mount_err;
+ }
while ((data = strsep(&options, separator)) != NULL) {
+ substring_t args[MAX_OPT_ARGS];
+ unsigned long option;
+ int token;
+
if (!*data)
continue;
- if ((value = strchr(data, '=')) != NULL)
- *value++ = '\0';
- /* Have to parse this before we parse for "user" */
- if (strnicmp(data, "user_xattr", 10) == 0) {
+ token = match_token(data, cifs_mount_option_tokens, args);
+
+ switch (token) {
+
+		/* Ignore the following */
+ case Opt_ignore:
+ break;
+
+ /* Boolean values */
+ case Opt_user_xattr:
vol->no_xattr = 0;
- } else if (strnicmp(data, "nouser_xattr", 12) == 0) {
+ break;
+ case Opt_nouser_xattr:
vol->no_xattr = 1;
- } else if (strnicmp(data, "user", 4) == 0) {
- if (!value) {
- printk(KERN_WARNING
- "CIFS: invalid or missing username\n");
- return 1; /* needs_arg; */
- } else if (!*value) {
- /* null user, ie anonymous, authentication */
- vol->nullauth = 1;
- }
- if (strnlen(value, 200) < 200) {
- vol->username = value;
- } else {
- printk(KERN_WARNING "CIFS: username too long\n");
- return 1;
- }
- } else if (strnicmp(data, "pass", 4) == 0) {
- if (!value) {
- vol->password = NULL;
- continue;
- } else if (value[0] == 0) {
- /* check if string begins with double comma
- since that would mean the password really
- does start with a comma, and would not
- indicate an empty string */
- if (value[1] != separator[0]) {
- vol->password = NULL;
- continue;
- }
- }
- temp_len = strlen(value);
- /* removed password length check, NTLM passwords
- can be arbitrarily long */
-
- /* if comma in password, the string will be
- prematurely null terminated. Commas in password are
- specified across the cifs mount interface by a double
- comma ie ,, and a comma used as in other cases ie ','
- as a parameter delimiter/separator is single and due
- to the strsep above is temporarily zeroed. */
-
- /* NB: password legally can have multiple commas and
- the only illegal character in a password is null */
-
- if ((value[temp_len] == 0) &&
- (value[temp_len+1] == separator[0])) {
- /* reinsert comma */
- value[temp_len] = separator[0];
- temp_len += 2; /* move after second comma */
- while (value[temp_len] != 0) {
- if (value[temp_len] == separator[0]) {
- if (value[temp_len+1] ==
- separator[0]) {
- /* skip second comma */
- temp_len++;
- } else {
- /* single comma indicating start
- of next parm */
- break;
- }
- }
- temp_len++;
- }
- if (value[temp_len] == 0) {
- options = NULL;
- } else {
- value[temp_len] = 0;
- /* point option to start of next parm */
- options = value + temp_len + 1;
- }
- /* go from value to value + temp_len condensing
- double commas to singles. Note that this ends up
- allocating a few bytes too many, which is ok */
- vol->password = kzalloc(temp_len, GFP_KERNEL);
- if (vol->password == NULL) {
- printk(KERN_WARNING "CIFS: no memory "
- "for password\n");
- return 1;
- }
- for (i = 0, j = 0; i < temp_len; i++, j++) {
- vol->password[j] = value[i];
- if (value[i] == separator[0]
- && value[i+1] == separator[0]) {
- /* skip second comma */
- i++;
- }
- }
- vol->password[j] = 0;
- } else {
- vol->password = kzalloc(temp_len+1, GFP_KERNEL);
- if (vol->password == NULL) {
- printk(KERN_WARNING "CIFS: no memory "
- "for password\n");
- return 1;
- }
- strcpy(vol->password, value);
- }
- } else if (!strnicmp(data, "ip", 2) ||
- !strnicmp(data, "addr", 4)) {
- if (!value || !*value) {
- vol->UNCip = NULL;
- } else if (strnlen(value, INET6_ADDRSTRLEN) <
- INET6_ADDRSTRLEN) {
- vol->UNCip = value;
- } else {
- printk(KERN_WARNING "CIFS: ip address "
- "too long\n");
- return 1;
- }
- } else if (strnicmp(data, "sec", 3) == 0) {
- if (!value || !*value) {
- cERROR(1, "no security value specified");
- continue;
- } else if (strnicmp(value, "krb5i", 5) == 0) {
- vol->secFlg |= CIFSSEC_MAY_KRB5 |
- CIFSSEC_MUST_SIGN;
- } else if (strnicmp(value, "krb5p", 5) == 0) {
- /* vol->secFlg |= CIFSSEC_MUST_SEAL |
- CIFSSEC_MAY_KRB5; */
- cERROR(1, "Krb5 cifs privacy not supported");
- return 1;
- } else if (strnicmp(value, "krb5", 4) == 0) {
- vol->secFlg |= CIFSSEC_MAY_KRB5;
- } else if (strnicmp(value, "ntlmsspi", 8) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
- CIFSSEC_MUST_SIGN;
- } else if (strnicmp(value, "ntlmssp", 7) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
- } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
- CIFSSEC_MUST_SIGN;
- } else if (strnicmp(value, "ntlmv2", 6) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLMV2;
- } else if (strnicmp(value, "ntlmi", 5) == 0) {
- vol->secFlg |= CIFSSEC_MAY_NTLM |
- CIFSSEC_MUST_SIGN;
- } else if (strnicmp(value, "ntlm", 4) == 0) {
- /* ntlm is default so can be turned off too */
- vol->secFlg |= CIFSSEC_MAY_NTLM;
- } else if (strnicmp(value, "nontlm", 6) == 0) {
- /* BB is there a better way to do this? */
- vol->secFlg |= CIFSSEC_MAY_NTLMV2;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- } else if (strnicmp(value, "lanman", 6) == 0) {
- vol->secFlg |= CIFSSEC_MAY_LANMAN;
-#endif
- } else if (strnicmp(value, "none", 4) == 0) {
- vol->nullauth = 1;
- } else {
- cERROR(1, "bad security option: %s", value);
- return 1;
- }
- } else if ((strnicmp(data, "unc", 3) == 0)
- || (strnicmp(data, "target", 6) == 0)
- || (strnicmp(data, "path", 4) == 0)) {
- if (!value || !*value) {
- printk(KERN_WARNING "CIFS: invalid path to "
- "network resource\n");
- return 1; /* needs_arg; */
- }
- if ((temp_len = strnlen(value, 300)) < 300) {
- vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
- if (vol->UNC == NULL)
- return 1;
- strcpy(vol->UNC, value);
- if (strncmp(vol->UNC, "//", 2) == 0) {
- vol->UNC[0] = '\\';
- vol->UNC[1] = '\\';
- } else if (strncmp(vol->UNC, "\\\\", 2) != 0) {
- printk(KERN_WARNING
- "CIFS: UNC Path does not begin "
- "with // or \\\\ \n");
- return 1;
- }
- } else {
- printk(KERN_WARNING "CIFS: UNC name too long\n");
- return 1;
- }
- } else if ((strnicmp(data, "domain", 3) == 0)
- || (strnicmp(data, "workgroup", 5) == 0)) {
- if (!value || !*value) {
- printk(KERN_WARNING "CIFS: invalid domain name\n");
- return 1; /* needs_arg; */
- }
- /* BB are there cases in which a comma can be valid in
- a domain name and need special handling? */
- if (strnlen(value, 256) < 256) {
- vol->domainname = value;
- cFYI(1, "Domain name set");
- } else {
- printk(KERN_WARNING "CIFS: domain name too "
- "long\n");
- return 1;
- }
- } else if (strnicmp(data, "srcaddr", 7) == 0) {
- vol->srcaddr.ss_family = AF_UNSPEC;
-
- if (!value || !*value) {
- printk(KERN_WARNING "CIFS: srcaddr value"
- " not specified.\n");
- return 1; /* needs_arg; */
- }
- i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
- value, strlen(value));
- if (i == 0) {
- printk(KERN_WARNING "CIFS: Could not parse"
- " srcaddr: %s\n",
- value);
- return 1;
- }
- } else if (strnicmp(data, "prefixpath", 10) == 0) {
- if (!value || !*value) {
- printk(KERN_WARNING
- "CIFS: invalid path prefix\n");
- return 1; /* needs_argument */
- }
- if ((temp_len = strnlen(value, 1024)) < 1024) {
- if (value[0] != '/')
- temp_len++; /* missing leading slash */
- vol->prepath = kmalloc(temp_len+1, GFP_KERNEL);
- if (vol->prepath == NULL)
- return 1;
- if (value[0] != '/') {
- vol->prepath[0] = '/';
- strcpy(vol->prepath+1, value);
- } else
- strcpy(vol->prepath, value);
- cFYI(1, "prefix path %s", vol->prepath);
- } else {
- printk(KERN_WARNING "CIFS: prefix too long\n");
- return 1;
- }
- } else if (strnicmp(data, "iocharset", 9) == 0) {
- if (!value || !*value) {
- printk(KERN_WARNING "CIFS: invalid iocharset "
- "specified\n");
- return 1; /* needs_arg; */
- }
- if (strnlen(value, 65) < 65) {
- if (strnicmp(value, "default", 7))
- vol->iocharset = value;
- /* if iocharset not set then load_nls_default
- is used by caller */
- cFYI(1, "iocharset set to %s", value);
- } else {
- printk(KERN_WARNING "CIFS: iocharset name "
- "too long.\n");
- return 1;
- }
- } else if (!strnicmp(data, "uid", 3) && value && *value) {
- vol->linux_uid = simple_strtoul(value, &value, 0);
- uid_specified = true;
- } else if (!strnicmp(data, "cruid", 5) && value && *value) {
- vol->cred_uid = simple_strtoul(value, &value, 0);
- } else if (!strnicmp(data, "forceuid", 8)) {
+ break;
+ case Opt_forceuid:
override_uid = 1;
- } else if (!strnicmp(data, "noforceuid", 10)) {
+ break;
+ case Opt_noforceuid:
override_uid = 0;
- } else if (!strnicmp(data, "gid", 3) && value && *value) {
- vol->linux_gid = simple_strtoul(value, &value, 0);
- gid_specified = true;
- } else if (!strnicmp(data, "forcegid", 8)) {
+ break;
+ case Opt_forcegid:
override_gid = 1;
- } else if (!strnicmp(data, "noforcegid", 10)) {
+ break;
+ case Opt_noforcegid:
override_gid = 0;
- } else if (strnicmp(data, "file_mode", 4) == 0) {
- if (value && *value) {
- vol->file_mode =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "dir_mode", 4) == 0) {
- if (value && *value) {
- vol->dir_mode =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "dirmode", 4) == 0) {
- if (value && *value) {
- vol->dir_mode =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "port", 4) == 0) {
- if (value && *value) {
- vol->port =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "rsize", 5) == 0) {
- if (value && *value) {
- vol->rsize =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "wsize", 5) == 0) {
- if (value && *value) {
- vol->wsize =
- simple_strtoul(value, &value, 0);
- }
- } else if (strnicmp(data, "sockopt", 5) == 0) {
- if (!value || !*value) {
- cERROR(1, "no socket option specified");
- continue;
- } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
- vol->sockopt_tcp_nodelay = 1;
- }
- } else if (strnicmp(data, "netbiosname", 4) == 0) {
- if (!value || !*value || (*value == ' ')) {
- cFYI(1, "invalid (empty) netbiosname");
- } else {
- memset(vol->source_rfc1001_name, 0x20,
- RFC1001_NAME_LEN);
- /*
- * FIXME: are there cases in which a comma can
- * be valid in workstation netbios name (and
- * need special handling)?
- */
- for (i = 0; i < RFC1001_NAME_LEN; i++) {
- /* don't ucase netbiosname for user */
- if (value[i] == 0)
- break;
- vol->source_rfc1001_name[i] = value[i];
- }
- /* The string has 16th byte zero still from
- set at top of the function */
- if (i == RFC1001_NAME_LEN && value[i] != 0)
- printk(KERN_WARNING "CIFS: netbiosname"
- " longer than 15 truncated.\n");
- }
- } else if (strnicmp(data, "servern", 7) == 0) {
- /* servernetbiosname specified override *SMBSERVER */
- if (!value || !*value || (*value == ' ')) {
- cFYI(1, "empty server netbiosname specified");
- } else {
- /* last byte, type, is 0x20 for servr type */
- memset(vol->target_rfc1001_name, 0x20,
- RFC1001_NAME_LEN_WITH_NULL);
-
- for (i = 0; i < 15; i++) {
- /* BB are there cases in which a comma can be
- valid in this workstation netbios name
- (and need special handling)? */
-
- /* user or mount helper must uppercase
- the netbiosname */
- if (value[i] == 0)
- break;
- else
- vol->target_rfc1001_name[i] =
- value[i];
- }
- /* The string has 16th byte zero still from
- set at top of the function */
- if (i == RFC1001_NAME_LEN && value[i] != 0)
- printk(KERN_WARNING "CIFS: server net"
- "biosname longer than 15 truncated.\n");
- }
- } else if (strnicmp(data, "actimeo", 7) == 0) {
- if (value && *value) {
- vol->actimeo = HZ * simple_strtoul(value,
- &value, 0);
- if (vol->actimeo > CIFS_MAX_ACTIMEO) {
- cERROR(1, "CIFS: attribute cache"
- "timeout too large");
- return 1;
- }
- }
- } else if (strnicmp(data, "credentials", 4) == 0) {
- /* ignore */
- } else if (strnicmp(data, "version", 3) == 0) {
- /* ignore */
- } else if (strnicmp(data, "guest", 5) == 0) {
- /* ignore */
- } else if (strnicmp(data, "rw", 2) == 0) {
- /* ignore */
- } else if (strnicmp(data, "ro", 2) == 0) {
- /* ignore */
- } else if (strnicmp(data, "noblocksend", 11) == 0) {
+ break;
+ case Opt_noblocksend:
vol->noblocksnd = 1;
- } else if (strnicmp(data, "noautotune", 10) == 0) {
+ break;
+ case Opt_noautotune:
vol->noautotune = 1;
- } else if ((strnicmp(data, "suid", 4) == 0) ||
- (strnicmp(data, "nosuid", 6) == 0) ||
- (strnicmp(data, "exec", 4) == 0) ||
- (strnicmp(data, "noexec", 6) == 0) ||
- (strnicmp(data, "nodev", 5) == 0) ||
- (strnicmp(data, "noauto", 6) == 0) ||
- (strnicmp(data, "dev", 3) == 0)) {
- /* The mount tool or mount.cifs helper (if present)
- uses these opts to set flags, and the flags are read
- by the kernel vfs layer before we get here (ie
- before read super) so there is no point trying to
- parse these options again and set anything and it
- is ok to just ignore them */
- continue;
- } else if (strnicmp(data, "hard", 4) == 0) {
+ break;
+ case Opt_hard:
vol->retry = 1;
- } else if (strnicmp(data, "soft", 4) == 0) {
+ break;
+ case Opt_soft:
vol->retry = 0;
- } else if (strnicmp(data, "perm", 4) == 0) {
+ break;
+ case Opt_perm:
vol->noperm = 0;
- } else if (strnicmp(data, "noperm", 6) == 0) {
+ break;
+ case Opt_noperm:
vol->noperm = 1;
- } else if (strnicmp(data, "mapchars", 8) == 0) {
+ break;
+ case Opt_mapchars:
vol->remap = 1;
- } else if (strnicmp(data, "nomapchars", 10) == 0) {
+ break;
+ case Opt_nomapchars:
vol->remap = 0;
- } else if (strnicmp(data, "sfu", 3) == 0) {
+ break;
+ case Opt_sfu:
vol->sfu_emul = 1;
- } else if (strnicmp(data, "nosfu", 5) == 0) {
+ break;
+ case Opt_nosfu:
vol->sfu_emul = 0;
- } else if (strnicmp(data, "nodfs", 5) == 0) {
+ break;
+ case Opt_nodfs:
vol->nodfs = 1;
- } else if (strnicmp(data, "posixpaths", 10) == 0) {
+ break;
+ case Opt_posixpaths:
vol->posix_paths = 1;
- } else if (strnicmp(data, "noposixpaths", 12) == 0) {
+ break;
+ case Opt_noposixpaths:
vol->posix_paths = 0;
- } else if (strnicmp(data, "nounix", 6) == 0) {
- vol->no_linux_ext = 1;
- } else if (strnicmp(data, "nolinux", 7) == 0) {
+ break;
+ case Opt_nounix:
vol->no_linux_ext = 1;
- } else if ((strnicmp(data, "nocase", 6) == 0) ||
- (strnicmp(data, "ignorecase", 10) == 0)) {
+ break;
+ case Opt_nocase:
vol->nocase = 1;
- } else if (strnicmp(data, "mand", 4) == 0) {
- /* ignore */
- } else if (strnicmp(data, "nomand", 6) == 0) {
- /* ignore */
- } else if (strnicmp(data, "_netdev", 7) == 0) {
- /* ignore */
- } else if (strnicmp(data, "brl", 3) == 0) {
+ break;
+ case Opt_brl:
vol->nobrl = 0;
- } else if ((strnicmp(data, "nobrl", 5) == 0) ||
- (strnicmp(data, "nolock", 6) == 0)) {
+ break;
+ case Opt_nobrl:
vol->nobrl = 1;
- /* turn off mandatory locking in mode
- if remote locking is turned off since the
- local vfs will do advisory */
+ /*
+ * turn off mandatory locking in mode
+ * if remote locking is turned off since the
+ * local vfs will do advisory
+ */
if (vol->file_mode ==
(S_IALLUGO & ~(S_ISUID | S_IXGRP)))
vol->file_mode = S_IALLUGO;
- } else if (strnicmp(data, "forcemandatorylock", 9) == 0) {
- /* will take the shorter form "forcemand" as well */
- /* This mount option will force use of mandatory
- (DOS/Windows style) byte range locks, instead of
- using posix advisory byte range locks, even if the
- Unix extensions are available and posix locks would
- be supported otherwise. If Unix extensions are not
- negotiated this has no effect since mandatory locks
- would be used (mandatory locks is all that those
- those servers support) */
+ break;
+ case Opt_forcemandatorylock:
vol->mand_lock = 1;
- } else if (strnicmp(data, "setuids", 7) == 0) {
+ break;
+ case Opt_setuids:
vol->setuids = 1;
- } else if (strnicmp(data, "nosetuids", 9) == 0) {
+ break;
+ case Opt_nosetuids:
vol->setuids = 0;
- } else if (strnicmp(data, "dynperm", 7) == 0) {
+ break;
+ case Opt_dynperm:
vol->dynperm = true;
- } else if (strnicmp(data, "nodynperm", 9) == 0) {
+ break;
+ case Opt_nodynperm:
vol->dynperm = false;
- } else if (strnicmp(data, "nohard", 6) == 0) {
+ break;
+ case Opt_nohard:
vol->retry = 0;
- } else if (strnicmp(data, "nosoft", 6) == 0) {
+ break;
+ case Opt_nosoft:
vol->retry = 1;
- } else if (strnicmp(data, "nointr", 6) == 0) {
+ break;
+ case Opt_nointr:
vol->intr = 0;
- } else if (strnicmp(data, "intr", 4) == 0) {
+ break;
+ case Opt_intr:
vol->intr = 1;
- } else if (strnicmp(data, "nostrictsync", 12) == 0) {
+ break;
+ case Opt_nostrictsync:
vol->nostrictsync = 1;
- } else if (strnicmp(data, "strictsync", 10) == 0) {
+ break;
+ case Opt_strictsync:
vol->nostrictsync = 0;
- } else if (strnicmp(data, "serverino", 7) == 0) {
+ break;
+ case Opt_serverino:
vol->server_ino = 1;
- } else if (strnicmp(data, "noserverino", 9) == 0) {
+ break;
+ case Opt_noserverino:
vol->server_ino = 0;
- } else if (strnicmp(data, "cifsacl", 7) == 0) {
+ break;
+ case Opt_rwpidforward:
+ vol->rwpidforward = 1;
+ break;
+ case Opt_cifsacl:
vol->cifs_acl = 1;
- } else if (strnicmp(data, "nocifsacl", 9) == 0) {
+ break;
+ case Opt_nocifsacl:
vol->cifs_acl = 0;
- } else if (strnicmp(data, "acl", 3) == 0) {
+ break;
+ case Opt_acl:
vol->no_psx_acl = 0;
- } else if (strnicmp(data, "noacl", 5) == 0) {
+ break;
+ case Opt_noacl:
vol->no_psx_acl = 1;
- } else if (strnicmp(data, "locallease", 6) == 0) {
+ break;
+ case Opt_locallease:
vol->local_lease = 1;
- } else if (strnicmp(data, "sign", 4) == 0) {
- vol->secFlg |= CIFSSEC_MUST_SIGN;
- } else if (strnicmp(data, "seal", 4) == 0) {
+ break;
+ case Opt_sign:
+ vol->sign = true;
+ break;
+ case Opt_seal:
/* we do not do the following in secFlags because seal
- is a per tree connection (mount) not a per socket
- or per-smb connection option in the protocol */
- /* vol->secFlg |= CIFSSEC_MUST_SEAL; */
+ * is a per tree connection (mount) not a per socket
+ * or per-smb connection option in the protocol
+ * vol->secFlg |= CIFSSEC_MUST_SEAL;
+ */
vol->seal = 1;
- } else if (strnicmp(data, "direct", 6) == 0) {
- vol->direct_io = 1;
- } else if (strnicmp(data, "forcedirectio", 13) == 0) {
- vol->direct_io = 1;
- } else if (strnicmp(data, "strictcache", 11) == 0) {
- vol->strict_io = 1;
- } else if (strnicmp(data, "noac", 4) == 0) {
+ break;
+ case Opt_noac:
printk(KERN_WARNING "CIFS: Mount option noac not "
"supported. Instead set "
"/proc/fs/cifs/LookupCacheEnabled to 0\n");
- } else if (strnicmp(data, "fsc", 3) == 0) {
+ break;
+ case Opt_fsc:
#ifndef CONFIG_CIFS_FSCACHE
- cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
- "kernel config option set");
- return 1;
+ cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n");
+ goto cifs_parse_mount_err;
#endif
vol->fsc = true;
- } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
+ break;
+ case Opt_mfsymlinks:
vol->mfsymlinks = true;
- } else if (strnicmp(data, "multiuser", 8) == 0) {
+ break;
+ case Opt_multiuser:
vol->multiuser = true;
- } else
- printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
- data);
- }
- if (vol->UNC == NULL) {
- if (devname == NULL) {
- printk(KERN_WARNING "CIFS: Missing UNC name for mount "
- "target\n");
- return 1;
- }
- if ((temp_len = strnlen(devname, 300)) < 300) {
- vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
- if (vol->UNC == NULL)
- return 1;
- strcpy(vol->UNC, devname);
- if (strncmp(vol->UNC, "//", 2) == 0) {
- vol->UNC[0] = '\\';
- vol->UNC[1] = '\\';
- } else if (strncmp(vol->UNC, "\\\\", 2) != 0) {
- printk(KERN_WARNING "CIFS: UNC Path does not "
- "begin with // or \\\\ \n");
- return 1;
+ break;
+ case Opt_sloppy:
+ sloppy = true;
+ break;
+ case Opt_nosharesock:
+ vol->nosharesock = true;
+ break;
+
+ /* Numeric Values */
+ case Opt_backupuid:
+ if (get_option_uid(args, &vol->backupuid)) {
+ cifs_dbg(VFS, "%s: Invalid backupuid value\n",
+ __func__);
+ goto cifs_parse_mount_err;
}
- value = strpbrk(vol->UNC+2, "/\\");
- if (value)
- *value = '\\';
- } else {
- printk(KERN_WARNING "CIFS: UNC name too long\n");
- return 1;
+ vol->backupuid_specified = true;
+ break;
+ case Opt_backupgid:
+ if (get_option_gid(args, &vol->backupgid)) {
+ cifs_dbg(VFS, "%s: Invalid backupgid value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->backupgid_specified = true;
+ break;
+ case Opt_uid:
+ if (get_option_uid(args, &vol->linux_uid)) {
+ cifs_dbg(VFS, "%s: Invalid uid value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ uid_specified = true;
+ break;
+ case Opt_cruid:
+ if (get_option_uid(args, &vol->cred_uid)) {
+ cifs_dbg(VFS, "%s: Invalid cruid value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_gid:
+ if (get_option_gid(args, &vol->linux_gid)) {
+ cifs_dbg(VFS, "%s: Invalid gid value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ gid_specified = true;
+ break;
+ case Opt_file_mode:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid file_mode value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->file_mode = option;
+ break;
+ case Opt_dirmode:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid dir_mode value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->dir_mode = option;
+ break;
+ case Opt_port:
+ if (get_option_ul(args, &option) ||
+ option > USHRT_MAX) {
+ cifs_dbg(VFS, "%s: Invalid port value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ port = (unsigned short)option;
+ break;
+ case Opt_rsize:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid rsize value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->rsize = option;
+ break;
+ case Opt_wsize:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid wsize value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->wsize = option;
+ break;
+ case Opt_actimeo:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid actimeo value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->actimeo = HZ * option;
+ if (vol->actimeo > CIFS_MAX_ACTIMEO) {
+ cifs_dbg(VFS, "attribute cache timeout too large\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+
+ /* String Arguments */
+
+ case Opt_blank_user:
+ /* null user, ie. anonymous authentication */
+ vol->nullauth = 1;
+ vol->username = NULL;
+ break;
+ case Opt_user:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (strnlen(string, CIFS_MAX_USERNAME_LEN) >
+ CIFS_MAX_USERNAME_LEN) {
+ printk(KERN_WARNING "CIFS: username too long\n");
+ goto cifs_parse_mount_err;
+ }
+ vol->username = kstrdup(string, GFP_KERNEL);
+ if (!vol->username)
+ goto cifs_parse_mount_err;
+ break;
+ case Opt_blank_pass:
+ /* passwords have to be handled differently
+			 * to allow the character used as the delimiter
+ * to be passed within them
+ */
+
+ /*
+ * Check if this is a case where the password
+ * starts with a delimiter
+ */
+ tmp_end = strchr(data, '=');
+ tmp_end++;
+ if (!(tmp_end < end && tmp_end[1] == delim)) {
+ /* No it is not. Set the password to NULL */
+ vol->password = NULL;
+ break;
+ }
+			/* Yes it is. Drop down to Opt_pass below. */
+ case Opt_pass:
+ /* Obtain the value string */
+ value = strchr(data, '=');
+ value++;
+
+ /* Set tmp_end to end of the string */
+ tmp_end = (char *) value + strlen(value);
+
+			/* Check if the following character is the delimiter.
+			 * If yes, we have encountered a double delimiter;
+			 * reset the NULL character to the delimiter
+ */
+ if (tmp_end < end && tmp_end[1] == delim) {
+ tmp_end[0] = delim;
+
+ /* Keep iterating until we get to a single
+				 * delimiter OR the end
+ */
+ while ((tmp_end = strchr(tmp_end, delim))
+ != NULL && (tmp_end[1] == delim)) {
+ tmp_end = (char *) &tmp_end[2];
+ }
+
+ /* Reset var options to point to next element */
+ if (tmp_end) {
+ tmp_end[0] = '\0';
+ options = (char *) &tmp_end[1];
+ } else
+ /* Reached the end of the mount option
+ * string */
+ options = end;
+ }
+
+ /* Now build new password string */
+ temp_len = strlen(value);
+ vol->password = kzalloc(temp_len+1, GFP_KERNEL);
+ if (vol->password == NULL) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for password\n");
+ goto cifs_parse_mount_err;
+ }
+
+ for (i = 0, j = 0; i < temp_len; i++, j++) {
+ vol->password[j] = value[i];
+ if ((value[i] == delim) &&
+ value[i+1] == delim)
+					/* skip the second delimiter */
+ i++;
+ }
+ vol->password[j] = '\0';
+ break;
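			/*
			 * Worked example (editor's note, not part of the
			 * patch): with the default ',' separator, a mount
			 * string containing "pass=a,,b,rw" is first cut by
			 * strsep() after "a".  The code above detects the
			 * ",," pair, restores the comma, scans forward to the
			 * single ',' before "rw", and the copy loop collapses
			 * ",," so the stored password becomes "a,b".
			 */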
+ case Opt_blank_ip:
+ /* FIXME: should this be an error instead? */
+ got_ip = false;
+ break;
+ case Opt_ip:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!cifs_convert_address(dstaddr, string,
+ strlen(string))) {
+ printk(KERN_ERR "CIFS: bad ip= option (%s).\n",
+ string);
+ goto cifs_parse_mount_err;
+ }
+ got_ip = true;
+ break;
+ case Opt_domain:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN)
+ == CIFS_MAX_DOMAINNAME_LEN) {
+ printk(KERN_WARNING "CIFS: domain name too"
+ " long\n");
+ goto cifs_parse_mount_err;
+ }
+
+ vol->domainname = kstrdup(string, GFP_KERNEL);
+ if (!vol->domainname) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for domainname\n");
+ goto cifs_parse_mount_err;
+ }
+ cifs_dbg(FYI, "Domain name set\n");
+ break;
+ case Opt_srcaddr:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (!cifs_convert_address(
+ (struct sockaddr *)&vol->srcaddr,
+ string, strlen(string))) {
+ printk(KERN_WARNING "CIFS: Could not parse"
+ " srcaddr: %s\n", string);
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_iocharset:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (strnlen(string, 1024) >= 65) {
+ printk(KERN_WARNING "CIFS: iocharset name "
+ "too long.\n");
+ goto cifs_parse_mount_err;
+ }
+
+ if (strnicmp(string, "default", 7) != 0) {
+ vol->iocharset = kstrdup(string,
+ GFP_KERNEL);
+ if (!vol->iocharset) {
+				printk(KERN_WARNING "CIFS: no memory "
+ "for charset\n");
+ goto cifs_parse_mount_err;
+ }
+ }
+ /* if iocharset not set then load_nls_default
+ * is used by caller
+ */
+ cifs_dbg(FYI, "iocharset set to %s\n", string);
+ break;
+ case Opt_netbiosname:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ memset(vol->source_rfc1001_name, 0x20,
+ RFC1001_NAME_LEN);
+ /*
+ * FIXME: are there cases in which a comma can
+ * be valid in workstation netbios name (and
+ * need special handling)?
+ */
+ for (i = 0; i < RFC1001_NAME_LEN; i++) {
+ /* don't ucase netbiosname for user */
+ if (string[i] == 0)
+ break;
+ vol->source_rfc1001_name[i] = string[i];
+ }
+			/* The string still has its 16th byte zeroed from
+			 * the memset above.
+			 */
+ if (i == RFC1001_NAME_LEN && string[i] != 0)
+ printk(KERN_WARNING "CIFS: netbiosname"
+ " longer than 15 truncated.\n");
+
+ break;
+ case Opt_servern:
+ /* servernetbiosname specified override *SMBSERVER */
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+			/* last byte, type, is 0x20 for server type */
+ memset(vol->target_rfc1001_name, 0x20,
+ RFC1001_NAME_LEN_WITH_NULL);
+
+ /* BB are there cases in which a comma can be
+ valid in this workstation netbios name
+ (and need special handling)? */
+
+ /* user or mount helper must uppercase the
+ netbios name */
+ for (i = 0; i < 15; i++) {
+ if (string[i] == 0)
+ break;
+ vol->target_rfc1001_name[i] = string[i];
+ }
+			/* The string still has its 16th byte zeroed from
+			   the memset above */
+ if (i == RFC1001_NAME_LEN && string[i] != 0)
+ printk(KERN_WARNING "CIFS: server net"
+ "biosname longer than 15 truncated.\n");
+ break;
+ case Opt_ver:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (strnicmp(string, "1", 1) == 0) {
+ /* This is the default */
+ break;
+ }
+			/* For all other values, error */
+ printk(KERN_WARNING "CIFS: Invalid version"
+ " specified\n");
+ goto cifs_parse_mount_err;
+ case Opt_vers:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (cifs_parse_smb_version(string, vol) != 0)
+ goto cifs_parse_mount_err;
+ break;
+ case Opt_sec:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (cifs_parse_security_flavors(string, vol) != 0)
+ goto cifs_parse_mount_err;
+ break;
+ case Opt_cache:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (cifs_parse_cache_flavor(string, vol) != 0)
+ goto cifs_parse_mount_err;
+ break;
+ default:
+ /*
+ * An option we don't recognize. Save it off for later
+ * if we haven't already found one
+ */
+ if (!invalid)
+ invalid = data;
+ break;
}
+ /* Free up any allocated string */
+ kfree(string);
+ string = NULL;
}
- if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
- cERROR(1, "Multiuser mounts currently require krb5 "
- "authentication!");
- return 1;
+ if (!sloppy && invalid) {
+ printk(KERN_ERR "CIFS: Unknown mount option \"%s\"\n", invalid);
+ goto cifs_parse_mount_err;
+ }
+
+#ifndef CONFIG_KEYS
+	/* Multiuser mounts require CONFIG_KEYS support */
+ if (vol->multiuser) {
+ cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n");
+ goto cifs_parse_mount_err;
+ }
+#endif
+ if (!vol->UNC) {
+ cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n");
+ goto cifs_parse_mount_err;
}
- if (vol->UNCip == NULL)
- vol->UNCip = &vol->UNC[2];
+ /* make sure UNC has a share name */
+ if (!strchr(vol->UNC + 3, '\\')) {
+ cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n");
+ goto cifs_parse_mount_err;
+ }
+
+ if (!got_ip) {
+ /* No ip= option specified? Try to get it from UNC */
+ if (!cifs_convert_address(dstaddr, &vol->UNC[2],
+ strlen(&vol->UNC[2]))) {
+ printk(KERN_ERR "Unable to determine destination "
+ "address.\n");
+ goto cifs_parse_mount_err;
+ }
+ }
+
+ /* set the port that we got earlier */
+ cifs_set_port(dstaddr, port);
if (uid_specified)
vol->override_uid = override_uid;
@@ -1435,7 +1882,15 @@ cifs_parse_mount_options(char *options, const char *devname,
printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
"specified with no gid= option.\n");
+ kfree(mountdata_copy);
return 0;
+
+out_nomem:
+ printk(KERN_WARNING "Could not allocate temporary buffer\n");
+cifs_parse_mount_err:
+ kfree(string);
+ kfree(mountdata_copy);
+ return 1;
}
/** Returns true if srcaddr isn't specified and rhs isn't
@@ -1455,7 +1910,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
}
case AF_INET6: {
struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
- struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs;
+ struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
}
default:
@@ -1472,7 +1927,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
static bool
match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
{
- unsigned short int port, *sport;
+ __be16 port, *sport;
switch (addr->sa_family) {
case AF_INET:
@@ -1539,74 +1994,64 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
static bool
match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
{
- unsigned int secFlags;
-
- if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
- secFlags = vol->secFlg;
- else
- secFlags = global_secflags | vol->secFlg;
-
- switch (server->secType) {
- case LANMAN:
- if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
- return false;
- break;
- case NTLMv2:
- if (!(secFlags & CIFSSEC_MAY_NTLMV2))
- return false;
- break;
- case NTLM:
- if (!(secFlags & CIFSSEC_MAY_NTLM))
- return false;
- break;
- case Kerberos:
- if (!(secFlags & CIFSSEC_MAY_KRB5))
- return false;
- break;
- case RawNTLMSSP:
- if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
- return false;
- break;
- default:
- /* shouldn't happen */
+ /*
+ * The select_sectype function should either return the vol->sectype
+ * that was specified, or "Unspecified" if that sectype was not
+ * compatible with the given NEGOTIATE request.
+ */
+ if (select_sectype(server, vol->sectype) == Unspecified)
return false;
- }
- /* now check if signing mode is acceptible */
- if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
- (server->secMode & SECMODE_SIGN_REQUIRED))
- return false;
- else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
- (server->secMode &
- (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
- return false;
+ /*
+ * Now check if signing mode is acceptable. No need to check
+ * global_secflags at this point since if MUST_SIGN is set then
+ * the server->sign had better be too.
+ */
+ if (vol->sign && !server->sign)
+ return false;
return true;
}
+static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
+{
+ struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
+
+ if (vol->nosharesock)
+ return 0;
+
+ if ((server->vals != vol->vals) || (server->ops != vol->ops))
+ return 0;
+
+ if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
+ return 0;
+
+ if (!match_address(server, addr,
+ (struct sockaddr *)&vol->srcaddr))
+ return 0;
+
+ if (!match_port(server, addr))
+ return 0;
+
+ if (!match_security(server, vol))
+ return 0;
+
+ return 1;
+}
+
static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
+cifs_find_tcp_session(struct smb_vol *vol)
{
struct TCP_Server_Info *server;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
- continue;
-
- if (!match_address(server, addr,
- (struct sockaddr *)&vol->srcaddr))
- continue;
-
- if (!match_port(server, addr))
- continue;
-
- if (!match_security(server, vol))
+ if (!match_server(server, vol))
continue;
++server->srv_count;
spin_unlock(&cifs_tcp_ses_lock);
- cFYI(1, "Existing tcp session with server found");
+ cifs_dbg(FYI, "Existing tcp session with server found\n");
return server;
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -1651,40 +2096,12 @@ static struct TCP_Server_Info *
cifs_get_tcp_session(struct smb_vol *volume_info)
{
struct TCP_Server_Info *tcp_ses = NULL;
- struct sockaddr_storage addr;
- struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
- struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
int rc;
- memset(&addr, 0, sizeof(struct sockaddr_storage));
-
- cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
-
- if (volume_info->UNCip && volume_info->UNC) {
- rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
- volume_info->UNCip,
- strlen(volume_info->UNCip),
- volume_info->port);
- if (!rc) {
- /* we failed translating address */
- rc = -EINVAL;
- goto out_err;
- }
- } else if (volume_info->UNCip) {
- /* BB using ip addr as tcp_ses name to connect to the
- DFS root below */
- cERROR(1, "Connecting to DFS root not implemented yet");
- rc = -EINVAL;
- goto out_err;
- } else /* which tcp_sess DFS root would we conect to */ {
- cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
- "unc=//192.168.1.100/public) specified");
- rc = -EINVAL;
- goto out_err;
- }
+ cifs_dbg(FYI, "UNC: %s\n", volume_info->UNC);
/* see if we already have a matching tcp_ses */
- tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info);
+ tcp_ses = cifs_find_tcp_session(volume_info);
if (tcp_ses)
return tcp_ses;
@@ -1694,12 +2111,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
goto out_err;
}
- rc = cifs_crypto_shash_allocate(tcp_ses);
- if (rc) {
- cERROR(1, "could not setup hash structures rc %d", rc);
- goto out_err;
- }
-
+ tcp_ses->ops = volume_info->ops;
+ tcp_ses->vals = volume_info->vals;
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
tcp_ses->hostname = extract_hostname(volume_info->UNC);
if (IS_ERR(tcp_ses->hostname)) {
@@ -1710,7 +2123,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->noblocksnd = volume_info->noblocksnd;
tcp_ses->noautotune = volume_info->noautotune;
tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
- atomic_set(&tcp_ses->inFlight, 0);
+ tcp_ses->in_flight = 0;
+ tcp_ses->credits = 1;
init_waitqueue_head(&tcp_ses->response_q);
init_waitqueue_head(&tcp_ses->request_q);
INIT_LIST_HEAD(&tcp_ses->pending_mid_q);
@@ -1722,33 +2136,28 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
tcp_ses->lstrp = jiffies;
+ spin_lock_init(&tcp_ses->req_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
-
+ memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
+ sizeof(tcp_ses->srcaddr));
+ memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
+ sizeof(tcp_ses->dstaddr));
+#ifdef CONFIG_CIFS_SMB2
+ get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE);
+#endif
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
* no need to spinlock this init of tcpStatus or srv_count
*/
tcp_ses->tcpStatus = CifsNew;
- memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
- sizeof(tcp_ses->srcaddr));
++tcp_ses->srv_count;
- if (addr.ss_family == AF_INET6) {
- cFYI(1, "attempting ipv6 connect");
- /* BB should we allow ipv6 on port 139? */
- /* other OS never observed in Wild doing 139 with v6 */
- memcpy(&tcp_ses->dstaddr, sin_server6,
- sizeof(struct sockaddr_in6));
- } else
- memcpy(&tcp_ses->dstaddr, sin_server,
- sizeof(struct sockaddr_in));
-
rc = ip_connect(tcp_ses);
if (rc < 0) {
- cERROR(1, "Error connecting to socket. Aborting operation");
+ cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n");
goto out_err_crypto_release;
}
@@ -1757,14 +2166,15 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
* this will succeed. No need for try_module_get().
*/
__module_get(THIS_MODULE);
- tcp_ses->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread,
+ tcp_ses->tsk = kthread_run(cifs_demultiplex_thread,
tcp_ses, "cifsd");
if (IS_ERR(tcp_ses->tsk)) {
rc = PTR_ERR(tcp_ses->tsk);
- cERROR(1, "error %d create cifsd thread", rc);
+ cifs_dbg(VFS, "error %d create cifsd thread\n", rc);
module_put(THIS_MODULE);
goto out_err_crypto_release;
}
+ tcp_ses->tcpStatus = CifsNeedNegotiate;
/* thread spawned, put it on the list */
spin_lock(&cifs_tcp_ses_lock);
@@ -1774,7 +2184,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
cifs_fscache_get_client_cookie(tcp_ses);
/* queue echo request delayed work */
- queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
+ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
return tcp_ses;
@@ -1794,30 +2204,51 @@ out_err:
return ERR_PTR(rc);
}
-static struct cifsSesInfo *
+static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
+{
+ if (vol->sectype != Unspecified &&
+ vol->sectype != ses->sectype)
+ return 0;
+
+ switch (ses->sectype) {
+ case Kerberos:
+ if (!uid_eq(vol->cred_uid, ses->cred_uid))
+ return 0;
+ break;
+ default:
+ /* NULL username means anonymous session */
+ if (ses->user_name == NULL) {
+ if (!vol->nullauth)
+ return 0;
+ break;
+ }
+
+ /* anything else takes username/password */
+ if (strncmp(ses->user_name,
+ vol->username ? vol->username : "",
+ CIFS_MAX_USERNAME_LEN))
+ return 0;
+ if ((vol->username && strlen(vol->username) != 0) &&
+ ses->password != NULL &&
+ strncmp(ses->password,
+ vol->password ? vol->password : "",
+ CIFS_MAX_PASSWORD_LEN))
+ return 0;
+ }
+ return 1;
+}
+
+static struct cifs_ses *
cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
{
- struct cifsSesInfo *ses;
+ struct cifs_ses *ses;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- switch (server->secType) {
- case Kerberos:
- if (vol->cred_uid != ses->cred_uid)
- continue;
- break;
- default:
- /* anything else takes username/password */
- if (strncmp(ses->userName, vol->username,
- MAX_USERNAME_SIZE))
- continue;
- if (strlen(vol->username) != 0 &&
- ses->password != NULL &&
- strncmp(ses->password,
- vol->password ? vol->password : "",
- MAX_PASSWORD_SIZE))
- continue;
- }
+ if (ses->status == CifsExiting)
+ continue;
+ if (!match_session(ses, vol))
+ continue;
++ses->ses_count;
spin_unlock(&cifs_tcp_ses_lock);
return ses;
@@ -1827,43 +2258,188 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
}
static void
-cifs_put_smb_ses(struct cifsSesInfo *ses)
+cifs_put_smb_ses(struct cifs_ses *ses)
{
- int xid;
+ unsigned int rc, xid;
struct TCP_Server_Info *server = ses->server;
- cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
+ cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
+
spin_lock(&cifs_tcp_ses_lock);
+ if (ses->status == CifsExiting) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return;
+ }
if (--ses->ses_count > 0) {
spin_unlock(&cifs_tcp_ses_lock);
return;
}
+ if (ses->status == CifsGood)
+ ses->status = CifsExiting;
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ if (ses->status == CifsExiting && server->ops->logoff) {
+ xid = get_xid();
+ rc = server->ops->logoff(xid, ses);
+ if (rc)
+ cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
+ __func__, rc);
+ _free_xid(xid);
+ }
+ spin_lock(&cifs_tcp_ses_lock);
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- if (ses->status == CifsGood) {
- xid = GetXid();
- CIFSSMBLogoff(xid, ses);
- _FreeXid(xid);
- }
sesInfoFree(ses);
cifs_put_tcp_session(server);
}
-static struct cifsSesInfo *
+#ifdef CONFIG_KEYS
+
+/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1)
+
+/* Populate username and pw fields from keyring if possible */
+static int
+cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
+{
+ int rc = 0;
+ char *desc, *delim, *payload;
+ ssize_t len;
+ struct key *key;
+ struct TCP_Server_Info *server = ses->server;
+ struct sockaddr_in *sa;
+ struct sockaddr_in6 *sa6;
+ struct user_key_payload *upayload;
+
+ desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
+ if (!desc)
+ return -ENOMEM;
+
+ /* try to find an address key first */
+ switch (server->dstaddr.ss_family) {
+ case AF_INET:
+ sa = (struct sockaddr_in *)&server->dstaddr;
+ sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr);
+ break;
+ case AF_INET6:
+ sa6 = (struct sockaddr_in6 *)&server->dstaddr;
+ sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
+ break;
+ default:
+ cifs_dbg(FYI, "Bad ss_family (%hu)\n",
+ server->dstaddr.ss_family);
+ rc = -EINVAL;
+ goto out_err;
+ }
+
+ cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
+ key = request_key(&key_type_logon, desc, "");
+ if (IS_ERR(key)) {
+ if (!ses->domainName) {
+ cifs_dbg(FYI, "domainName is NULL\n");
+ rc = PTR_ERR(key);
+ goto out_err;
+ }
+
+ /* didn't work, try to find a domain key */
+ sprintf(desc, "cifs:d:%s", ses->domainName);
+ cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
+ key = request_key(&key_type_logon, desc, "");
+ if (IS_ERR(key)) {
+ rc = PTR_ERR(key);
+ goto out_err;
+ }
+ }
+
+ down_read(&key->sem);
+ upayload = key->payload.data;
+ if (IS_ERR_OR_NULL(upayload)) {
+ rc = upayload ? PTR_ERR(upayload) : -EINVAL;
+ goto out_key_put;
+ }
+
+ /* find first : in payload */
+ payload = (char *)upayload->data;
+ delim = strnchr(payload, upayload->datalen, ':');
+ cifs_dbg(FYI, "payload=%s\n", payload);
+ if (!delim) {
+ cifs_dbg(FYI, "Unable to find ':' in payload (datalen=%d)\n",
+ upayload->datalen);
+ rc = -EINVAL;
+ goto out_key_put;
+ }
+
+ len = delim - payload;
+ if (len > CIFS_MAX_USERNAME_LEN || len <= 0) {
+ cifs_dbg(FYI, "Bad value from username search (len=%zd)\n",
+ len);
+ rc = -EINVAL;
+ goto out_key_put;
+ }
+
+ vol->username = kstrndup(payload, len, GFP_KERNEL);
+ if (!vol->username) {
+ cifs_dbg(FYI, "Unable to allocate %zd bytes for username\n",
+ len);
+ rc = -ENOMEM;
+ goto out_key_put;
+ }
+ cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username);
+
+ len = key->datalen - (len + 1);
+ if (len > CIFS_MAX_PASSWORD_LEN || len <= 0) {
+ cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len);
+ rc = -EINVAL;
+ kfree(vol->username);
+ vol->username = NULL;
+ goto out_key_put;
+ }
+
+ ++delim;
+ vol->password = kstrndup(delim, len, GFP_KERNEL);
+ if (!vol->password) {
+ cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n",
+ len);
+ rc = -ENOMEM;
+ kfree(vol->username);
+ vol->username = NULL;
+ goto out_key_put;
+ }
+
+out_key_put:
+ up_read(&key->sem);
+ key_put(key);
+out_err:
+ kfree(desc);
+ cifs_dbg(FYI, "%s: returning %d\n", __func__, rc);
+ return rc;
+}
+#else /* ! CONFIG_KEYS */
+static inline int
+cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
+ struct cifs_ses *ses __attribute__((unused)))
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_KEYS */
+
+static struct cifs_ses *
cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
{
- int rc = -ENOMEM, xid;
- struct cifsSesInfo *ses;
+ int rc = -ENOMEM;
+ unsigned int xid;
+ struct cifs_ses *ses;
struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
- xid = GetXid();
+ xid = get_xid();
ses = cifs_find_smb_ses(server, volume_info);
if (ses) {
- cFYI(1, "Existing smb sess found (status=%d)", ses->status);
+ cifs_dbg(FYI, "Existing smb sess found (status=%d)\n",
+ ses->status);
mutex_lock(&ses->session_mutex);
rc = cifs_negotiate_protocol(xid, ses);
@@ -1871,18 +2447,18 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
mutex_unlock(&ses->session_mutex);
/* problem -- put our ses reference */
cifs_put_smb_ses(ses);
- FreeXid(xid);
+ free_xid(xid);
return ERR_PTR(rc);
}
if (ses->need_reconnect) {
- cFYI(1, "Session needs reconnect");
+ cifs_dbg(FYI, "Session needs reconnect\n");
rc = cifs_setup_session(xid, ses,
volume_info->local_nls);
if (rc) {
mutex_unlock(&ses->session_mutex);
/* problem -- put our reference */
cifs_put_smb_ses(ses);
- FreeXid(xid);
+ free_xid(xid);
return ERR_PTR(rc);
}
}
@@ -1890,11 +2466,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
/* existing SMB ses has a server reference already */
cifs_put_tcp_session(server);
- FreeXid(xid);
+ free_xid(xid);
return ses;
}
- cFYI(1, "Existing smb sess not found");
+ cifs_dbg(FYI, "Existing smb sess not found\n");
ses = sesInfoAlloc();
if (ses == NULL)
goto get_ses_fail;
@@ -1906,9 +2482,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
else
sprintf(ses->serverName, "%pI4", &addr->sin_addr);
- if (volume_info->username)
- strncpy(ses->userName, volume_info->username,
- MAX_USERNAME_SIZE);
+ if (volume_info->username) {
+ ses->user_name = kstrdup(volume_info->username, GFP_KERNEL);
+ if (!ses->user_name)
+ goto get_ses_fail;
+ }
/* volume_info->password freed at unmount */
if (volume_info->password) {
@@ -1923,7 +2501,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
}
ses->cred_uid = volume_info->cred_uid;
ses->linux_uid = volume_info->linux_uid;
- ses->overrideSecFlg = volume_info->secFlg;
+
+ ses->sectype = volume_info->sectype;
+ ses->sign = volume_info->sign;
mutex_lock(&ses->session_mutex);
rc = cifs_negotiate_protocol(xid, ses);
@@ -1938,29 +2518,35 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- FreeXid(xid);
+ free_xid(xid);
return ses;
get_ses_fail:
sesInfoFree(ses);
- FreeXid(xid);
+ free_xid(xid);
return ERR_PTR(rc);
}
-static struct cifsTconInfo *
-cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
+static int match_tcon(struct cifs_tcon *tcon, const char *unc)
+{
+ if (tcon->tidStatus == CifsExiting)
+ return 0;
+ if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
+ return 0;
+ return 1;
+}
+
+static struct cifs_tcon *
+cifs_find_tcon(struct cifs_ses *ses, const char *unc)
{
struct list_head *tmp;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &ses->tcon_list) {
- tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
- if (tcon->tidStatus == CifsExiting)
+ tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
+ if (!match_tcon(tcon, unc))
continue;
- if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
- continue;
-
++tcon->tc_count;
spin_unlock(&cifs_tcp_ses_lock);
return tcon;
@@ -1970,12 +2556,12 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
}
static void
-cifs_put_tcon(struct cifsTconInfo *tcon)
+cifs_put_tcon(struct cifs_tcon *tcon)
{
- int xid;
- struct cifsSesInfo *ses = tcon->ses;
+ unsigned int xid;
+ struct cifs_ses *ses = tcon->ses;
- cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
+ cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
if (--tcon->tc_count > 0) {
spin_unlock(&cifs_tcp_ses_lock);
@@ -1985,32 +2571,37 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
list_del_init(&tcon->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
- xid = GetXid();
- CIFSSMBTDis(xid, tcon);
- _FreeXid(xid);
+ xid = get_xid();
+ if (ses->server->ops->tree_disconnect)
+ ses->server->ops->tree_disconnect(xid, tcon);
+ _free_xid(xid);
cifs_fscache_release_super_cookie(tcon);
tconInfoFree(tcon);
cifs_put_smb_ses(ses);
}
-static struct cifsTconInfo *
-cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
+static struct cifs_tcon *
+cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
{
int rc, xid;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
tcon = cifs_find_tcon(ses, volume_info->UNC);
if (tcon) {
- cFYI(1, "Found match on UNC path");
+ cifs_dbg(FYI, "Found match on UNC path\n");
/* existing tcon already has a reference */
cifs_put_smb_ses(ses);
if (tcon->seal != volume_info->seal)
- cERROR(1, "transport encryption setting "
- "conflicts with existing tid");
+ cifs_dbg(VFS, "transport encryption setting conflicts with existing tid\n");
return tcon;
}
+ if (!ses->server->ops->tree_connect) {
+ rc = -ENOSYS;
+ goto out_fail;
+ }
+
tcon = tconInfoAlloc();
if (tcon == NULL) {
rc = -ENOMEM;
@@ -2026,35 +2617,32 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
}
}
- if (strchr(volume_info->UNC + 3, '\\') == NULL
- && strchr(volume_info->UNC + 3, '/') == NULL) {
- cERROR(1, "Missing share name");
- rc = -ENODEV;
- goto out_fail;
- }
-
- /* BB Do we need to wrap session_mutex around
- * this TCon call and Unix SetFS as
- * we do on SessSetup and reconnect? */
- xid = GetXid();
- rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls);
- FreeXid(xid);
- cFYI(1, "CIFS Tcon rc = %d", rc);
+ /*
+ * BB Do we need to wrap session_mutex around this TCon call and Unix
+ * SetFS as we do on SessSetup and reconnect?
+ */
+ xid = get_xid();
+ rc = ses->server->ops->tree_connect(xid, ses, volume_info->UNC, tcon,
+ volume_info->local_nls);
+ free_xid(xid);
+ cifs_dbg(FYI, "Tcon rc = %d\n", rc);
if (rc)
goto out_fail;
if (volume_info->nodfs) {
tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
- cFYI(1, "DFS disabled (%d)", tcon->Flags);
+ cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
}
tcon->seal = volume_info->seal;
- /* we can have only one retry value for a connection
- to a share so for resources mounted more than once
- to the same server share the last value passed in
- for the retry flag is used */
+ /*
+ * We can have only one retry value for a connection to a share so for
+ * resources mounted more than once to the same server share the last
+ * value passed in for the retry flag is used.
+ */
tcon->retry = volume_info->retry;
tcon->nocase = volume_info->nocase;
tcon->local_lease = volume_info->local_lease;
+ INIT_LIST_HEAD(&tcon->pending_opens);
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcon->tcon_list, &ses->tcon_list);
@@ -2087,38 +2675,127 @@ cifs_put_tlink(struct tcon_link *tlink)
return;
}
+static inline struct tcon_link *
+cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
+{
+ return cifs_sb->master_tlink;
+}
+
+static int
+compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
+{
+ struct cifs_sb_info *old = CIFS_SB(sb);
+ struct cifs_sb_info *new = mnt_data->cifs_sb;
+
+ if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK))
+ return 0;
+
+ if ((old->mnt_cifs_flags & CIFS_MOUNT_MASK) !=
+ (new->mnt_cifs_flags & CIFS_MOUNT_MASK))
+ return 0;
+
+ /*
+	 * We want to share the sb only if we don't specify an r/wsize or the
+	 * specified r/wsize is greater than or equal to the existing one.
+ */
+ if (new->wsize && new->wsize < old->wsize)
+ return 0;
+
+ if (new->rsize && new->rsize < old->rsize)
+ return 0;
+
+ if (!uid_eq(old->mnt_uid, new->mnt_uid) || !gid_eq(old->mnt_gid, new->mnt_gid))
+ return 0;
+
+ if (old->mnt_file_mode != new->mnt_file_mode ||
+ old->mnt_dir_mode != new->mnt_dir_mode)
+ return 0;
+
+ if (strcmp(old->local_nls->charset, new->local_nls->charset))
+ return 0;
+
+ if (old->actimeo != new->actimeo)
+ return 0;
+
+ return 1;
+}
+
+int
+cifs_match_super(struct super_block *sb, void *data)
+{
+ struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data;
+ struct smb_vol *volume_info;
+ struct cifs_sb_info *cifs_sb;
+ struct TCP_Server_Info *tcp_srv;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct tcon_link *tlink;
+ int rc = 0;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ cifs_sb = CIFS_SB(sb);
+ tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
+ if (IS_ERR(tlink)) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return rc;
+ }
+ tcon = tlink_tcon(tlink);
+ ses = tcon->ses;
+ tcp_srv = ses->server;
+
+ volume_info = mnt_data->vol;
+
+ if (!match_server(tcp_srv, volume_info) ||
+ !match_session(ses, volume_info) ||
+ !match_tcon(tcon, volume_info->UNC)) {
+ rc = 0;
+ goto out;
+ }
+
+ rc = compare_mount_options(sb, mnt_data);
+out:
+ spin_unlock(&cifs_tcp_ses_lock);
+ cifs_put_tlink(tlink);
+ return rc;
+}
+
int
-get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
- const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
- struct dfs_info3_param **preferrals, int remap)
+get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path,
+ const struct nls_table *nls_codepage, unsigned int *num_referrals,
+ struct dfs_info3_param **referrals, int remap)
{
char *temp_unc;
int rc = 0;
- *pnum_referrals = 0;
- *preferrals = NULL;
+ if (!ses->server->ops->tree_connect || !ses->server->ops->get_dfs_refer)
+ return -ENOSYS;
+
+ *num_referrals = 0;
+ *referrals = NULL;
- if (pSesInfo->ipc_tid == 0) {
+ if (ses->ipc_tid == 0) {
temp_unc = kmalloc(2 /* for slashes */ +
- strnlen(pSesInfo->serverName,
- SERVER_NAME_LEN_WITH_NULL * 2)
- + 1 + 4 /* slash IPC$ */ + 2,
- GFP_KERNEL);
+ strnlen(ses->serverName, SERVER_NAME_LEN_WITH_NULL * 2)
+ + 1 + 4 /* slash IPC$ */ + 2, GFP_KERNEL);
if (temp_unc == NULL)
return -ENOMEM;
temp_unc[0] = '\\';
temp_unc[1] = '\\';
- strcpy(temp_unc + 2, pSesInfo->serverName);
- strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$");
- rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage);
- cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid);
+ strcpy(temp_unc + 2, ses->serverName);
+ strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$");
+ rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL,
+ nls_codepage);
+ cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid);
kfree(temp_unc);
}
if (rc == 0)
- rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, preferrals,
- pnum_referrals, nls_codepage, remap);
- /* BB map targetUNCs to dfs_info3 structures, here or
- in CIFSGetDFSRefer BB */
+ rc = ses->server->ops->get_dfs_refer(xid, ses, old_path,
+ referrals, num_referrals,
+ nls_codepage, remap);
+ /*
+ * BB - map targetUNCs to dfs_info3 structures, here or in
+ * ses->server->ops->get_dfs_refer.
+ */
return rc;
}
@@ -2186,13 +2863,11 @@ bind_socket(struct TCP_Server_Info *server)
saddr4 = (struct sockaddr_in *)&server->srcaddr;
saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
if (saddr6->sin6_family == AF_INET6)
- cERROR(1, "cifs: "
- "Failed to bind to: %pI6c, error: %d\n",
- &saddr6->sin6_addr, rc);
+ cifs_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n",
+ &saddr6->sin6_addr, rc);
else
- cERROR(1, "cifs: "
- "Failed to bind to: %pI4, error: %d\n",
- &saddr4->sin_addr.s_addr, rc);
+ cifs_dbg(VFS, "Failed to bind to: %pI4, error: %d\n",
+ &saddr4->sin_addr.s_addr, rc);
}
}
return rc;
@@ -2249,7 +2924,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
smb_buf = (struct smb_hdr *)ses_init_buf;
/* sizeof RFC1002_SESSION_REQUEST with no scope */
- smb_buf->smb_buf_length = 0x81000044;
+ smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
rc = smb_send(server, smb_buf, 0x44);
kfree(ses_init_buf);
/*
@@ -2276,7 +2951,7 @@ static int
generic_ip_connect(struct TCP_Server_Info *server)
{
int rc = 0;
- unsigned short int sport;
+ __be16 sport;
int slen, sfamily;
struct socket *socket = server->ssocket;
struct sockaddr *saddr;
@@ -2297,13 +2972,13 @@ generic_ip_connect(struct TCP_Server_Info *server)
rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
IPPROTO_TCP, &socket, 1);
if (rc < 0) {
- cERROR(1, "Error %d creating socket", rc);
+ cifs_dbg(VFS, "Error %d creating socket\n", rc);
server->ssocket = NULL;
return rc;
}
/* BB other socket options to set KEEPALIVE, NODELAY? */
- cFYI(1, "Socket created");
+ cifs_dbg(FYI, "Socket created\n");
server->ssocket = socket;
socket->sk->sk_allocation = GFP_NOFS;
if (sfamily == AF_INET6)
@@ -2316,14 +2991,6 @@ generic_ip_connect(struct TCP_Server_Info *server)
if (rc < 0)
return rc;
- rc = socket->ops->connect(socket, saddr, slen, 0);
- if (rc < 0) {
- cFYI(1, "Error %d connecting to server", rc);
- sock_release(socket);
- server->ssocket = NULL;
- return rc;
- }
-
/*
* Eventually check for other socket options to change from
* the default. sock_setsockopt not used because it expects
@@ -2345,13 +3012,22 @@ generic_ip_connect(struct TCP_Server_Info *server)
rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
(char *)&val, sizeof(val));
if (rc)
- cFYI(1, "set TCP_NODELAY socket option error %d", rc);
+ cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n",
+ rc);
}
- cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
+ cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n",
socket->sk->sk_sndbuf,
socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
+ rc = socket->ops->connect(socket, saddr, slen, 0);
+ if (rc < 0) {
+ cifs_dbg(FYI, "Error %d connecting to server\n", rc);
+ sock_release(socket);
+ server->ssocket = NULL;
+ return rc;
+ }
+
if (sport == htons(RFC1001_PORT))
rc = ip_rfc1001_connect(server);
@@ -2361,7 +3037,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
static int
ip_connect(struct TCP_Server_Info *server)
{
- unsigned short int *sport;
+ __be16 *sport;
struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
@@ -2387,8 +3063,8 @@ ip_connect(struct TCP_Server_Info *server)
return generic_ip_connect(server);
}
-void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
- struct super_block *sb, struct smb_vol *vol_info)
+void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, struct smb_vol *vol_info)
{
/* if we are reconnecting then should we check to see if
* any requested capabilities changed locally e.g. via
@@ -2404,19 +3080,19 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
if (vol_info && vol_info->no_linux_ext) {
tcon->fsUnixInfo.Capability = 0;
tcon->unix_ext = 0; /* Unix Extensions disabled */
- cFYI(1, "Linux protocol extensions disabled");
+ cifs_dbg(FYI, "Linux protocol extensions disabled\n");
return;
} else if (vol_info)
tcon->unix_ext = 1; /* Unix Extensions supported */
if (tcon->unix_ext == 0) {
- cFYI(1, "Unix extensions disabled so not set on reconnect");
+ cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n");
return;
}
if (!CIFSSMBQFSUnixInfo(xid, tcon)) {
__u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
-
+ cifs_dbg(FYI, "unix caps which server supports %lld\n", cap);
/* check for reconnect case in which we do not
want to change the mount behavior if we can avoid it */
if (vol_info == NULL) {
@@ -2426,155 +3102,91 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
- cERROR(1, "POSIXPATH support change");
+ cifs_dbg(VFS, "POSIXPATH support change\n");
cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
} else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
- cERROR(1, "possible reconnect error");
- cERROR(1, "server disabled POSIX path support");
+ cifs_dbg(VFS, "possible reconnect error\n");
+ cifs_dbg(VFS, "server disabled POSIX path support\n");
}
}
+ if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
+ cifs_dbg(VFS, "per-share encryption not supported yet\n");
+
cap &= CIFS_UNIX_CAP_MASK;
if (vol_info && vol_info->no_psx_acl)
cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
- cFYI(1, "negotiated posix acl support");
- if (sb)
- sb->s_flags |= MS_POSIXACL;
+ cifs_dbg(FYI, "negotiated posix acl support\n");
+ if (cifs_sb)
+ cifs_sb->mnt_cifs_flags |=
+ CIFS_MOUNT_POSIXACL;
}
if (vol_info && vol_info->posix_paths == 0)
cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
- cFYI(1, "negotiate posix pathnames");
- if (sb)
- CIFS_SB(sb)->mnt_cifs_flags |=
+ cifs_dbg(FYI, "negotiate posix pathnames\n");
+ if (cifs_sb)
+ cifs_sb->mnt_cifs_flags |=
CIFS_MOUNT_POSIX_PATHS;
}
- /* We might be setting the path sep back to a different
- form if we are reconnecting and the server switched its
- posix path capability for this share */
- if (sb && (CIFS_SB(sb)->prepathlen > 0))
- CIFS_SB(sb)->prepath[0] = CIFS_DIR_SEP(CIFS_SB(sb));
-
- if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
- if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
- CIFS_SB(sb)->rsize = 127 * 1024;
- cFYI(DBG2, "larger reads not supported by srv");
- }
- }
-
-
- cFYI(1, "Negotiate caps 0x%x", (int)cap);
+ cifs_dbg(FYI, "Negotiate caps 0x%x\n", (int)cap);
#ifdef CONFIG_CIFS_DEBUG2
if (cap & CIFS_UNIX_FCNTL_CAP)
- cFYI(1, "FCNTL cap");
+ cifs_dbg(FYI, "FCNTL cap\n");
if (cap & CIFS_UNIX_EXTATTR_CAP)
- cFYI(1, "EXTATTR cap");
+ cifs_dbg(FYI, "EXTATTR cap\n");
if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
- cFYI(1, "POSIX path cap");
+ cifs_dbg(FYI, "POSIX path cap\n");
if (cap & CIFS_UNIX_XATTR_CAP)
- cFYI(1, "XATTR cap");
+ cifs_dbg(FYI, "XATTR cap\n");
if (cap & CIFS_UNIX_POSIX_ACL_CAP)
- cFYI(1, "POSIX ACL cap");
+ cifs_dbg(FYI, "POSIX ACL cap\n");
if (cap & CIFS_UNIX_LARGE_READ_CAP)
- cFYI(1, "very large read cap");
+ cifs_dbg(FYI, "very large read cap\n");
if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
- cFYI(1, "very large write cap");
+ cifs_dbg(FYI, "very large write cap\n");
+ if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP)
+ cifs_dbg(FYI, "transport encryption cap\n");
+ if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
+ cifs_dbg(FYI, "mandatory transport encryption cap\n");
#endif /* CIFS_DEBUG2 */
if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
if (vol_info == NULL) {
- cFYI(1, "resetting capabilities failed");
+ cifs_dbg(FYI, "resetting capabilities failed\n");
} else
- cERROR(1, "Negotiating Unix capabilities "
- "with the server failed. Consider "
- "mounting with the Unix Extensions\n"
- "disabled, if problems are found, "
- "by specifying the nounix mount "
- "option.");
+ cifs_dbg(VFS, "Negotiating Unix capabilities with the server failed. Consider mounting with the Unix Extensions disabled if problems are found by specifying the nounix mount option.\n");
}
}
}
-static void
-convert_delimiter(char *path, char delim)
+void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+ struct cifs_sb_info *cifs_sb)
{
- int i;
- char old_delim;
-
- if (path == NULL)
- return;
-
- if (delim == '/')
- old_delim = '\\';
- else
- old_delim = '/';
+ INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
- for (i = 0; path[i] != '\0'; i++) {
- if (path[i] == old_delim)
- path[i] = delim;
- }
-}
+ spin_lock_init(&cifs_sb->tlink_tree_lock);
+ cifs_sb->tlink_tree = RB_ROOT;
-static void setup_cifs_sb(struct smb_vol *pvolume_info,
- struct cifs_sb_info *cifs_sb)
-{
- INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
+ /*
+ * Temporarily set r/wsize for matching superblock. If we end up using
+ * new sb then client will later negotiate it downward if needed.
+ */
+ cifs_sb->rsize = pvolume_info->rsize;
+ cifs_sb->wsize = pvolume_info->wsize;
- if (pvolume_info->rsize > CIFSMaxBufSize) {
- cERROR(1, "rsize %d too large, using MaxBufSize",
- pvolume_info->rsize);
- cifs_sb->rsize = CIFSMaxBufSize;
- } else if ((pvolume_info->rsize) &&
- (pvolume_info->rsize <= CIFSMaxBufSize))
- cifs_sb->rsize = pvolume_info->rsize;
- else /* default */
- cifs_sb->rsize = CIFSMaxBufSize;
-
- if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
- cERROR(1, "wsize %d too large, using 4096 instead",
- pvolume_info->wsize);
- cifs_sb->wsize = 4096;
- } else if (pvolume_info->wsize)
- cifs_sb->wsize = pvolume_info->wsize;
- else
- cifs_sb->wsize = min_t(const int,
- PAGEVEC_SIZE * PAGE_CACHE_SIZE,
- 127*1024);
- /* old default of CIFSMaxBufSize was too small now
- that SMB Write2 can send multiple pages in kvec.
- RFC1001 does not describe what happens when frame
- bigger than 128K is sent so use that as max in
- conjunction with 52K kvec constraint on arch with 4K
- page size */
-
- if (cifs_sb->rsize < 2048) {
- cifs_sb->rsize = 2048;
- /* Windows ME may prefer this */
- cFYI(1, "readsize set to minimum: 2048");
- }
- /* calculate prepath */
- cifs_sb->prepath = pvolume_info->prepath;
- if (cifs_sb->prepath) {
- cifs_sb->prepathlen = strlen(cifs_sb->prepath);
- /* we can not convert the / to \ in the path
- separators in the prefixpath yet because we do not
- know (until reset_cifs_unix_caps is called later)
- whether POSIX PATH CAP is available. We normalize
- the / to \ after reset_cifs_unix_caps is called */
- pvolume_info->prepath = NULL;
- } else
- cifs_sb->prepathlen = 0;
cifs_sb->mnt_uid = pvolume_info->linux_uid;
cifs_sb->mnt_gid = pvolume_info->linux_gid;
cifs_sb->mnt_file_mode = pvolume_info->file_mode;
cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
- cFYI(1, "file mode: 0x%x dir mode: 0x%x",
- cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
+ cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n",
+ cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
cifs_sb->actimeo = pvolume_info->actimeo;
+ cifs_sb->local_nls = pvolume_info->local_nls;
if (pvolume_info->noperm)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2594,8 +3206,18 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
if (pvolume_info->mand_lock)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
+ if (pvolume_info->rwpidforward)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
if (pvolume_info->cifs_acl)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
+ if (pvolume_info->backupuid_specified) {
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID;
+ cifs_sb->mnt_backupuid = pvolume_info->backupuid;
+ }
+ if (pvolume_info->backupgid_specified) {
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID;
+ cifs_sb->mnt_backupgid = pvolume_info->backupgid;
+ }
if (pvolume_info->override_uid)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
if (pvolume_info->override_gid)
@@ -2610,142 +3232,152 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
if (pvolume_info->strict_io)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
if (pvolume_info->direct_io) {
- cFYI(1, "mounting share using direct i/o");
+ cifs_dbg(FYI, "mounting share using direct i/o\n");
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
}
if (pvolume_info->mfsymlinks) {
if (pvolume_info->sfu_emul) {
- cERROR(1, "mount option mfsymlinks ignored if sfu "
- "mount option is used");
+ cifs_dbg(VFS, "mount option mfsymlinks ignored if sfu mount option is used\n");
} else {
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
}
}
if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
- cERROR(1, "mount option dynperm ignored if cifsacl "
- "mount option supported");
-}
-
-static int
-is_path_accessible(int xid, struct cifsTconInfo *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path)
-{
- int rc;
- FILE_ALL_INFO *pfile_info;
-
- pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (pfile_info == NULL)
- return -ENOMEM;
-
- rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info,
- 0 /* not legacy */, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- kfree(pfile_info);
- return rc;
+ cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n");
}
static void
-cleanup_volume_info(struct smb_vol **pvolume_info)
+cleanup_volume_info_contents(struct smb_vol *volume_info)
{
- struct smb_vol *volume_info;
-
- if (!pvolume_info || !*pvolume_info)
- return;
-
- volume_info = *pvolume_info;
+ kfree(volume_info->username);
kzfree(volume_info->password);
kfree(volume_info->UNC);
+ kfree(volume_info->domainname);
+ kfree(volume_info->iocharset);
kfree(volume_info->prepath);
+}
+
+void
+cifs_cleanup_volume_info(struct smb_vol *volume_info)
+{
+ if (!volume_info)
+ return;
+ cleanup_volume_info_contents(volume_info);
kfree(volume_info);
- *pvolume_info = NULL;
- return;
}
+
#ifdef CONFIG_CIFS_DFS_UPCALL
-/* build_path_to_root returns full path to root when
- * we do not have an exiting connection (tcon) */
+/*
+ * cifs_build_path_to_root returns full path to root when we do not have an
+ * existing connection (tcon)
+ */
static char *
-build_unc_path_to_root(const struct smb_vol *volume_info,
+build_unc_path_to_root(const struct smb_vol *vol,
const struct cifs_sb_info *cifs_sb)
{
- char *full_path;
+ char *full_path, *pos;
+ unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
+ unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
- int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1);
- full_path = kmalloc(unc_len + cifs_sb->prepathlen + 1, GFP_KERNEL);
+ full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
if (full_path == NULL)
return ERR_PTR(-ENOMEM);
- strncpy(full_path, volume_info->UNC, unc_len);
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
- int i;
- for (i = 0; i < unc_len; i++) {
- if (full_path[i] == '\\')
- full_path[i] = '/';
- }
- }
+ strncpy(full_path, vol->UNC, unc_len);
+ pos = full_path + unc_len;
- if (cifs_sb->prepathlen)
- strncpy(full_path + unc_len, cifs_sb->prepath,
- cifs_sb->prepathlen);
+ if (pplen) {
+ *pos = CIFS_DIR_SEP(cifs_sb);
+ strncpy(pos + 1, vol->prepath, pplen);
+ pos += pplen;
+ }
- full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */
+ *pos = '\0'; /* add trailing null */
+ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
+ cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path);
return full_path;
}
-#endif
-int
-cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
- char *mount_data_global, const char *devname)
+/*
+ * Perform a dfs referral query for a share and (optionally) prefix
+ *
+ * If a referral is found, cifs_sb->mountdata will be (re-)allocated
+ * to a string containing updated options for the submount. Otherwise it
+ * will be left untouched.
+ *
+ * Returns the rc from get_dfs_path to the caller, which can be used to
+ * determine whether there were referrals.
+ */
+static int
+expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
+ struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb,
+ int check_prefix)
{
int rc;
- int xid;
- struct smb_vol *volume_info;
- struct cifsSesInfo *pSesInfo;
- struct cifsTconInfo *tcon;
- struct TCP_Server_Info *srvTcp;
- char *full_path;
- char *mount_data = mount_data_global;
- struct tcon_link *tlink;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- struct dfs_info3_param *referrals = NULL;
unsigned int num_referrals = 0;
- int referral_walks_count = 0;
-try_mount_again:
-#endif
- rc = 0;
- tcon = NULL;
- pSesInfo = NULL;
- srvTcp = NULL;
- full_path = NULL;
- tlink = NULL;
+ struct dfs_info3_param *referrals = NULL;
+ char *full_path = NULL, *ref_path = NULL, *mdata = NULL;
- xid = GetXid();
+ full_path = build_unc_path_to_root(volume_info, cifs_sb);
+ if (IS_ERR(full_path))
+ return PTR_ERR(full_path);
- volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL);
- if (!volume_info) {
- rc = -ENOMEM;
- goto out;
- }
+ /* For DFS paths, skip the first '\' of the UNC */
+ ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
- if (cifs_parse_mount_options(mount_data, devname, volume_info)) {
- rc = -EINVAL;
- goto out;
+ rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls,
+ &num_referrals, &referrals,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+ if (!rc && num_referrals > 0) {
+ char *fake_devname = NULL;
+
+ mdata = cifs_compose_mount_options(cifs_sb->mountdata,
+ full_path + 1, referrals,
+ &fake_devname);
+
+ free_dfs_info_array(referrals, num_referrals);
+
+ if (IS_ERR(mdata)) {
+ rc = PTR_ERR(mdata);
+ mdata = NULL;
+ } else {
+ cleanup_volume_info_contents(volume_info);
+ rc = cifs_setup_volume_info(volume_info, mdata,
+ fake_devname);
+ }
+ kfree(fake_devname);
+ kfree(cifs_sb->mountdata);
+ cifs_sb->mountdata = mdata;
}
+ kfree(full_path);
+ return rc;
+}
+#endif
+
+static int
+cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
+ const char *devname)
+{
+ int rc = 0;
+
+ if (cifs_parse_mount_options(mount_data, devname, volume_info))
+ return -EINVAL;
if (volume_info->nullauth) {
- cFYI(1, "null user");
- volume_info->username = "";
+ cifs_dbg(FYI, "Anonymous login\n");
+ kfree(volume_info->username);
+ volume_info->username = NULL;
} else if (volume_info->username) {
/* BB fixme parse for domain name here */
- cFYI(1, "Username: %s", volume_info->username);
+ cifs_dbg(FYI, "Username: %s\n", volume_info->username);
} else {
- cifserror("No username specified");
+ cifs_dbg(VFS, "No username specified\n");
/* In userspace mount helper we can get user name from alternate
locations such as env variables and files on disk */
- rc = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* this is needed for ASCII cp to Unicode converts */
@@ -2755,85 +3387,156 @@ try_mount_again:
} else {
volume_info->local_nls = load_nls(volume_info->iocharset);
if (volume_info->local_nls == NULL) {
- cERROR(1, "CIFS mount error: iocharset %s not found",
+ cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n",
volume_info->iocharset);
- rc = -ELIBACC;
- goto out;
+ return -ELIBACC;
}
}
- cifs_sb->local_nls = volume_info->local_nls;
+
+ return rc;
+}
+
+struct smb_vol *
+cifs_get_volume_info(char *mount_data, const char *devname)
+{
+ int rc;
+ struct smb_vol *volume_info;
+
+ volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL);
+ if (!volume_info)
+ return ERR_PTR(-ENOMEM);
+
+ rc = cifs_setup_volume_info(volume_info, mount_data, devname);
+ if (rc) {
+ cifs_cleanup_volume_info(volume_info);
+ volume_info = ERR_PTR(rc);
+ }
+
+ return volume_info;
+}
+
+int
+cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
+{
+ int rc;
+ unsigned int xid;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ char *full_path;
+ struct tcon_link *tlink;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ int referral_walks_count = 0;
+#endif
+
+ rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
+ if (rc)
+ return rc;
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+try_mount_again:
+ /* cleanup activities if we're chasing a referral */
+ if (referral_walks_count) {
+ if (tcon)
+ cifs_put_tcon(tcon);
+ else if (ses)
+ cifs_put_smb_ses(ses);
+
+ free_xid(xid);
+ }
+#endif
+ rc = 0;
+ tcon = NULL;
+ ses = NULL;
+ server = NULL;
+ full_path = NULL;
+ tlink = NULL;
+
+ xid = get_xid();
/* get a reference to a tcp session */
- srvTcp = cifs_get_tcp_session(volume_info);
- if (IS_ERR(srvTcp)) {
- rc = PTR_ERR(srvTcp);
+ server = cifs_get_tcp_session(volume_info);
+ if (IS_ERR(server)) {
+ rc = PTR_ERR(server);
+ bdi_destroy(&cifs_sb->bdi);
goto out;
}
/* get a reference to a SMB session */
- pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
- if (IS_ERR(pSesInfo)) {
- rc = PTR_ERR(pSesInfo);
- pSesInfo = NULL;
+ ses = cifs_get_smb_ses(server, volume_info);
+ if (IS_ERR(ses)) {
+ rc = PTR_ERR(ses);
+ ses = NULL;
goto mount_fail_check;
}
- setup_cifs_sb(volume_info, cifs_sb);
- if (pSesInfo->capabilities & CAP_LARGE_FILES)
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- else
- sb->s_maxbytes = MAX_NON_LFS;
-
- /* BB FIXME fix time_gran to be larger for LANMAN sessions */
- sb->s_time_gran = 100;
-
/* search for existing tcon to this server share */
- tcon = cifs_get_tcon(pSesInfo, volume_info);
+ tcon = cifs_get_tcon(ses, volume_info);
if (IS_ERR(tcon)) {
rc = PTR_ERR(tcon);
tcon = NULL;
goto remote_path_check;
}
- /* do not care if following two calls succeed - informational */
- if (!tcon->ipc) {
- CIFSSMBQFSDeviceInfo(xid, tcon);
- CIFSSMBQFSAttributeInfo(xid, tcon);
- }
-
/* tell server which Unix caps we support */
- if (tcon->ses->capabilities & CAP_UNIX)
+ if (cap_unix(tcon->ses)) {
/* reset of caps checks mount to see if unix extensions
disabled for just this mount */
- reset_cifs_unix_caps(xid, tcon, sb, volume_info);
- else
+ reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info);
+ if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
+ (le64_to_cpu(tcon->fsUnixInfo.Capability) &
+ CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
+ rc = -EACCES;
+ goto mount_fail_check;
+ }
+ } else
tcon->unix_ext = 0; /* server does not support them */
- /* convert forward to back slashes in prepath here if needed */
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
- convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
+ /* do not care if a following call succeed - informational */
+ if (!tcon->ipc && server->ops->qfs_tcon)
+ server->ops->qfs_tcon(xid, tcon);
- if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
- cifs_sb->rsize = 1024 * 127;
- cFYI(DBG2, "no very large read support, rsize now 127K");
- }
- if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
- cifs_sb->wsize = min(cifs_sb->wsize,
- (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
- if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
- cifs_sb->rsize = min(cifs_sb->rsize,
- (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+ cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
+ cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
+
+ /* tune readahead according to rsize */
+ cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
remote_path_check:
- /* check if a whole path (including prepath) is not remote */
- if (!rc && cifs_sb->prepathlen && tcon) {
- /* build_path_to_root works only when we have a valid tcon */
- full_path = cifs_build_path_to_root(cifs_sb, tcon);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ /*
+ * Perform an unconditional check for whether there are DFS
+ * referrals for this path without prefix, to provide support
+ * for DFS referrals from w2k8 servers which don't seem to respond
+ * with PATH_NOT_COVERED to requests that include the prefix.
+ * Chase the referral if found, otherwise continue normally.
+ */
+ if (referral_walks_count == 0) {
+ int refrc = expand_dfs_referral(xid, ses, volume_info, cifs_sb,
+ false);
+ if (!refrc) {
+ referral_walks_count++;
+ goto try_mount_again;
+ }
+ }
+#endif
+
+ /* check if a whole path is not remote */
+ if (!rc && tcon) {
+ if (!server->ops->is_path_accessible) {
+ rc = -ENOSYS;
+ goto mount_fail_check;
+ }
+ /*
+ * cifs_build_path_to_root works only when we have a valid tcon
+ */
+ full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
if (full_path == NULL) {
rc = -ENOMEM;
goto mount_fail_check;
}
- rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
+ rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
+ full_path);
if (rc != 0 && rc != -EREMOTE) {
kfree(full_path);
goto mount_fail_check;
@@ -2854,50 +3557,14 @@ remote_path_check:
rc = -ELOOP;
goto mount_fail_check;
}
- /* convert forward to back slashes in prepath here if needed */
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
- convert_delimiter(cifs_sb->prepath,
- CIFS_DIR_SEP(cifs_sb));
- full_path = build_unc_path_to_root(volume_info, cifs_sb);
- if (IS_ERR(full_path)) {
- rc = PTR_ERR(full_path);
- goto mount_fail_check;
- }
- cFYI(1, "Getting referral for: %s", full_path);
- rc = get_dfs_path(xid, pSesInfo , full_path + 1,
- cifs_sb->local_nls, &num_referrals, &referrals,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (!rc && num_referrals > 0) {
- char *fake_devname = NULL;
+ rc = expand_dfs_referral(xid, ses, volume_info, cifs_sb, true);
- if (mount_data != mount_data_global)
- kfree(mount_data);
-
- mount_data = cifs_compose_mount_options(
- cifs_sb->mountdata, full_path + 1,
- referrals, &fake_devname);
-
- free_dfs_info_array(referrals, num_referrals);
- kfree(fake_devname);
- kfree(full_path);
-
- if (IS_ERR(mount_data)) {
- rc = PTR_ERR(mount_data);
- mount_data = NULL;
- goto mount_fail_check;
- }
-
- if (tcon)
- cifs_put_tcon(tcon);
- else if (pSesInfo)
- cifs_put_smb_ses(pSesInfo);
-
- cleanup_volume_info(&volume_info);
+ if (!rc) {
referral_walks_count++;
- FreeXid(xid);
goto try_mount_again;
}
+ goto mount_fail_check;
#else /* No DFS support, return error on mount */
rc = -EOPNOTSUPP;
#endif
@@ -2913,7 +3580,7 @@ remote_path_check:
goto mount_fail_check;
}
- tlink->tl_uid = pSesInfo->linux_uid;
+ tlink->tl_uid = ses->linux_uid;
tlink->tl_tcon = tcon;
tlink->tl_time = jiffies;
set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
@@ -2924,39 +3591,35 @@ remote_path_check:
tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
spin_unlock(&cifs_sb->tlink_tree_lock);
- queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+ queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
mount_fail_check:
/* on error free sesinfo and tcon struct if needed */
if (rc) {
- if (mount_data != mount_data_global)
- kfree(mount_data);
/* If find_unc succeeded then rc == 0 so we can not end */
- /* up accidently freeing someone elses tcon struct */
+		/* up accidentally freeing someone else's tcon struct */
if (tcon)
cifs_put_tcon(tcon);
- else if (pSesInfo)
- cifs_put_smb_ses(pSesInfo);
+ else if (ses)
+ cifs_put_smb_ses(ses);
else
- cifs_put_tcp_session(srvTcp);
- goto out;
+ cifs_put_tcp_session(server);
+ bdi_destroy(&cifs_sb->bdi);
}
- /* volume_info->password is freed above when existing session found
- (in which case it is not needed anymore) but when new sesion is created
- the password ptr is put in the new session structure (in which case the
- password will be freed at unmount time) */
out:
- /* zero out password before freeing */
- cleanup_volume_info(&volume_info);
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
+/*
+ * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon
+ * pointer may be NULL.
+ */
int
-CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
- const char *tree, struct cifsTconInfo *tcon,
+CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
+ const char *tree, struct cifs_tcon *tcon,
const struct nls_table *nls_codepage)
{
struct smb_hdr *smb_buffer;
@@ -2980,7 +3643,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
NULL /*no tid */ , 4 /*wct */ );
- smb_buffer->Mid = GetNextMid(ses->server);
+ smb_buffer->Mid = get_next_mid(ses->server);
smb_buffer->Uid = ses->Suid;
pSMB = (TCONX_REQ *) smb_buffer;
pSMBr = (TCONX_RSP *) smb_buffer_response;
@@ -2988,7 +3651,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
pSMB->AndXCommand = 0xFF;
pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO);
bcc_ptr = &pSMB->Password[0];
- if ((ses->server->secMode) & SECMODE_USER) {
+ if (!tcon || (ses->server->sec_mode & SECMODE_USER)) {
pSMB->PasswordLength = cpu_to_le16(1); /* minimum */
*bcc_ptr = 0; /* password is null byte */
bcc_ptr++; /* skip password */
@@ -3003,15 +3666,15 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
NTLMv2 password here) */
#ifdef CONFIG_CIFS_WEAK_PW_HASH
if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
- (ses->server->secType == LANMAN))
+ (ses->sectype == LANMAN))
calc_lanman_hash(tcon->password, ses->server->cryptkey,
- ses->server->secMode &
+ ses->server->sec_mode &
SECMODE_PW_ENCRYPT ? true : false,
bcc_ptr);
else
#endif /* CIFS_WEAK_PW_HASH */
rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
- bcc_ptr);
+ bcc_ptr, nls_codepage);
bcc_ptr += CIFS_AUTH_RESP_SIZE;
if (ses->capabilities & CAP_UNICODE) {
@@ -3021,8 +3684,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
}
}
- if (ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (ses->server->sign)
smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
if (ses->capabilities & CAP_STATUS32) {
@@ -3034,7 +3696,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
if (ses->capabilities & CAP_UNICODE) {
smb_buffer->Flags2 |= SMBFLG2_UNICODE;
length =
- cifs_strtoUCS((__le16 *) bcc_ptr, tree,
+ cifs_strtoUTF16((__le16 *) bcc_ptr, tree,
6 /* max utf8 char length in bytes */ *
(/* server len*/ + 256 /* share len */), nls_codepage);
bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */
@@ -3047,7 +3709,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
bcc_ptr += strlen("?????");
bcc_ptr += 1;
count = bcc_ptr - &pSMB->Password[0];
- pSMB->hdr.smb_buf_length += count;
+ pSMB->hdr.smb_buf_length = cpu_to_be32(be32_to_cpu(
+ pSMB->hdr.smb_buf_length) + count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
@@ -3073,26 +3736,26 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
if (length == 3) {
if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
(bcc_ptr[2] == 'C')) {
- cFYI(1, "IPC connection");
+ cifs_dbg(FYI, "IPC connection\n");
tcon->ipc = 1;
}
} else if (length == 2) {
if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
/* the most common case */
- cFYI(1, "disk share connection");
+ cifs_dbg(FYI, "disk share connection\n");
}
}
bcc_ptr += length + 1;
bytes_left -= (length + 1);
- strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
+ strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
- tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
+ tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr,
bytes_left, is_unicode,
nls_codepage);
- cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
+ cifs_dbg(FYI, "nativeFileSystem=%s\n", tcon->nativeFileSystem);
if ((smb_buffer_response->WordCount == 3) ||
(smb_buffer_response->WordCount == 7))
@@ -3100,7 +3763,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
else
tcon->Flags = 0;
- cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
+ cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags);
} else if ((rc == 0) && tcon == NULL) {
/* all we need to save for IPC$ connection */
ses->ipc_tid = smb_buffer_response->Tid;
@@ -3110,13 +3773,19 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
return rc;
}
-int
-cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
+static void delayed_free(struct rcu_head *p)
+{
+ struct cifs_sb_info *sbi = container_of(p, struct cifs_sb_info, rcu);
+ unload_nls(sbi->local_nls);
+ kfree(sbi);
+}
+
+void
+cifs_umount(struct cifs_sb_info *cifs_sb)
{
struct rb_root *root = &cifs_sb->tlink_tree;
struct rb_node *node;
struct tcon_link *tlink;
- char *tmp;
cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
@@ -3133,105 +3802,87 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
}
spin_unlock(&cifs_sb->tlink_tree_lock);
- tmp = cifs_sb->prepath;
- cifs_sb->prepathlen = 0;
- cifs_sb->prepath = NULL;
- kfree(tmp);
-
- return 0;
+ bdi_destroy(&cifs_sb->bdi);
+ kfree(cifs_sb->mountdata);
+ call_rcu(&cifs_sb->rcu, delayed_free);
}
-int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
+int
+cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
{
int rc = 0;
struct TCP_Server_Info *server = ses->server;
+ if (!server->ops->need_neg || !server->ops->negotiate)
+ return -ENOSYS;
+
/* only send once per connect */
- if (server->maxBuf != 0)
+ if (!server->ops->need_neg(server))
return 0;
- rc = CIFSSMBNegotiate(xid, ses);
- if (rc == -EAGAIN) {
- /* retry only once on 1st time connection */
- rc = CIFSSMBNegotiate(xid, ses);
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
- }
+ set_credits(server, 1);
+
+ rc = server->ops->negotiate(xid, ses);
if (rc == 0) {
spin_lock(&GlobalMid_Lock);
- if (server->tcpStatus != CifsExiting)
+ if (server->tcpStatus == CifsNeedNegotiate)
server->tcpStatus = CifsGood;
else
rc = -EHOSTDOWN;
spin_unlock(&GlobalMid_Lock);
-
}
return rc;
}
-
-int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
- struct nls_table *nls_info)
+int
+cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
+ struct nls_table *nls_info)
{
- int rc = 0;
+ int rc = -ENOSYS;
struct TCP_Server_Info *server = ses->server;
- ses->flags = 0;
ses->capabilities = server->capabilities;
if (linuxExtEnabled == 0)
- ses->capabilities &= (~CAP_UNIX);
+ ses->capabilities &= (~server->vals->cap_unix);
- cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
- server->secMode, server->capabilities, server->timeAdj);
+ cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n",
+ server->sec_mode, server->capabilities, server->timeAdj);
- rc = CIFS_SessSetup(xid, ses, nls_info);
- if (rc) {
- cERROR(1, "Send error in SessSetup = %d", rc);
- } else {
- mutex_lock(&ses->server->srv_mutex);
- if (!server->session_estab) {
- server->session_key.response = ses->auth_key.response;
- server->session_key.len = ses->auth_key.len;
- server->sequence_number = 0x2;
- server->session_estab = true;
- ses->auth_key.response = NULL;
- }
- mutex_unlock(&server->srv_mutex);
+ if (server->ops->sess_setup)
+ rc = server->ops->sess_setup(xid, ses, nls_info);
- cFYI(1, "CIFS Session Established successfully");
- spin_lock(&GlobalMid_Lock);
- ses->status = CifsGood;
- ses->need_reconnect = false;
- spin_unlock(&GlobalMid_Lock);
- }
-
- kfree(ses->auth_key.response);
- ses->auth_key.response = NULL;
- ses->auth_key.len = 0;
- kfree(ses->ntlmssp);
- ses->ntlmssp = NULL;
+ if (rc)
+ cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc);
return rc;
}
-static struct cifsTconInfo *
-cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
+static int
+cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
{
- struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
- struct cifsSesInfo *ses;
- struct cifsTconInfo *tcon = NULL;
+ vol->sectype = ses->sectype;
+
+ /* krb5 is special, since we don't need username or pw */
+ if (vol->sectype == Kerberos)
+ return 0;
+
+ return cifs_set_cifscreds(vol, ses);
+}
+
+static struct cifs_tcon *
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
+{
+ int rc;
+ struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon = NULL;
struct smb_vol *vol_info;
- char username[MAX_USERNAME_SIZE + 1];
vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
- if (vol_info == NULL) {
- tcon = ERR_PTR(-ENOMEM);
- goto out;
- }
+ if (vol_info == NULL)
+ return ERR_PTR(-ENOMEM);
- snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid);
- vol_info->username = username;
vol_info->local_nls = cifs_sb->local_nls;
vol_info->linux_uid = fsuid;
vol_info->cred_uid = fsuid;
@@ -3240,9 +3891,14 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
vol_info->nocase = master_tcon->nocase;
vol_info->local_lease = master_tcon->local_lease;
vol_info->no_linux_ext = !master_tcon->unix_ext;
+ vol_info->sectype = master_tcon->ses->sectype;
+ vol_info->sign = master_tcon->ses->sign;
- /* FIXME: allow for other secFlg settings */
- vol_info->secFlg = CIFSSEC_MUST_KRB5;
+ rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
+ if (rc) {
+ tcon = ERR_PTR(rc);
+ goto out;
+ }
/* get a reference for the same TCP session */
spin_lock(&cifs_tcp_ses_lock);
@@ -3251,7 +3907,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
if (IS_ERR(ses)) {
- tcon = (struct cifsTconInfo *)ses;
+ tcon = (struct cifs_tcon *)ses;
cifs_put_tcp_session(master_tcon->ses->server);
goto out;
}
@@ -3262,21 +3918,17 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
goto out;
}
- if (ses->capabilities & CAP_UNIX)
+ if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
out:
+ kfree(vol_info->username);
+ kfree(vol_info->password);
kfree(vol_info);
return tcon;
}
-static inline struct tcon_link *
-cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
-{
- return cifs_sb->master_tlink;
-}
-
-struct cifsTconInfo *
+struct cifs_tcon *
cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
{
return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
@@ -3291,7 +3943,7 @@ cifs_sb_tcon_pending_wait(void *unused)
/* find and return a tlink with given uid */
static struct tcon_link *
-tlink_rb_search(struct rb_root *root, uid_t uid)
+tlink_rb_search(struct rb_root *root, kuid_t uid)
{
struct rb_node *node = root->rb_node;
struct tcon_link *tlink;
@@ -3299,9 +3951,9 @@ tlink_rb_search(struct rb_root *root, uid_t uid)
while (node) {
tlink = rb_entry(node, struct tcon_link, tl_rbnode);
- if (tlink->tl_uid > uid)
+ if (uid_gt(tlink->tl_uid, uid))
node = node->rb_left;
- else if (tlink->tl_uid < uid)
+ else if (uid_lt(tlink->tl_uid, uid))
node = node->rb_right;
else
return tlink;
@@ -3320,7 +3972,7 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
parent = *new;
- if (tlink->tl_uid > new_tlink->tl_uid)
+ if (uid_gt(tlink->tl_uid, new_tlink->tl_uid))
new = &((*new)->rb_left);
else
new = &((*new)->rb_right);
@@ -3350,7 +4002,7 @@ struct tcon_link *
cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
{
int ret;
- uid_t fsuid = current_fsuid();
+ kuid_t fsuid = current_fsuid();
struct tcon_link *tlink, *newtlink;
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
@@ -3463,6 +4115,6 @@ cifs_prune_tlinks(struct work_struct *work)
}
spin_unlock(&cifs_sb->tlink_tree_lock);
- queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+ queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index dd5f22918c3..3db0c5fd9a1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -32,6 +32,7 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
static void
renew_parental_timestamps(struct dentry *direntry)
@@ -44,65 +45,101 @@ renew_parental_timestamps(struct dentry *direntry)
} while (!IS_ROOT(direntry));
}
+char *
+cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon)
+{
+ int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
+ int dfsplen;
+ char *full_path = NULL;
+
+ /* if no prefix path, simply set path to the root of share to "" */
+ if (pplen == 0) {
+ full_path = kzalloc(1, GFP_KERNEL);
+ return full_path;
+ }
+
+ if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+ dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
+ else
+ dfsplen = 0;
+
+ full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
+ if (full_path == NULL)
+ return full_path;
+
+ if (dfsplen)
+ strncpy(full_path, tcon->treeName, dfsplen);
+ full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
+ strncpy(full_path + dfsplen + 1, vol->prepath, pplen);
+ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
+ full_path[dfsplen + pplen] = 0; /* add trailing null */
+ return full_path;
+}
+
/* Note: caller must free return buffer */
char *
build_path_from_dentry(struct dentry *direntry)
{
struct dentry *temp;
int namelen;
- int pplen;
int dfsplen;
char *full_path;
char dirsep;
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
- struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
-
- if (direntry == NULL)
- return NULL; /* not much we can do if dentry is freed and
- we need to reopen the file after it was closed implicitly
- when the server crashed */
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ unsigned seq;
dirsep = CIFS_DIR_SEP(cifs_sb);
- pplen = cifs_sb->prepathlen;
if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
else
dfsplen = 0;
cifs_bp_rename_retry:
- namelen = pplen + dfsplen;
+ namelen = dfsplen;
+ seq = read_seqbegin(&rename_lock);
+ rcu_read_lock();
for (temp = direntry; !IS_ROOT(temp);) {
namelen += (1 + temp->d_name.len);
temp = temp->d_parent;
if (temp == NULL) {
- cERROR(1, "corrupt dentry");
+ cifs_dbg(VFS, "corrupt dentry\n");
+ rcu_read_unlock();
return NULL;
}
}
+ rcu_read_unlock();
full_path = kmalloc(namelen+1, GFP_KERNEL);
if (full_path == NULL)
return full_path;
full_path[namelen] = 0; /* trailing null */
+ rcu_read_lock();
for (temp = direntry; !IS_ROOT(temp);) {
+ spin_lock(&temp->d_lock);
namelen -= 1 + temp->d_name.len;
if (namelen < 0) {
+ spin_unlock(&temp->d_lock);
break;
} else {
full_path[namelen] = dirsep;
strncpy(full_path + namelen + 1, temp->d_name.name,
temp->d_name.len);
- cFYI(0, "name: %s", full_path + namelen);
+ cifs_dbg(FYI, "name: %s\n", full_path + namelen);
}
+ spin_unlock(&temp->d_lock);
temp = temp->d_parent;
if (temp == NULL) {
- cERROR(1, "corrupt dentry");
+ cifs_dbg(VFS, "corrupt dentry\n");
+ rcu_read_unlock();
kfree(full_path);
return NULL;
}
}
- if (namelen != pplen + dfsplen) {
- cERROR(1, "did not end path lookup where expected namelen is %d",
- namelen);
+ rcu_read_unlock();
+ if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
+ cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n",
+ namelen, dfsplen);
/* presumably this is only possible if racing with a rename
of one of the parent directories (we can not lock the dentries
above us to prevent this, but retrying should be harmless) */
@@ -126,112 +163,152 @@ cifs_bp_rename_retry:
}
}
}
- strncpy(full_path + dfsplen, CIFS_SB(direntry->d_sb)->prepath, pplen);
return full_path;
}
+/*
+ * Don't allow the separator character in a path component.
+ * The VFS will not allow "/", but "\" is allowed by posix.
+ */
+static int
+check_name(struct dentry *direntry)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+ int i;
+
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) {
+ for (i = 0; i < direntry->d_name.len; i++) {
+ if (direntry->d_name.name[i] == '\\') {
+ cifs_dbg(FYI, "Invalid file name\n");
+ return -EINVAL;
+ }
+ }
+ }
+ return 0;
+}
+
+
/* Inode operations in similar order to how they appear in Linux file fs.h */
-int
-cifs_create(struct inode *inode, struct dentry *direntry, int mode,
- struct nameidata *nd)
+static int
+cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
+ struct tcon_link *tlink, unsigned oflags, umode_t mode,
+ __u32 *oplock, struct cifs_fid *fid)
{
int rc = -ENOENT;
- int xid;
int create_options = CREATE_NOT_DIR;
- __u32 oplock = 0;
- int oflags;
- /*
- * BB below access is probably too much for mknod to request
- * but we have to do query and setpathinfo so requesting
- * less could fail (unless we want to request getatr and setatr
- * permissions (only). At least for POSIX we do not have to
- * request so much.
- */
- int desiredAccess = GENERIC_READ | GENERIC_WRITE;
- __u16 fileHandle;
- struct cifs_sb_info *cifs_sb;
- struct tcon_link *tlink;
- struct cifsTconInfo *tcon;
+ int desired_access;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_tcon *tcon = tlink_tcon(tlink);
char *full_path = NULL;
FILE_ALL_INFO *buf = NULL;
struct inode *newinode = NULL;
- int disposition = FILE_OVERWRITE_IF;
+ int disposition;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ struct cifs_open_parms oparms;
- xid = GetXid();
-
- cifs_sb = CIFS_SB(inode->i_sb);
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink)) {
- FreeXid(xid);
- return PTR_ERR(tlink);
- }
- tcon = tlink_tcon(tlink);
-
- if (oplockEnabled)
- oplock = REQ_OPLOCK;
-
- if (nd && (nd->flags & LOOKUP_OPEN))
- oflags = nd->intent.open.file->f_flags;
- else
- oflags = O_RDONLY | O_CREAT;
+ *oplock = 0;
+ if (tcon->ses->server->oplocks)
+ *oplock = REQ_OPLOCK;
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
rc = -ENOMEM;
- goto cifs_create_out;
+ goto out;
}
- if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+ if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
- rc = cifs_posix_open(full_path, &newinode,
- inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
- /* EIO could indicate that (posix open) operation is not
- supported, despite what server claimed in capability
- negotation. EREMOTE indicates DFS junction, which is not
- handled in posix open */
-
- if (rc == 0) {
- if (newinode == NULL) /* query inode info */
+ rc = cifs_posix_open(full_path, &newinode, inode->i_sb, mode,
+ oflags, oplock, &fid->netfid, xid);
+ switch (rc) {
+ case 0:
+ if (newinode == NULL) {
+ /* query inode info */
goto cifs_create_get_file_info;
- else /* success, no need to query */
- goto cifs_create_set_dentry;
- } else if ((rc != -EIO) && (rc != -EREMOTE) &&
- (rc != -EOPNOTSUPP) && (rc != -EINVAL))
- goto cifs_create_out;
- /* else fallthrough to retry, using older open call, this is
- case where server does not support this SMB level, and
- falsely claims capability (also get here for DFS case
- which should be rare for path not covered on files) */
- }
+ }
+
+ if (!S_ISREG(newinode->i_mode)) {
+ /*
+ * The server may allow us to open things like
+ * FIFOs, but the client isn't set up to deal
+ * with that. If it's not a regular file, just
+ * close it and proceed as if it were a normal
+ * lookup.
+ */
+ CIFSSMBClose(xid, tcon, fid->netfid);
+ goto cifs_create_get_file_info;
+ }
+ /* success, no need to query */
+ goto cifs_create_set_dentry;
+
+ case -ENOENT:
+ goto cifs_create_get_file_info;
- if (nd && (nd->flags & LOOKUP_OPEN)) {
- /* if the file is going to stay open, then we
- need to set the desired access properly */
- desiredAccess = 0;
- if (OPEN_FMODE(oflags) & FMODE_READ)
- desiredAccess |= GENERIC_READ; /* is this too little? */
- if (OPEN_FMODE(oflags) & FMODE_WRITE)
- desiredAccess |= GENERIC_WRITE;
-
- if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
- disposition = FILE_CREATE;
- else if ((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
- disposition = FILE_OVERWRITE_IF;
- else if ((oflags & O_CREAT) == O_CREAT)
- disposition = FILE_OPEN_IF;
- else
- cFYI(1, "Create flag not set in create function");
+ case -EIO:
+ case -EINVAL:
+ /*
+ * EIO could indicate that (posix open) operation is not
+ * supported, despite what server claimed in capability
+ * negotiation.
+ *
+ * POSIX open in samba versions 3.3.1 and earlier could
+ * incorrectly fail with invalid parameter.
+ */
+ tcon->broken_posix_open = true;
+ break;
+
+ case -EREMOTE:
+ case -EOPNOTSUPP:
+ /*
+ * EREMOTE indicates DFS junction, which is not handled
+ * in posix open. If either that or "operation not supported"
+ * is returned, fall back to the normal lookup.
+ */
+ break;
+
+ default:
+ goto out;
+ }
+ /*
+ * Fall through to retry using the older open call. This is the case
+ * where the server does not support this SMB level and falsely
+ * claims the capability (we also get here for the DFS case, which
+ * should be rare for paths not covered on files).
+ */
}
- /* BB add processing to set equivalent of mode - e.g. via CreateX with
- ACLs */
+ desired_access = 0;
+ if (OPEN_FMODE(oflags) & FMODE_READ)
+ desired_access |= GENERIC_READ; /* is this too little? */
+ if (OPEN_FMODE(oflags) & FMODE_WRITE)
+ desired_access |= GENERIC_WRITE;
+
+ disposition = FILE_OVERWRITE_IF;
+ if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+ disposition = FILE_CREATE;
+ else if ((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
+ disposition = FILE_OVERWRITE_IF;
+ else if ((oflags & O_CREAT) == O_CREAT)
+ disposition = FILE_OPEN_IF;
+ else
+ cifs_dbg(FYI, "Create flag not set in create function\n");
+
+ /*
+ * BB add processing to set equivalent of mode - e.g. via CreateX with
+ * ACLs
+ */
+
+ if (!server->ops->open) {
+ rc = -ENOSYS;
+ goto out;
+ }
buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
if (buf == NULL) {
rc = -ENOMEM;
- goto cifs_create_out;
+ goto out;
}
/*
@@ -241,29 +318,29 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
create_options |= CREATE_OPTION_READONLY;
- if (tcon->ses->capabilities & CAP_NT_SMBS)
- rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
- desiredAccess, create_options,
- &fileHandle, &oplock, buf, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- else
- rc = -EIO; /* no NT SMB support fall into legacy open below */
-
- if (rc == -EIO) {
- /* old server, retry the open legacy style */
- rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
- desiredAccess, create_options,
- &fileHandle, &oplock, buf, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- }
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = desired_access;
+ oparms.create_options = create_options;
+ oparms.disposition = disposition;
+ oparms.path = full_path;
+ oparms.fid = fid;
+ oparms.reconnect = false;
+
+ rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc) {
- cFYI(1, "cifs_create returned 0x%x", rc);
- goto cifs_create_out;
+ cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
+ goto out;
}
- /* If Open reported that we actually created a file
- then we now have to set the mode if possible */
- if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+ /*
+ * If Open reported that we actually created a file then we now have to
+ * set the mode if possible.
+ */
+ if ((tcon->unix_ext) && (*oplock & CIFS_CREATE_ACTION)) {
struct cifs_unix_set_info_args args = {
.mode = mode,
.ctime = NO_CHANGE_64,
@@ -273,20 +350,22 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
};
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = (__u64) current_fsuid();
+ args.uid = current_fsuid();
if (inode->i_mode & S_ISGID)
- args.gid = (__u64) inode->i_gid;
+ args.gid = inode->i_gid;
else
- args.gid = (__u64) current_fsgid();
+ args.gid = current_fsgid();
} else {
- args.uid = NO_CHANGE_64;
- args.gid = NO_CHANGE_64;
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
}
- CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle,
- current->tgid);
+ CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid,
+ current->tgid);
} else {
- /* BB implement mode setting via Windows security
- descriptors e.g. */
+ /*
+ * BB implement mode setting via Windows security
+ * descriptors e.g.
+ */
/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
/* Could set r/o dos attribute if mode & 0222 == 0 */
@@ -295,15 +374,17 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
cifs_create_get_file_info:
/* server might mask mode so we have to query for it */
if (tcon->unix_ext)
- rc = cifs_get_inode_info_unix(&newinode, full_path,
- inode->i_sb, xid);
+ rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb,
+ xid);
else {
- rc = cifs_get_inode_info(&newinode, full_path, buf,
- inode->i_sb, xid, &fileHandle);
+ rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
+ xid, fid);
if (newinode) {
+ if (server->ops->set_lease_key)
+ server->ops->set_lease_key(newinode, fid);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
newinode->i_mode = mode;
- if ((oplock & CIFS_CREATE_ACTION) &&
+ if ((*oplock & CIFS_CREATE_ACTION) &&
(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
newinode->i_uid = current_fsuid();
if (inode->i_mode & S_ISGID)
@@ -315,52 +396,182 @@ cifs_create_get_file_info:
}
cifs_create_set_dentry:
- if (rc == 0)
- d_instantiate(direntry, newinode);
- else
- cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
+ if (rc != 0) {
+ cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n",
+ rc);
+ if (server->ops->close)
+ server->ops->close(xid, tcon, fid);
+ goto out;
+ }
+ d_drop(direntry);
+ d_add(direntry, newinode);
- if (newinode && nd && (nd->flags & LOOKUP_OPEN)) {
- struct cifsFileInfo *pfile_info;
- struct file *filp;
+out:
+ kfree(buf);
+ kfree(full_path);
+ return rc;
+}
- filp = lookup_instantiate_filp(nd, direntry, generic_file_open);
- if (IS_ERR(filp)) {
- rc = PTR_ERR(filp);
- CIFSSMBClose(xid, tcon, fileHandle);
- goto cifs_create_out;
- }
+int
+cifs_atomic_open(struct inode *inode, struct dentry *direntry,
+ struct file *file, unsigned oflags, umode_t mode,
+ int *opened)
+{
+ int rc;
+ unsigned int xid;
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ struct cifs_fid fid;
+ struct cifs_pending_open open;
+ __u32 oplock;
+ struct cifsFileInfo *file_info;
- pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock);
- if (pfile_info == NULL) {
- fput(filp);
- CIFSSMBClose(xid, tcon, fileHandle);
- rc = -ENOMEM;
- }
- } else {
- CIFSSMBClose(xid, tcon, fileHandle);
+ /*
+ * Posix open is only called (at lookup time) for file create now. For
+ * opens (rather than creates), because we do not know if it is a file
+ * or directory yet, and current Samba no longer allows us to do posix
+ * open on dirs, we could end up wasting an open call on what turns out
+ * to be a dir. For file opens, we wait to call posix open till
+ * cifs_open. It could be added to atomic_open in the future but the
+ * performance tradeoff of the extra network request when EISDIR or
+ * EACCES is returned would have to be weighed against the 50% reduction
+ * in network traffic in the other paths.
+ */
+ if (!(oflags & O_CREAT)) {
+ struct dentry *res;
+
+ /*
+ * Check for hashed negative dentry. We have already revalidated
+ * the dentry and it is fine. No need to perform another lookup.
+ */
+ if (!d_unhashed(direntry))
+ return -ENOENT;
+
+ res = cifs_lookup(inode, direntry, 0);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ return finish_no_open(file, res);
}
-cifs_create_out:
- kfree(buf);
- kfree(full_path);
+ rc = check_name(direntry);
+ if (rc)
+ return rc;
+
+ xid = get_xid();
+
+ cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
+ inode, direntry->d_name.name, direntry);
+
+ tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ goto out_free_xid;
+ }
+
+ tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
+
+ if (server->ops->new_lease_key)
+ server->ops->new_lease_key(&fid);
+
+ cifs_add_pending_open(&fid, tlink, &open);
+
+ rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
+ &oplock, &fid);
+
+ if (rc) {
+ cifs_del_pending_open(&open);
+ goto out;
+ }
+
+ if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+ *opened |= FILE_CREATED;
+
+ rc = finish_open(file, direntry, generic_file_open, opened);
+ if (rc) {
+ if (server->ops->close)
+ server->ops->close(xid, tcon, &fid);
+ cifs_del_pending_open(&open);
+ goto out;
+ }
+
+ file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
+ if (file_info == NULL) {
+ if (server->ops->close)
+ server->ops->close(xid, tcon, &fid);
+ cifs_del_pending_open(&open);
+ fput(file);
+ rc = -ENOMEM;
+ }
+
+out:
cifs_put_tlink(tlink);
- FreeXid(xid);
+out_free_xid:
+ free_xid(xid);
return rc;
}
-int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
+int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
+ bool excl)
+{
+ int rc;
+ unsigned int xid = get_xid();
+ /*
+ * BB below access is probably too much for mknod to request
+ * but we have to do query and setpathinfo so requesting
+ * less could fail (unless we want to request getatr and setatr
+ * permissions (only). At least for POSIX we do not have to
+ * request so much.
+ */
+ unsigned oflags = O_EXCL | O_CREAT | O_RDWR;
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ struct cifs_fid fid;
+ __u32 oplock;
+
+ cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n",
+ inode, direntry->d_name.name, direntry);
+
+ tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
+ rc = PTR_ERR(tlink);
+ if (IS_ERR(tlink))
+ goto out_free_xid;
+
+ tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
+
+ if (server->ops->new_lease_key)
+ server->ops->new_lease_key(&fid);
+
+ rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
+ &oplock, &fid);
+ if (!rc && server->ops->close)
+ server->ops->close(xid, tcon, &fid);
+
+ cifs_put_tlink(tlink);
+out_free_xid:
+ free_xid(xid);
+ return rc;
+}
+
+int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
dev_t device_number)
{
int rc = -EPERM;
- int xid;
+ unsigned int xid;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *tcon;
+ struct cifs_io_parms io_parms;
char *full_path = NULL;
struct inode *newinode = NULL;
int oplock = 0;
- u16 fileHandle;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
FILE_ALL_INFO *buf = NULL;
unsigned int bytes_written;
struct win_dev *pdev;
@@ -373,9 +584,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
+ tcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
@@ -383,7 +594,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
goto mknod_out;
}
- if (pTcon->unix_ext) {
+ if (tcon->unix_ext) {
struct cifs_unix_set_info_args args = {
.mode = mode & ~current_umask(),
.ctime = NO_CHANGE_64,
@@ -392,13 +603,13 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
.device = device_number,
};
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = (__u64) current_fsuid();
- args.gid = (__u64) current_fsgid();
+ args.uid = current_fsuid();
+ args.gid = current_fsgid();
} else {
- args.uid = NO_CHANGE_64;
- args.gid = NO_CHANGE_64;
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
}
- rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
+ rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -417,52 +628,57 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
goto mknod_out;
- cFYI(1, "sfu compat create special file");
+ cifs_dbg(FYI, "sfu compat create special file\n");
buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
if (buf == NULL) {
kfree(full_path);
rc = -ENOMEM;
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
- /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
- rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
- GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
- &fileHandle, &oplock, buf, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, buf);
if (rc)
goto mknod_out;
- /* BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely */
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
pdev = (struct win_dev *)buf;
+ io_parms.netfid = fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = sizeof(struct win_dev);
if (S_ISCHR(mode)) {
memcpy(pdev->type, "IntxCHR", 8);
- pdev->major =
- cpu_to_le64(MAJOR(device_number));
- pdev->minor =
- cpu_to_le64(MINOR(device_number));
- rc = CIFSSMBWrite(xid, pTcon,
- fileHandle,
- sizeof(struct win_dev),
- 0, &bytes_written, (char *)pdev,
- NULL, 0);
+ pdev->major = cpu_to_le64(MAJOR(device_number));
+ pdev->minor = cpu_to_le64(MINOR(device_number));
+ rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
+ NULL, 0);
} else if (S_ISBLK(mode)) {
memcpy(pdev->type, "IntxBLK", 8);
- pdev->major =
- cpu_to_le64(MAJOR(device_number));
- pdev->minor =
- cpu_to_le64(MINOR(device_number));
- rc = CIFSSMBWrite(xid, pTcon,
- fileHandle,
- sizeof(struct win_dev),
- 0, &bytes_written, (char *)pdev,
- NULL, 0);
+ pdev->major = cpu_to_le64(MAJOR(device_number));
+ pdev->minor = cpu_to_le64(MINOR(device_number));
+ rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
+ NULL, 0);
} /* else if (S_ISFIFO) */
- CIFSSMBClose(xid, pTcon, fileHandle);
+ CIFSSMBClose(xid, tcon, fid.netfid);
d_drop(direntry);
/* FIXME: add code here to set EAs */
@@ -470,66 +686,41 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
mknod_out:
kfree(full_path);
kfree(buf);
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
return rc;
}
struct dentry *
cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
- struct nameidata *nd)
+ unsigned int flags)
{
- int xid;
+ unsigned int xid;
int rc = 0; /* to get around spurious gcc warning, set to zero here */
- __u32 oplock = 0;
- __u16 fileHandle = 0;
- bool posix_open = false;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
- struct cifsFileInfo *cfile;
+ struct cifs_tcon *pTcon;
struct inode *newInode = NULL;
char *full_path = NULL;
- struct file *filp;
- xid = GetXid();
+ xid = get_xid();
- cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
- parent_dir_inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
+ parent_dir_inode, direntry->d_name.name, direntry);
/* check whether path exists */
cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
- FreeXid(xid);
+ free_xid(xid);
return (struct dentry *)tlink;
}
pTcon = tlink_tcon(tlink);
- /*
- * Don't allow the separator character in a path component.
- * The VFS will not allow "/", but "\" is allowed by posix.
- */
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) {
- int i;
- for (i = 0; i < direntry->d_name.len; i++)
- if (direntry->d_name.name[i] == '\\') {
- cFYI(1, "Invalid file name");
- rc = -EINVAL;
- goto lookup_out;
- }
- }
-
- /*
- * O_EXCL: optimize away the lookup, but don't hash the dentry. Let
- * the VFS handle the create.
- */
- if (nd && (nd->flags & LOOKUP_EXCL)) {
- d_instantiate(direntry, NULL);
- rc = 0;
+ rc = check_name(direntry);
+ if (rc)
goto lookup_out;
- }
/* can not grab the rename sem here since it would
deadlock in the cases (beginning of sys_rename itself)
@@ -541,70 +732,23 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
}
if (direntry->d_inode != NULL) {
- cFYI(1, "non-NULL inode in lookup");
+ cifs_dbg(FYI, "non-NULL inode in lookup\n");
} else {
- cFYI(1, "NULL inode in lookup");
+ cifs_dbg(FYI, "NULL inode in lookup\n");
}
- cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
-
- /* Posix open is only called (at lookup time) for file create now.
- * For opens (rather than creates), because we do not know if it
- * is a file or directory yet, and current Samba no longer allows
- * us to do posix open on dirs, we could end up wasting an open call
- * on what turns out to be a dir. For file opens, we wait to call posix
- * open till cifs_open. It could be added here (lookup) in the future
- * but the performance tradeoff of the extra network request when EISDIR
- * or EACCES is returned would have to be weighed against the 50%
- * reduction in network traffic in the other paths.
- */
+ cifs_dbg(FYI, "Full path: %s inode = 0x%p\n",
+ full_path, direntry->d_inode);
+
if (pTcon->unix_ext) {
- if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
- (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
- (nd->intent.open.file->f_flags & O_CREAT)) {
- rc = cifs_posix_open(full_path, &newInode,
- parent_dir_inode->i_sb,
- nd->intent.open.create_mode,
- nd->intent.open.file->f_flags, &oplock,
- &fileHandle, xid);
- /*
- * The check below works around a bug in POSIX
- * open in samba versions 3.3.1 and earlier where
- * open could incorrectly fail with invalid parameter.
- * If either that or op not supported returned, follow
- * the normal lookup.
- */
- if ((rc == 0) || (rc == -ENOENT))
- posix_open = true;
- else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP))
- pTcon->broken_posix_open = true;
- }
- if (!posix_open)
- rc = cifs_get_inode_info_unix(&newInode, full_path,
- parent_dir_inode->i_sb, xid);
- } else
+ rc = cifs_get_inode_info_unix(&newInode, full_path,
+ parent_dir_inode->i_sb, xid);
+ } else {
rc = cifs_get_inode_info(&newInode, full_path, NULL,
parent_dir_inode->i_sb, xid, NULL);
+ }
if ((rc == 0) && (newInode != NULL)) {
d_add(direntry, newInode);
- if (posix_open) {
- filp = lookup_instantiate_filp(nd, direntry,
- generic_file_open);
- if (IS_ERR(filp)) {
- rc = PTR_ERR(filp);
- CIFSSMBClose(xid, pTcon, fileHandle);
- goto lookup_out;
- }
-
- cfile = cifs_new_fileinfo(fileHandle, filp, tlink,
- oplock);
- if (cfile == NULL) {
- fput(filp);
- CIFSSMBClose(xid, pTcon, fileHandle);
- rc = -ENOMEM;
- goto lookup_out;
- }
- }
/* since paths are not looked up by component - the parent
directories are presumed to be good here */
renew_parental_timestamps(direntry);
@@ -616,7 +760,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
/* if it was once a directory (but how can we tell?) we could do
shrink_dcache_parent(direntry); */
} else if (rc != -EACCES) {
- cERROR(1, "Unexpected lookup error %d", rc);
+ cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
/* We special case check for Access Denied - since that
is a common return code */
}
@@ -624,28 +768,43 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
lookup_out:
kfree(full_path);
cifs_put_tlink(tlink);
- FreeXid(xid);
+ free_xid(xid);
return ERR_PTR(rc);
}
static int
-cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
+cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
- if (nd->flags & LOOKUP_RCU)
+ if (flags & LOOKUP_RCU)
return -ECHILD;
if (direntry->d_inode) {
if (cifs_revalidate_dentry(direntry))
return 0;
- else
+ else {
+ /*
+ * If the inode wasn't known to be a dfs entry when
+ * the dentry was instantiated, such as when created
+ * via ->readdir(), it needs to be set now since the
+ * attributes will have been updated by
+ * cifs_revalidate_dentry().
+ */
+ if (IS_AUTOMOUNT(direntry->d_inode) &&
+ !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) {
+ spin_lock(&direntry->d_lock);
+ direntry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+ spin_unlock(&direntry->d_lock);
+ }
+
return 1;
+ }
}
/*
* This may be nfsd (or something), anyway, we can't see the
* intent of this. So, since this can be for creation, drop it.
*/
- if (!nd)
+ if (!flags)
return 0;
/*
@@ -653,10 +812,8 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
* case sensitive name which is specified by user if this is
* for creation.
*/
- if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
- if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
- return 0;
- }
+ if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+ return 0;
if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
return 0;
@@ -668,7 +825,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
{
int rc = 0;
- cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
+ cifs_dbg(FYI, "In cifs d_delete, name = %s\n", direntry->d_name.name);
return rc;
} */
@@ -679,33 +836,71 @@ const struct dentry_operations cifs_dentry_ops = {
/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
};
-static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
- struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
{
struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
unsigned long hash;
- int i;
+ wchar_t c;
+ int i, charlen;
hash = init_name_hash();
- for (i = 0; i < q->len; i++)
- hash = partial_name_hash(nls_tolower(codepage, q->name[i]),
- hash);
+ for (i = 0; i < q->len; i += charlen) {
+ charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+ /* error out if we can't convert the character */
+ if (unlikely(charlen < 0))
+ return charlen;
+ hash = partial_name_hash(cifs_toupper(c), hash);
+ }
q->hash = end_name_hash(hash);
return 0;
}
-static int cifs_ci_compare(const struct dentry *parent,
- const struct inode *pinode,
- const struct dentry *dentry, const struct inode *inode,
+static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
+ struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
+ wchar_t c1, c2;
+ int i, l1, l2;
- if ((name->len == len) &&
- (nls_strnicmp(codepage, name->name, str, len) == 0))
- return 0;
- return 1;
+ /*
+ * We make the assumption here that uppercase characters in the local
+ * codepage are always the same length as their lowercase counterparts.
+ *
+ * If that's ever not the case, then this will fail to match it.
+ */
+ if (name->len != len)
+ return 1;
+
+ for (i = 0; i < len; i += l1) {
+ /* Convert characters in both strings to UTF-16. */
+ l1 = codepage->char2uni(&str[i], len - i, &c1);
+ l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+
+ /*
+ * If we can't convert either character, just declare it to
+ * be 1 byte long and compare the original byte.
+ */
+ if (unlikely(l1 < 0 && l2 < 0)) {
+ if (str[i] != name->name[i])
+ return 1;
+ l1 = 1;
+ continue;
+ }
+
+ /*
+ * Here, we again ass|u|me that upper/lowercase versions of
+ * a character are the same length in the local NLS.
+ */
+ if (l1 != l2)
+ return 1;
+
+ /* Now compare uppercase versions of these characters */
+ if (cifs_toupper(c1) != cifs_toupper(c2))
+ return 1;
+ }
+
+ return 0;
}
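
The new cifs_ci_hash() and cifs_ci_compare() above decode each multibyte character with the mount's NLS char2uni() routine and then hash or compare the uppercased UTF-16 code points, rather than lowercasing one byte at a time. A rough userspace analogue of that per-character idea, using the C library's wide-character API in place of the kernel NLS tables (an illustrative sketch, not the kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>
#include <locale.h>

/* Case-insensitive compare of two multibyte strings, one character at a time.
 * Returns 0 on a match, 1 otherwise, mirroring the kernel helper's convention. */
static int ci_compare(const char *a, const char *b)
{
	mbstate_t sa = {0}, sb = {0};
	wchar_t ca, cb;
	size_t la, lb;

	while (*a && *b) {
		la = mbrtowc(&ca, a, MB_CUR_MAX, &sa);
		lb = mbrtowc(&cb, b, MB_CUR_MAX, &sb);
		if (la == (size_t)-1 || la == (size_t)-2 ||
		    lb == (size_t)-1 || lb == (size_t)-2)
			return 1;		/* undecodable: treat as a mismatch */
		if (towupper(ca) != towupper(cb))
			return 1;
		a += la;
		b += lb;
	}
	return (*a || *b) ? 1 : 0;		/* both must end at the same point */
}

int main(void)
{
	setlocale(LC_ALL, "");
	printf("%d\n", ci_compare("ReadMe.TXT", "readme.txt"));	/* prints 0 */
	return 0;
}
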
const struct dentry_operations cifs_ci_dentry_ops = {
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 548f06230a6..7ede7306599 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -34,7 +34,7 @@
/**
* dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
- * @unc: UNC path specifying the server
+ * @unc: UNC path specifying the server (with '/' as delimiter)
* @ip_addr: Where to return the IP address.
*
* The IP address will be returned in string form, and the caller is
@@ -55,7 +55,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
len = strlen(unc);
if (len < 3) {
- cFYI(1, "%s: unc is too short: %s", __func__, unc);
+ cifs_dbg(FYI, "%s: unc is too short: %s\n", __func__, unc);
return -EINVAL;
}
@@ -64,12 +64,12 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
hostname = unc + 2;
/* Search for server name delimiter */
- sep = memchr(hostname, '\\', len);
+ sep = memchr(hostname, '/', len);
if (sep)
len = sep - hostname;
else
- cFYI(1, "%s: probably server name is whole unc: %s",
- __func__, unc);
+ cifs_dbg(FYI, "%s: probably server name is whole unc: %s\n",
+ __func__, unc);
/* Try to interpret hostname as an IPv4 or IPv6 address */
rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
@@ -79,11 +79,11 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
/* Perform the upcall */
rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
if (rc < 0)
- cERROR(1, "%s: unable to resolve: %*.*s",
- __func__, len, len, hostname);
+ cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
+ __func__, len, len, hostname);
else
- cFYI(1, "%s: resolved: %*.*s to %s",
- __func__, len, len, hostname, *ip_addr);
+ cifs_dbg(FYI, "%s: resolved: %*.*s to %s\n",
+ __func__, len, len, hostname, *ip_addr);
return rc;
name_is_IP_address:
@@ -92,7 +92,8 @@ name_is_IP_address:
return -ENOMEM;
memcpy(name, hostname, len);
name[len] = 0;
- cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
+ cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %s\n",
+ __func__, name);
*ip_addr = name;
return 0;
}
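
With the delimiter change above, dns_resolve_server_name_to_ip() now expects the UNC in "//server/share" form and isolates the server name with memchr() before attempting the address conversion or the DNS upcall. A small userspace sketch of just that extraction step; unc_hostname() is a hypothetical helper, not part of this patch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Copy the host part out of a "//server/share" style UNC; caller frees. */
static char *unc_hostname(const char *unc)
{
	const char *host, *sep;
	size_t len;
	char *copy;

	if (strlen(unc) < 3 || unc[0] != '/' || unc[1] != '/')
		return NULL;			/* too short or not a UNC */

	host = unc + 2;				/* skip the leading "//" */
	sep = memchr(host, '/', strlen(host));	/* end of the server name */
	len = sep ? (size_t)(sep - host) : strlen(host);

	copy = malloc(len + 1);
	if (!copy)
		return NULL;
	memcpy(copy, host, len);
	copy[len] = '\0';
	return copy;
}

int main(void)
{
	char *h = unc_hostname("//fileserver.example.com/share/dir");
	printf("%s\n", h ? h : "(invalid)");	/* fileserver.example.com */
	free(h);
	return 0;
}
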
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 993f82045bf..ce8b7f677c5 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -45,11 +45,11 @@
#include "cifs_debug.h"
#include "cifsfs.h"
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_NFSD_EXPORT
static struct dentry *cifs_get_parent(struct dentry *dentry)
{
/* BB need to add code here eventually to enable export via NFSD */
- cFYI(1, "get parent for %p", dentry);
+ cifs_dbg(FYI, "get parent for %p\n", dentry);
return ERR_PTR(-EACCES);
}
@@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = {
.encode_fs = */
};
-#endif /* EXPERIMENTAL */
+#endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e964b1cd5dd..e90a1e9aa62 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -32,6 +32,7 @@
#include <linux/delay.h>
#include <linux/mount.h>
#include <linux/slab.h>
+#include <linux/swap.h>
#include <asm/div64.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -42,6 +43,7 @@
#include "cifs_fs_sb.h"
#include "fscache.h"
+
static inline int cifs_convert_flags(unsigned int flags)
{
if ((flags & O_ACCMODE) == O_RDONLY)
@@ -71,10 +73,14 @@ static u32 cifs_posix_convert_flags(unsigned int flags)
else if ((flags & O_ACCMODE) == O_RDWR)
posix_flags = SMB_O_RDWR;
- if (flags & O_CREAT)
+ if (flags & O_CREAT) {
posix_flags |= SMB_O_CREAT;
- if (flags & O_EXCL)
- posix_flags |= SMB_O_EXCL;
+ if (flags & O_EXCL)
+ posix_flags |= SMB_O_EXCL;
+ } else if (flags & O_EXCL)
+ cifs_dbg(FYI, "Application %s pid %d has incorrectly set O_EXCL flag but not O_CREAT on file open. Ignoring O_EXCL\n",
+ current->comm, current->tgid);
+
if (flags & O_TRUNC)
posix_flags |= SMB_O_TRUNC;
/* be safe and imply O_SYNC for O_DSYNC */
@@ -106,7 +112,7 @@ static inline int cifs_get_disposition(unsigned int flags)
int cifs_posix_open(char *full_path, struct inode **pinode,
struct super_block *sb, int mode, unsigned int f_flags,
- __u32 *poplock, __u16 *pnetfid, int xid)
+ __u32 *poplock, __u16 *pnetfid, unsigned int xid)
{
int rc;
FILE_UNIX_BASIC_INFO *presp_data;
@@ -114,9 +120,9 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_fattr fattr;
struct tcon_link *tlink;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
- cFYI(1, "posix open %s", full_path);
+ cifs_dbg(FYI, "posix open %s\n", full_path);
presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
if (presp_data == NULL)
@@ -168,15 +174,21 @@ posix_open_ret:
static int
cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
- struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
- __u16 *pnetfid, int xid)
+ struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock,
+ struct cifs_fid *fid, unsigned int xid)
{
int rc;
- int desiredAccess;
+ int desired_access;
int disposition;
+ int create_options = CREATE_NOT_DIR;
FILE_ALL_INFO *buf;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ struct cifs_open_parms oparms;
- desiredAccess = cifs_convert_flags(f_flags);
+ if (!server->ops->open)
+ return -ENOSYS;
+
+ desired_access = cifs_convert_flags(f_flags);
/*********************************************************************
* open flag mapping table:
@@ -210,16 +222,19 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
if (!buf)
return -ENOMEM;
- if (tcon->ses->capabilities & CAP_NT_SMBS)
- rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
- desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
- & CIFS_MOUNT_MAP_SPECIAL_CHR);
- else
- rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
- desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
- & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = desired_access;
+ oparms.create_options = create_options;
+ oparms.disposition = disposition;
+ oparms.path = full_path;
+ oparms.fid = fid;
+ oparms.reconnect = false;
+
+ rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc)
goto out;
@@ -229,52 +244,110 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
xid);
else
rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
- xid, pnetfid);
+ xid, fid);
out:
kfree(buf);
return rc;
}
+static bool
+cifs_has_mand_locks(struct cifsInodeInfo *cinode)
+{
+ struct cifs_fid_locks *cur;
+ bool has_locks = false;
+
+ down_read(&cinode->lock_sem);
+ list_for_each_entry(cur, &cinode->llist, llist) {
+ if (!list_empty(&cur->locks)) {
+ has_locks = true;
+ break;
+ }
+ }
+ up_read(&cinode->lock_sem);
+ return has_locks;
+}
+
struct cifsFileInfo *
-cifs_new_fileinfo(__u16 fileHandle, struct file *file,
+cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
{
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
- struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
- struct cifsFileInfo *pCifsFile;
-
- pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
- if (pCifsFile == NULL)
- return pCifsFile;
-
- pCifsFile->count = 1;
- pCifsFile->netfid = fileHandle;
- pCifsFile->pid = current->tgid;
- pCifsFile->uid = current_fsuid();
- pCifsFile->dentry = dget(dentry);
- pCifsFile->f_flags = file->f_flags;
- pCifsFile->invalidHandle = false;
- pCifsFile->tlink = cifs_get_tlink(tlink);
- mutex_init(&pCifsFile->fh_mutex);
- mutex_init(&pCifsFile->lock_mutex);
- INIT_LIST_HEAD(&pCifsFile->llist);
- INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ struct cifsFileInfo *cfile;
+ struct cifs_fid_locks *fdlocks;
+ struct cifs_tcon *tcon = tlink_tcon(tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+ if (cfile == NULL)
+ return cfile;
+
+ fdlocks = kzalloc(sizeof(struct cifs_fid_locks), GFP_KERNEL);
+ if (!fdlocks) {
+ kfree(cfile);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&fdlocks->locks);
+ fdlocks->cfile = cfile;
+ cfile->llist = fdlocks;
+ down_write(&cinode->lock_sem);
+ list_add(&fdlocks->llist, &cinode->llist);
+ up_write(&cinode->lock_sem);
+
+ cfile->count = 1;
+ cfile->pid = current->tgid;
+ cfile->uid = current_fsuid();
+ cfile->dentry = dget(dentry);
+ cfile->f_flags = file->f_flags;
+ cfile->invalidHandle = false;
+ cfile->tlink = cifs_get_tlink(tlink);
+ INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
+ mutex_init(&cfile->fh_mutex);
+
+ cifs_sb_active(inode->i_sb);
+
+ /*
+ * If the server returned a read oplock and we have mandatory brlocks,
+ * set oplock level to None.
+ */
+ if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) {
+ cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n");
+ oplock = 0;
+ }
spin_lock(&cifs_file_list_lock);
- list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
+ if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)
+ oplock = fid->pending_open->oplock;
+ list_del(&fid->pending_open->olist);
+
+ fid->purge_cache = false;
+ server->ops->set_fid(cfile, fid, oplock);
+
+ list_add(&cfile->tlist, &tcon->openFileList);
/* if readable file instance put first in list*/
if (file->f_mode & FMODE_READ)
- list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+ list_add(&cfile->flist, &cinode->openFileList);
else
- list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
+ list_add_tail(&cfile->flist, &cinode->openFileList);
spin_unlock(&cifs_file_list_lock);
- cifs_set_oplock_level(pCifsInode, oplock);
+ if (fid->purge_cache)
+ cifs_zap_mapping(inode);
+
+ file->private_data = cfile;
+ return cfile;
+}
- file->private_data = pCifsFile;
- return pCifsFile;
+struct cifsFileInfo *
+cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+{
+ spin_lock(&cifs_file_list_lock);
+ cifsFileInfo_get_locked(cifs_file);
+ spin_unlock(&cifs_file_list_lock);
+ return cifs_file;
}
/*
@@ -285,10 +358,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
struct inode *inode = cifs_file->dentry->d_inode;
- struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifsLockInfo *li, *tmp;
+ struct cifs_fid fid;
+ struct cifs_pending_open open;
spin_lock(&cifs_file_list_lock);
if (--cifs_file->count > 0) {
@@ -296,69 +373,90 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
return;
}
+ if (server->ops->get_lease_key)
+ server->ops->get_lease_key(inode, &fid);
+
+ /* store open in pending opens to make sure we don't miss lease break */
+ cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open);
+
/* remove it from the lists */
list_del(&cifs_file->flist);
list_del(&cifs_file->tlist);
if (list_empty(&cifsi->openFileList)) {
- cFYI(1, "closing last open instance for inode %p",
- cifs_file->dentry->d_inode);
-
- /* in strict cache mode we need invalidate mapping on the last
- close because it may cause a error when we open this file
- again and get at least level II oplock */
+ cifs_dbg(FYI, "closing last open instance for inode %p\n",
+ cifs_file->dentry->d_inode);
+ /*
+ * In strict cache mode we need to invalidate the mapping on the last
+ * close because it may cause an error when we open this file
+ * again and get at least a level II oplock.
+ */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
- CIFS_I(inode)->invalid_mapping = true;
-
+ set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags);
cifs_set_oplock_level(cifsi, 0);
}
spin_unlock(&cifs_file_list_lock);
+ cancel_work_sync(&cifs_file->oplock_break);
+
if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
- int xid, rc;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int xid;
- xid = GetXid();
- rc = CIFSSMBClose(xid, tcon, cifs_file->netfid);
- FreeXid(xid);
+ xid = get_xid();
+ if (server->ops->close)
+ server->ops->close(xid, tcon, &cifs_file->fid);
+ _free_xid(xid);
}
- /* Delete any outstanding lock records. We'll lose them when the file
+ cifs_del_pending_open(&open);
+
+ /*
+ * Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
*/
- mutex_lock(&cifs_file->lock_mutex);
- list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
+ down_write(&cifsi->lock_sem);
+ list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
list_del(&li->llist);
+ cifs_del_lock_waiters(li);
kfree(li);
}
- mutex_unlock(&cifs_file->lock_mutex);
+ list_del(&cifs_file->llist->llist);
+ kfree(cifs_file->llist);
+ up_write(&cifsi->lock_sem);
cifs_put_tlink(cifs_file->tlink);
dput(cifs_file->dentry);
+ cifs_sb_deactive(sb);
kfree(cifs_file);
}
int cifs_open(struct inode *inode, struct file *file)
+
{
int rc = -EACCES;
- int xid;
+ unsigned int xid;
__u32 oplock;
struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *tcon;
+ struct TCP_Server_Info *server;
+ struct cifs_tcon *tcon;
struct tcon_link *tlink;
- struct cifsFileInfo *pCifsFile = NULL;
+ struct cifsFileInfo *cfile = NULL;
char *full_path = NULL;
bool posix_open_ok = false;
- __u16 netfid;
+ struct cifs_fid fid;
+ struct cifs_pending_open open;
- xid = GetXid();
+ xid = get_xid();
cifs_sb = CIFS_SB(inode->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
- FreeXid(xid);
+ free_xid(xid);
return PTR_ERR(tlink);
}
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
full_path = build_path_from_dentry(file->f_path.dentry);
if (full_path == NULL) {
@@ -366,51 +464,61 @@ int cifs_open(struct inode *inode, struct file *file)
goto out;
}
- cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
+ cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
inode, file->f_flags, full_path);
- if (oplockEnabled)
+ if (server->oplocks)
oplock = REQ_OPLOCK;
else
oplock = 0;
if (!tcon->broken_posix_open && tcon->unix_ext &&
- (tcon->ses->capabilities & CAP_UNIX) &&
- (CIFS_UNIX_POSIX_PATH_OPS_CAP &
- le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+ cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+ le64_to_cpu(tcon->fsUnixInfo.Capability))) {
/* can not refresh inode info since size could be stale */
rc = cifs_posix_open(full_path, &inode, inode->i_sb,
cifs_sb->mnt_file_mode /* ignored */,
- file->f_flags, &oplock, &netfid, xid);
+ file->f_flags, &oplock, &fid.netfid, xid);
if (rc == 0) {
- cFYI(1, "posix open succeeded");
+ cifs_dbg(FYI, "posix open succeeded\n");
posix_open_ok = true;
} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
if (tcon->ses->serverNOS)
- cERROR(1, "server %s of type %s returned"
- " unexpected error on SMB posix open"
- ", disabling posix open support."
- " Check if server update available.",
- tcon->ses->serverName,
- tcon->ses->serverNOS);
+ cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n",
+ tcon->ses->serverName,
+ tcon->ses->serverNOS);
tcon->broken_posix_open = true;
} else if ((rc != -EIO) && (rc != -EREMOTE) &&
(rc != -EOPNOTSUPP)) /* path not found or net err */
goto out;
- /* else fallthrough to retry open the old way on network i/o
- or DFS errors */
+ /*
+ * Else fallthrough to retry open the old way on network i/o
+ * or DFS errors.
+ */
}
+ if (server->ops->get_lease_key)
+ server->ops->get_lease_key(inode, &fid);
+
+ cifs_add_pending_open(&fid, tlink, &open);
+
if (!posix_open_ok) {
+ if (server->ops->get_lease_key)
+ server->ops->get_lease_key(inode, &fid);
+
rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
- file->f_flags, &oplock, &netfid, xid);
- if (rc)
+ file->f_flags, &oplock, &fid, xid);
+ if (rc) {
+ cifs_del_pending_open(&open);
goto out;
+ }
}
- pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
- if (pCifsFile == NULL) {
- CIFSSMBClose(xid, tcon, netfid);
+ cfile = cifs_new_fileinfo(&fid, file, tlink, oplock);
+ if (cfile == NULL) {
+ if (server->ops->close)
+ server->ops->close(xid, tcon, &fid);
+ cifs_del_pending_open(&open);
rc = -ENOMEM;
goto out;
}
@@ -418,165 +526,217 @@ int cifs_open(struct inode *inode, struct file *file)
cifs_fscache_set_inode_cookie(inode, file);
if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
- /* time to set mode which we can not set earlier due to
- problems creating new read-only files */
+ /*
+ * Time to set mode which we can not set earlier due to
+ * problems creating new read-only files.
+ */
struct cifs_unix_set_info_args args = {
.mode = inode->i_mode,
- .uid = NO_CHANGE_64,
- .gid = NO_CHANGE_64,
+ .uid = INVALID_UID, /* no change */
+ .gid = INVALID_GID, /* no change */
.ctime = NO_CHANGE_64,
.atime = NO_CHANGE_64,
.mtime = NO_CHANGE_64,
.device = 0,
};
- CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid,
- pCifsFile->pid);
+ CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid.netfid,
+ cfile->pid);
}
out:
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
return rc;
}
-/* Try to reacquire byte range locks that were released when session */
-/* to server was lost */
-static int cifs_relock_file(struct cifsFileInfo *cifsFile)
+static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
+
+/*
+ * Try to reacquire byte range locks that were released when session
+ * to server was lost.
+ */
+static int
+cifs_relock_file(struct cifsFileInfo *cfile)
{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
-/* BB list all locks open on this file and relock */
+ down_read(&cinode->lock_sem);
+ if (cinode->can_cache_brlcks) {
+ /* can cache locks - no need to relock */
+ up_read(&cinode->lock_sem);
+ return rc;
+ }
+
+ if (cap_unix(tcon->ses) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ rc = cifs_push_posix_locks(cfile);
+ else
+ rc = tcon->ses->server->ops->push_mand_locks(cfile);
+ up_read(&cinode->lock_sem);
return rc;
}
-static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
+static int
+cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
{
int rc = -EACCES;
- int xid;
+ unsigned int xid;
__u32 oplock;
struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *tcon;
- struct cifsInodeInfo *pCifsInode;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ struct cifsInodeInfo *cinode;
struct inode *inode;
char *full_path = NULL;
- int desiredAccess;
+ int desired_access;
int disposition = FILE_OPEN;
- __u16 netfid;
+ int create_options = CREATE_NOT_DIR;
+ struct cifs_open_parms oparms;
- xid = GetXid();
- mutex_lock(&pCifsFile->fh_mutex);
- if (!pCifsFile->invalidHandle) {
- mutex_unlock(&pCifsFile->fh_mutex);
+ xid = get_xid();
+ mutex_lock(&cfile->fh_mutex);
+ if (!cfile->invalidHandle) {
+ mutex_unlock(&cfile->fh_mutex);
rc = 0;
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
- inode = pCifsFile->dentry->d_inode;
+ inode = cfile->dentry->d_inode;
cifs_sb = CIFS_SB(inode->i_sb);
- tcon = tlink_tcon(pCifsFile->tlink);
+ tcon = tlink_tcon(cfile->tlink);
+ server = tcon->ses->server;
-/* can not grab rename sem here because various ops, including
- those that already have the rename sem can end up causing writepage
- to get called and if the server was down that means we end up here,
- and we can never tell if the caller already has the rename_sem */
- full_path = build_path_from_dentry(pCifsFile->dentry);
+ /*
+ * Can not grab rename sem here because various ops, including those
+ * that already have the rename sem can end up causing writepage to get
+ * called and if the server was down that means we end up here, and we
+ * can never tell if the caller already has the rename_sem.
+ */
+ full_path = build_path_from_dentry(cfile->dentry);
if (full_path == NULL) {
rc = -ENOMEM;
- mutex_unlock(&pCifsFile->fh_mutex);
- FreeXid(xid);
+ mutex_unlock(&cfile->fh_mutex);
+ free_xid(xid);
return rc;
}
- cFYI(1, "inode = 0x%p file flags 0x%x for %s",
- inode, pCifsFile->f_flags, full_path);
+ cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n",
+ inode, cfile->f_flags, full_path);
- if (oplockEnabled)
+ if (tcon->ses->server->oplocks)
oplock = REQ_OPLOCK;
else
oplock = 0;
- if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+ if (tcon->unix_ext && cap_unix(tcon->ses) &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
- le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-
+ le64_to_cpu(tcon->fsUnixInfo.Capability))) {
/*
* O_CREAT, O_EXCL and O_TRUNC already had their effect on the
* original open. Must mask them off for a reopen.
*/
- unsigned int oflags = pCifsFile->f_flags &
+ unsigned int oflags = cfile->f_flags &
~(O_CREAT | O_EXCL | O_TRUNC);
rc = cifs_posix_open(full_path, NULL, inode->i_sb,
- cifs_sb->mnt_file_mode /* ignored */,
- oflags, &oplock, &netfid, xid);
+ cifs_sb->mnt_file_mode /* ignored */,
+ oflags, &oplock, &cfile->fid.netfid, xid);
if (rc == 0) {
- cFYI(1, "posix reopen succeeded");
+ cifs_dbg(FYI, "posix reopen succeeded\n");
+ oparms.reconnect = true;
goto reopen_success;
}
- /* fallthrough to retry open the old way on errors, especially
- in the reconnect path it is important to retry hard */
+ /*
+ * Fall through to retry the open the old way on errors; especially
+ * in the reconnect path it is important to retry hard.
+ */
}
- desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
+ desired_access = cifs_convert_flags(cfile->f_flags);
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ if (server->ops->get_lease_key)
+ server->ops->get_lease_key(inode, &cfile->fid);
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = desired_access;
+ oparms.create_options = create_options;
+ oparms.disposition = disposition;
+ oparms.path = full_path;
+ oparms.fid = &cfile->fid;
+ oparms.reconnect = true;
- /* Can not refresh inode by passing in file_info buf to be returned
- by SMBOpen and then calling get_inode_info with returned buf
- since file might have write behind data that needs to be flushed
- and server version of file size can be stale. If we knew for sure
- that inode was not dirty locally we could do this */
+ /*
+ * Can not refresh inode by passing in file_info buf to be returned by
+ * ops->open and then calling get_inode_info with returned buf since
+ * file might have write behind data that needs to be flushed and server
+ * version of file size can be stale. If we knew for sure that inode was
+ * not dirty locally we could do this.
+ */
+ rc = server->ops->open(xid, &oparms, &oplock, NULL);
+ if (rc == -ENOENT && oparms.reconnect == false) {
+ /* durable handle timeout is expired - open the file again */
+ rc = server->ops->open(xid, &oparms, &oplock, NULL);
+ /* indicate that we need to relock the file */
+ oparms.reconnect = true;
+ }
- rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc) {
- mutex_unlock(&pCifsFile->fh_mutex);
- cFYI(1, "cifs_open returned 0x%x", rc);
- cFYI(1, "oplock: %d", oplock);
+ mutex_unlock(&cfile->fh_mutex);
+ cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc);
+ cifs_dbg(FYI, "oplock: %d\n", oplock);
goto reopen_error_exit;
}
reopen_success:
- pCifsFile->netfid = netfid;
- pCifsFile->invalidHandle = false;
- mutex_unlock(&pCifsFile->fh_mutex);
- pCifsInode = CIFS_I(inode);
+ cfile->invalidHandle = false;
+ mutex_unlock(&cfile->fh_mutex);
+ cinode = CIFS_I(inode);
if (can_flush) {
rc = filemap_write_and_wait(inode->i_mapping);
mapping_set_error(inode->i_mapping, rc);
if (tcon->unix_ext)
- rc = cifs_get_inode_info_unix(&inode,
- full_path, inode->i_sb, xid);
+ rc = cifs_get_inode_info_unix(&inode, full_path,
+ inode->i_sb, xid);
else
- rc = cifs_get_inode_info(&inode,
- full_path, NULL, inode->i_sb,
- xid, NULL);
- } /* else we are writing out data to server already
- and could deadlock if we tried to flush data, and
- since we do not know if we have data that would
- invalidate the current end of file on the server
- we can not go to the server to get the new inod
- info */
-
- cifs_set_oplock_level(pCifsInode, oplock);
+ rc = cifs_get_inode_info(&inode, full_path, NULL,
+ inode->i_sb, xid, NULL);
+ }
+ /*
+ * Else we are writing out data to server already and could deadlock if
+ * we tried to flush data, and since we do not know if we have data that
+ * would invalidate the current end of file on the server, we can not go
+ * to the server to get the new inode info.
+ */
- cifs_relock_file(pCifsFile);
+ server->ops->set_fid(cfile, &cfile->fid, oplock);
+ if (oparms.reconnect)
+ cifs_relock_file(cfile);
reopen_error_exit:
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
int cifs_close(struct inode *inode, struct file *file)
{
- cifsFileInfo_put(file->private_data);
- file->private_data = NULL;
+ if (file->private_data != NULL) {
+ cifsFileInfo_put(file->private_data);
+ file->private_data = NULL;
+ }
/* return code from the ->release op is always ignored */
return 0;
@@ -585,391 +745,921 @@ int cifs_close(struct inode *inode, struct file *file)
int cifs_closedir(struct inode *inode, struct file *file)
{
int rc = 0;
- int xid;
- struct cifsFileInfo *pCFileStruct = file->private_data;
- char *ptmp;
+ unsigned int xid;
+ struct cifsFileInfo *cfile = file->private_data;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ char *buf;
- cFYI(1, "Closedir inode = 0x%p", inode);
+ cifs_dbg(FYI, "Closedir inode = 0x%p\n", inode);
- xid = GetXid();
+ if (cfile == NULL)
+ return rc;
- if (pCFileStruct) {
- struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
+ xid = get_xid();
+ tcon = tlink_tcon(cfile->tlink);
+ server = tcon->ses->server;
- cFYI(1, "Freeing private data in close dir");
- spin_lock(&cifs_file_list_lock);
- if (!pCFileStruct->srch_inf.endOfSearch &&
- !pCFileStruct->invalidHandle) {
- pCFileStruct->invalidHandle = true;
- spin_unlock(&cifs_file_list_lock);
- rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
- cFYI(1, "Closing uncompleted readdir with rc %d",
- rc);
- /* not much we can do if it fails anyway, ignore rc */
- rc = 0;
- } else
- spin_unlock(&cifs_file_list_lock);
- ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
- if (ptmp) {
- cFYI(1, "closedir free smb buf in srch struct");
- pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
- if (pCFileStruct->srch_inf.smallBuf)
- cifs_small_buf_release(ptmp);
- else
- cifs_buf_release(ptmp);
- }
- cifs_put_tlink(pCFileStruct->tlink);
- kfree(file->private_data);
- file->private_data = NULL;
+ cifs_dbg(FYI, "Freeing private data in close dir\n");
+ spin_lock(&cifs_file_list_lock);
+ if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
+ cfile->invalidHandle = true;
+ spin_unlock(&cifs_file_list_lock);
+ if (server->ops->close_dir)
+ rc = server->ops->close_dir(xid, tcon, &cfile->fid);
+ else
+ rc = -ENOSYS;
+ cifs_dbg(FYI, "Closing uncompleted readdir with rc %d\n", rc);
+ /* not much we can do if it fails anyway, ignore rc */
+ rc = 0;
+ } else
+ spin_unlock(&cifs_file_list_lock);
+
+ buf = cfile->srch_inf.ntwrk_buf_start;
+ if (buf) {
+ cifs_dbg(FYI, "closedir free smb buf in srch struct\n");
+ cfile->srch_inf.ntwrk_buf_start = NULL;
+ if (cfile->srch_inf.smallBuf)
+ cifs_small_buf_release(buf);
+ else
+ cifs_buf_release(buf);
}
+
+ cifs_put_tlink(cfile->tlink);
+ kfree(file->private_data);
+ file->private_data = NULL;
/* BB can we lock the filestruct while this is going on? */
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
-static int store_file_lock(struct cifsFileInfo *fid, __u64 len,
- __u64 offset, __u8 lockType)
+static struct cifsLockInfo *
+cifs_lock_init(__u64 offset, __u64 length, __u8 type)
{
- struct cifsLockInfo *li =
+ struct cifsLockInfo *lock =
kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
- if (li == NULL)
- return -ENOMEM;
- li->offset = offset;
- li->length = len;
- li->type = lockType;
- mutex_lock(&fid->lock_mutex);
- list_add(&li->llist, &fid->llist);
- mutex_unlock(&fid->lock_mutex);
- return 0;
+ if (!lock)
+ return lock;
+ lock->offset = offset;
+ lock->length = length;
+ lock->type = type;
+ lock->pid = current->tgid;
+ INIT_LIST_HEAD(&lock->blist);
+ init_waitqueue_head(&lock->block_q);
+ return lock;
}
-int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
+void
+cifs_del_lock_waiters(struct cifsLockInfo *lock)
{
- int rc, xid;
- __u32 numLock = 0;
- __u32 numUnlock = 0;
+ struct cifsLockInfo *li, *tmp;
+ list_for_each_entry_safe(li, tmp, &lock->blist, blist) {
+ list_del_init(&li->blist);
+ wake_up(&li->block_q);
+ }
+}
+
+#define CIFS_LOCK_OP 0
+#define CIFS_READ_OP 1
+#define CIFS_WRITE_OP 2
+
+/* @rw_check : 0 - no op, 1 - read, 2 - write */
+static bool
+cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
+ __u64 length, __u8 type, struct cifsFileInfo *cfile,
+ struct cifsLockInfo **conf_lock, int rw_check)
+{
+ struct cifsLockInfo *li;
+ struct cifsFileInfo *cur_cfile = fdlocks->cfile;
+ struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+
+ list_for_each_entry(li, &fdlocks->locks, llist) {
+ if (offset + length <= li->offset ||
+ offset >= li->offset + li->length)
+ continue;
+ if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid &&
+ server->ops->compare_fids(cfile, cur_cfile)) {
+ /* shared lock prevents write op through the same fid */
+ if (!(li->type & server->vals->shared_lock_type) ||
+ rw_check != CIFS_WRITE_OP)
+ continue;
+ }
+ if ((type & server->vals->shared_lock_type) &&
+ ((server->ops->compare_fids(cfile, cur_cfile) &&
+ current->tgid == li->pid) || type == li->type))
+ continue;
+ if (conf_lock)
+ *conf_lock = li;
+ return true;
+ }
+ return false;
+}
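
The conflict walk in cifs_find_fid_lock_conflict() boils down to a byte-range overlap test: two ranges collide unless one ends at or before the other begins. A minimal userspace sketch of that check (simplified names, not the kernel's types):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Two byte ranges [off, off+len) overlap unless one ends before the other starts. */
static bool ranges_overlap(uint64_t off1, uint64_t len1, uint64_t off2, uint64_t len2)
{
	return !(off1 + len1 <= off2 || off1 >= off2 + len2);
}

int main(void)
{
	/* Mirrors the "continue" condition at the top of the lock walk. */
	printf("%d\n", ranges_overlap(0, 10, 10, 5));	/* 0: adjacent, no overlap */
	printf("%d\n", ranges_overlap(0, 10, 9, 5));	/* 1: one byte shared */
	return 0;
}
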
+
+bool
+cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
+ __u8 type, struct cifsLockInfo **conf_lock,
+ int rw_check)
+{
+ bool rc = false;
+ struct cifs_fid_locks *cur;
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+
+ list_for_each_entry(cur, &cinode->llist, llist) {
+ rc = cifs_find_fid_lock_conflict(cur, offset, length, type,
+ cfile, conf_lock, rw_check);
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
+
+/*
+ * Check if there is another lock that prevents us from setting the lock
+ * (mandatory style). If such a lock exists, update the flock structure with
+ * its properties. Otherwise, set the flock type to F_UNLCK if we can cache
+ * brlocks, or leave it unchanged if we can't. Returns 0 if we don't need to
+ * send a request to the server, or 1 otherwise.
+ */
+static int
+cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
+ __u8 type, struct file_lock *flock)
+{
+ int rc = 0;
+ struct cifsLockInfo *conf_lock;
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+ bool exist;
+
+ down_read(&cinode->lock_sem);
+
+ exist = cifs_find_lock_conflict(cfile, offset, length, type,
+ &conf_lock, CIFS_LOCK_OP);
+ if (exist) {
+ flock->fl_start = conf_lock->offset;
+ flock->fl_end = conf_lock->offset + conf_lock->length - 1;
+ flock->fl_pid = conf_lock->pid;
+ if (conf_lock->type & server->vals->shared_lock_type)
+ flock->fl_type = F_RDLCK;
+ else
+ flock->fl_type = F_WRLCK;
+ } else if (!cinode->can_cache_brlcks)
+ rc = 1;
+ else
+ flock->fl_type = F_UNLCK;
+
+ up_read(&cinode->lock_sem);
+ return rc;
+}
+
+static void
+cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
+{
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ down_write(&cinode->lock_sem);
+ list_add_tail(&lock->llist, &cfile->llist->locks);
+ up_write(&cinode->lock_sem);
+}
+
+/*
+ * Set the byte-range lock (mandatory style). Returns:
+ * 1) 0, if we set the lock and don't need to send a request to the server;
+ * 2) 1, if no locks prevent us but we need to send a request to the server;
+ * 3) -EACCES, if there is a lock that prevents us and wait is false.
+ */
+static int
+cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
+ bool wait)
+{
+ struct cifsLockInfo *conf_lock;
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ bool exist;
+ int rc = 0;
+
+try_again:
+ exist = false;
+ down_write(&cinode->lock_sem);
+
+ exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
+ lock->type, &conf_lock, CIFS_LOCK_OP);
+ if (!exist && cinode->can_cache_brlcks) {
+ list_add_tail(&lock->llist, &cfile->llist->locks);
+ up_write(&cinode->lock_sem);
+ return rc;
+ }
+
+ if (!exist)
+ rc = 1;
+ else if (!wait)
+ rc = -EACCES;
+ else {
+ list_add_tail(&lock->blist, &conf_lock->blist);
+ up_write(&cinode->lock_sem);
+ rc = wait_event_interruptible(lock->block_q,
+ (lock->blist.prev == &lock->blist) &&
+ (lock->blist.next == &lock->blist));
+ if (!rc)
+ goto try_again;
+ down_write(&cinode->lock_sem);
+ list_del_init(&lock->blist);
+ }
+
+ up_write(&cinode->lock_sem);
+ return rc;
+}
+
+/*
+ * Check if there is another lock that prevents us from setting the lock
+ * (posix style). If such a lock exists, update the flock structure with its
+ * properties. Otherwise, set the flock type to F_UNLCK if we can cache
+ * brlocks, or leave it unchanged if we can't. Returns 0 if we don't need to
+ * send a request to the server, or 1 otherwise.
+ */
+static int
+cifs_posix_lock_test(struct file *file, struct file_lock *flock)
+{
+ int rc = 0;
+ struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
+ unsigned char saved_type = flock->fl_type;
+
+ if ((flock->fl_flags & FL_POSIX) == 0)
+ return 1;
+
+ down_read(&cinode->lock_sem);
+ posix_test_lock(file, flock);
+
+ if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
+ flock->fl_type = saved_type;
+ rc = 1;
+ }
+
+ up_read(&cinode->lock_sem);
+ return rc;
+}
+
+/*
+ * Set the byte-range lock (posix style). Returns:
+ * 1) 0, if we set the lock and don't need to send a request to the server;
+ * 2) 1, if we need to send a request to the server;
+ * 3) <0, if an error occurs while setting the lock.
+ */
+static int
+cifs_posix_lock_set(struct file *file, struct file_lock *flock)
+{
+ struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
+ int rc = 1;
+
+ if ((flock->fl_flags & FL_POSIX) == 0)
+ return rc;
+
+try_again:
+ down_write(&cinode->lock_sem);
+ if (!cinode->can_cache_brlcks) {
+ up_write(&cinode->lock_sem);
+ return rc;
+ }
+
+ rc = posix_lock_file(file, flock, NULL);
+ up_write(&cinode->lock_sem);
+ if (rc == FILE_LOCK_DEFERRED) {
+ rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
+ if (!rc)
+ goto try_again;
+ posix_unblock_lock(flock);
+ }
+ return rc;
+}
+
+int
+cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
+{
+ unsigned int xid;
+ int rc = 0, stored_rc;
+ struct cifsLockInfo *li, *tmp;
+ struct cifs_tcon *tcon;
+ unsigned int num, max_num, max_buf;
+ LOCKING_ANDX_RANGE *buf, *cur;
+ int types[] = {LOCKING_ANDX_LARGE_FILES,
+ LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
+ int i;
+
+ xid = get_xid();
+ tcon = tlink_tcon(cfile->tlink);
+
+ /*
+ * Accessing maxBuf is racy with cifs_reconnect - need to store value
+ * and check it for zero before using.
+ */
+ max_buf = tcon->ses->server->maxBuf;
+ if (!max_buf) {
+ free_xid(xid);
+ return -EINVAL;
+ }
+
+ max_num = (max_buf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
+ buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ if (!buf) {
+ free_xid(xid);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < 2; i++) {
+ cur = buf;
+ num = 0;
+ list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
+ if (li->type != types[i])
+ continue;
+ cur->Pid = cpu_to_le16(li->pid);
+ cur->LengthLow = cpu_to_le32((u32)li->length);
+ cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
+ cur->OffsetLow = cpu_to_le32((u32)li->offset);
+ cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
+ if (++num == max_num) {
+ stored_rc = cifs_lockv(xid, tcon,
+ cfile->fid.netfid,
+ (__u8)li->type, 0, num,
+ buf);
+ if (stored_rc)
+ rc = stored_rc;
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
+ }
+
+ if (num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid,
+ (__u8)types[i], 0, num, buf);
+ if (stored_rc)
+ rc = stored_rc;
+ }
+ }
+
+ kfree(buf);
+ free_xid(xid);
+ return rc;
+}
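
cifs_push_mandatory_locks() batches stored locks into LOCKING_ANDX requests, bounded by how many ranges fit in one SMB buffer. A rough sketch of that sizing arithmetic with placeholder sizes (the real values come from sizeof(struct smb_hdr) and the large-file lock wire format, so the numbers below are illustrative only):

#include <stdio.h>

/* Illustrative sizes only; the kernel uses sizeof(struct smb_hdr) and
 * sizeof(LOCKING_ANDX_RANGE). */
#define SMB_HDR_SIZE	32
#define LOCK_RANGE_SIZE	20

int main(void)
{
	unsigned int max_buf = 16644;	/* example negotiated buffer size */
	unsigned int max_num;

	if (!max_buf)			/* mirrors the zero check against reconnect races */
		return 1;

	max_num = (max_buf - SMB_HDR_SIZE) / LOCK_RANGE_SIZE;
	printf("at most %u lock ranges per request\n", max_num);
	return 0;
}
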
+
+/* copied from fs/locks.c with a name change */
+#define cifs_for_each_lock(inode, lockp) \
+ for (lockp = &inode->i_flock; *lockp != NULL; \
+ lockp = &(*lockp)->fl_next)
+
+struct lock_to_push {
+ struct list_head llist;
+ __u64 offset;
__u64 length;
- bool wait_flag = false;
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *tcon;
+ __u32 pid;
__u16 netfid;
- __u8 lockType = LOCKING_ANDX_LARGE_FILES;
- bool posix_locking = 0;
+ __u8 type;
+};
- length = 1 + pfLock->fl_end - pfLock->fl_start;
- rc = -EACCES;
- xid = GetXid();
-
- cFYI(1, "Lock parm: 0x%x flockflags: "
- "0x%x flocktype: 0x%x start: %lld end: %lld",
- cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
- pfLock->fl_end);
-
- if (pfLock->fl_flags & FL_POSIX)
- cFYI(1, "Posix");
- if (pfLock->fl_flags & FL_FLOCK)
- cFYI(1, "Flock");
- if (pfLock->fl_flags & FL_SLEEP) {
- cFYI(1, "Blocking lock");
- wait_flag = true;
- }
- if (pfLock->fl_flags & FL_ACCESS)
- cFYI(1, "Process suspended by mandatory locking - "
- "not implemented yet");
- if (pfLock->fl_flags & FL_LEASE)
- cFYI(1, "Lease on file - not implemented yet");
- if (pfLock->fl_flags &
- (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
- cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
-
- if (pfLock->fl_type == F_WRLCK) {
- cFYI(1, "F_WRLCK ");
- numLock = 1;
- } else if (pfLock->fl_type == F_UNLCK) {
- cFYI(1, "F_UNLCK");
- numUnlock = 1;
- /* Check if unlock includes more than
- one lock range */
- } else if (pfLock->fl_type == F_RDLCK) {
- cFYI(1, "F_RDLCK");
- lockType |= LOCKING_ANDX_SHARED_LOCK;
- numLock = 1;
- } else if (pfLock->fl_type == F_EXLCK) {
- cFYI(1, "F_EXLCK");
- numLock = 1;
- } else if (pfLock->fl_type == F_SHLCK) {
- cFYI(1, "F_SHLCK");
- lockType |= LOCKING_ANDX_SHARED_LOCK;
- numLock = 1;
- } else
- cFYI(1, "Unknown type of lock");
+static int
+cifs_push_posix_locks(struct cifsFileInfo *cfile)
+{
+ struct inode *inode = cfile->dentry->d_inode;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct file_lock *flock, **before;
+ unsigned int count = 0, i = 0;
+ int rc = 0, xid, type;
+ struct list_head locks_to_send, *el;
+ struct lock_to_push *lck, *tmp;
+ __u64 length;
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
- netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
+ xid = get_xid();
- if ((tcon->ses->capabilities & CAP_UNIX) &&
- (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
- ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- posix_locking = 1;
- /* BB add code here to normalize offset and length to
- account for negative length which we can not accept over the
- wire */
- if (IS_GETLK(cmd)) {
- if (posix_locking) {
- int posix_lock_type;
- if (lockType & LOCKING_ANDX_SHARED_LOCK)
- posix_lock_type = CIFS_RDLCK;
- else
- posix_lock_type = CIFS_WRLCK;
- rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */,
- length, pfLock,
- posix_lock_type, wait_flag);
- FreeXid(xid);
- return rc;
- }
+ spin_lock(&inode->i_lock);
+ cifs_for_each_lock(inode, before) {
+ if ((*before)->fl_flags & FL_POSIX)
+ count++;
+ }
+ spin_unlock(&inode->i_lock);
- /* BB we could chain these into one lock request BB */
- rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
- 0, 1, lockType, 0 /* wait flag */, 0);
- if (rc == 0) {
- rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start, 1 /* numUnlock */ ,
- 0 /* numLock */ , lockType,
- 0 /* wait flag */, 0);
- pfLock->fl_type = F_UNLCK;
- if (rc != 0)
- cERROR(1, "Error unlocking previously locked "
- "range %d during test of lock", rc);
- rc = 0;
+ INIT_LIST_HEAD(&locks_to_send);
- } else {
- /* if rc == ERR_SHARING_VIOLATION ? */
- rc = 0;
+ /*
+ * Allocating count locks is enough because no FL_POSIX locks can be
+ * added to the list while we hold cinode->lock_sem, which
+ * protects locking operations on this inode.
+ */
+ for (; i < count; i++) {
+ lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
+ if (!lck) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+ list_add_tail(&lck->llist, &locks_to_send);
+ }
- if (lockType & LOCKING_ANDX_SHARED_LOCK) {
- pfLock->fl_type = F_WRLCK;
- } else {
- rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start, 0, 1,
- lockType | LOCKING_ANDX_SHARED_LOCK,
- 0 /* wait flag */, 0);
- if (rc == 0) {
- rc = CIFSSMBLock(xid, tcon, netfid,
- length, pfLock->fl_start, 1, 0,
- lockType |
- LOCKING_ANDX_SHARED_LOCK,
- 0 /* wait flag */, 0);
- pfLock->fl_type = F_RDLCK;
- if (rc != 0)
- cERROR(1, "Error unlocking "
- "previously locked range %d "
- "during test of lock", rc);
- rc = 0;
- } else {
- pfLock->fl_type = F_WRLCK;
- rc = 0;
- }
- }
+ el = locks_to_send.next;
+ spin_lock(&inode->i_lock);
+ cifs_for_each_lock(inode, before) {
+ flock = *before;
+ if ((flock->fl_flags & FL_POSIX) == 0)
+ continue;
+ if (el == &locks_to_send) {
+ /*
+ * The list ended. We don't have enough allocated
+ * structures - something is really wrong.
+ */
+ cifs_dbg(VFS, "Can't push all brlocks!\n");
+ break;
}
+ length = 1 + flock->fl_end - flock->fl_start;
+ if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
+ type = CIFS_RDLCK;
+ else
+ type = CIFS_WRLCK;
+ lck = list_entry(el, struct lock_to_push, llist);
+ lck->pid = flock->fl_pid;
+ lck->netfid = cfile->fid.netfid;
+ lck->length = length;
+ lck->type = type;
+ lck->offset = flock->fl_start;
+ el = el->next;
+ }
+ spin_unlock(&inode->i_lock);
+
+ list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
+ int stored_rc;
+
+ stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid,
+ lck->offset, lck->length, NULL,
+ lck->type, 0);
+ if (stored_rc)
+ rc = stored_rc;
+ list_del(&lck->llist);
+ kfree(lck);
+ }
+
+out:
+ free_xid(xid);
+ return rc;
+err_out:
+ list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
+ list_del(&lck->llist);
+ kfree(lck);
+ }
+ goto out;
+}
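
cifs_push_posix_locks() uses a two-pass pattern: count the FL_POSIX locks under the inode spinlock, allocate that many entries with the spinlock dropped (allocation may sleep), then walk the list again and fill them in. A toy standalone version of the same pattern (a plain array stands in for the locked list):

#include <stdio.h>
#include <stdlib.h>

struct entry { long pid, start, len; };

int main(void)
{
	long src[][3] = { {100, 0, 4096}, {100, 8192, 512} };	/* pretend locked list */
	size_t count = sizeof(src) / sizeof(src[0]);		/* pass 1: count */

	struct entry *out = calloc(count, sizeof(*out));	/* allocate, no lock held */
	if (!out)
		return 1;

	for (size_t i = 0; i < count; i++) {			/* pass 2: fill in */
		out[i].pid = src[i][0];
		out[i].start = src[i][1];
		out[i].len = src[i][2];
	}
	printf("prepared %zu locks to push\n", count);
	free(out);
	return 0;
}
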
+
+static int
+cifs_push_locks(struct cifsFileInfo *cfile)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ int rc = 0;
- FreeXid(xid);
+ /* we are going to update can_cache_brlcks here - need a write access */
+ down_write(&cinode->lock_sem);
+ if (!cinode->can_cache_brlcks) {
+ up_write(&cinode->lock_sem);
return rc;
}
- if (!numLock && !numUnlock) {
- /* if no lock or unlock then nothing
- to do since we do not know what it is */
- FreeXid(xid);
- return -EOPNOTSUPP;
+ if (cap_unix(tcon->ses) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ rc = cifs_push_posix_locks(cfile);
+ else
+ rc = tcon->ses->server->ops->push_mand_locks(cfile);
+
+ cinode->can_cache_brlcks = false;
+ up_write(&cinode->lock_sem);
+ return rc;
+}
+
+static void
+cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
+ bool *wait_flag, struct TCP_Server_Info *server)
+{
+ if (flock->fl_flags & FL_POSIX)
+ cifs_dbg(FYI, "Posix\n");
+ if (flock->fl_flags & FL_FLOCK)
+ cifs_dbg(FYI, "Flock\n");
+ if (flock->fl_flags & FL_SLEEP) {
+ cifs_dbg(FYI, "Blocking lock\n");
+ *wait_flag = true;
}
+ if (flock->fl_flags & FL_ACCESS)
+ cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n");
+ if (flock->fl_flags & FL_LEASE)
+ cifs_dbg(FYI, "Lease on file - not implemented yet\n");
+ if (flock->fl_flags &
+ (~(FL_POSIX | FL_FLOCK | FL_SLEEP |
+ FL_ACCESS | FL_LEASE | FL_CLOSE)))
+ cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
+
+ *type = server->vals->large_lock_type;
+ if (flock->fl_type == F_WRLCK) {
+ cifs_dbg(FYI, "F_WRLCK\n");
+ *type |= server->vals->exclusive_lock_type;
+ *lock = 1;
+ } else if (flock->fl_type == F_UNLCK) {
+ cifs_dbg(FYI, "F_UNLCK\n");
+ *type |= server->vals->unlock_lock_type;
+ *unlock = 1;
+ /* Check if unlock includes more than one lock range */
+ } else if (flock->fl_type == F_RDLCK) {
+ cifs_dbg(FYI, "F_RDLCK\n");
+ *type |= server->vals->shared_lock_type;
+ *lock = 1;
+ } else if (flock->fl_type == F_EXLCK) {
+ cifs_dbg(FYI, "F_EXLCK\n");
+ *type |= server->vals->exclusive_lock_type;
+ *lock = 1;
+ } else if (flock->fl_type == F_SHLCK) {
+ cifs_dbg(FYI, "F_SHLCK\n");
+ *type |= server->vals->shared_lock_type;
+ *lock = 1;
+ } else
+ cifs_dbg(FYI, "Unknown type of lock\n");
+}
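
cifs_read_flock() translates the VFS lock type into a protocol-neutral mask taken from server->vals. A compact sketch of the same mapping with illustrative flag values (the real bits are per-dialect):

#include <stdio.h>

/* Illustrative bit values; the kernel reads these from server->vals. */
#define LARGE_LOCK	0x10
#define EXCLUSIVE_LOCK	0x00
#define SHARED_LOCK	0x01
#define UNLOCK_LOCK	0x02

enum { RDLCK, WRLCK, UNLCK };	/* stand-ins for F_RDLCK/F_WRLCK/F_UNLCK */

static unsigned int flock_to_smb(int fl_type, int *lock, int *unlock)
{
	unsigned int type = LARGE_LOCK;

	switch (fl_type) {
	case WRLCK: type |= EXCLUSIVE_LOCK; *lock = 1;   break;
	case RDLCK: type |= SHARED_LOCK;    *lock = 1;   break;
	case UNLCK: type |= UNLOCK_LOCK;    *unlock = 1; break;
	}
	return type;
}

int main(void)
{
	int lock = 0, unlock = 0;

	printf("0x%x\n", flock_to_smb(RDLCK, &lock, &unlock));	/* shared | large */
	return 0;
}
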
+
+static int
+cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
+ bool wait_flag, bool posix_lck, unsigned int xid)
+{
+ int rc = 0;
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ __u16 netfid = cfile->fid.netfid;
- if (posix_locking) {
+ if (posix_lck) {
int posix_lock_type;
- if (lockType & LOCKING_ANDX_SHARED_LOCK)
+
+ rc = cifs_posix_lock_test(file, flock);
+ if (!rc)
+ return rc;
+
+ if (type & server->vals->shared_lock_type)
posix_lock_type = CIFS_RDLCK;
else
posix_lock_type = CIFS_WRLCK;
-
- if (numUnlock == 1)
- posix_lock_type = CIFS_UNLCK;
-
- rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */,
- length, pfLock,
+ rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid,
+ flock->fl_start, length, flock,
posix_lock_type, wait_flag);
- } else {
- struct cifsFileInfo *fid = file->private_data;
+ return rc;
+ }
- if (numLock) {
- rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start, 0, numLock, lockType,
- wait_flag, 0);
+ rc = cifs_lock_test(cfile, flock->fl_start, length, type, flock);
+ if (!rc)
+ return rc;
- if (rc == 0) {
- /* For Windows locks we must store them. */
- rc = store_file_lock(fid, length,
- pfLock->fl_start, lockType);
- }
- } else if (numUnlock) {
- /* For each stored lock that this unlock overlaps
- completely, unlock it. */
- int stored_rc = 0;
- struct cifsLockInfo *li, *tmp;
+ /* BB we could chain these into one lock request BB */
+ rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type,
+ 1, 0, false);
+ if (rc == 0) {
+ rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
+ type, 0, 1, false);
+ flock->fl_type = F_UNLCK;
+ if (rc != 0)
+ cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
+ rc);
+ return 0;
+ }
- rc = 0;
- mutex_lock(&fid->lock_mutex);
- list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
- if (pfLock->fl_start <= li->offset &&
- (pfLock->fl_start + length) >=
- (li->offset + li->length)) {
- stored_rc = CIFSSMBLock(xid, tcon,
- netfid, li->length,
- li->offset, 1, 0,
- li->type, false, 0);
- if (stored_rc)
- rc = stored_rc;
- else {
- list_del(&li->llist);
- kfree(li);
- }
- }
- }
- mutex_unlock(&fid->lock_mutex);
- }
+ if (type & server->vals->shared_lock_type) {
+ flock->fl_type = F_WRLCK;
+ return 0;
}
- if (pfLock->fl_flags & FL_POSIX)
- posix_lock_file_wait(file, pfLock);
- FreeXid(xid);
- return rc;
+ type &= ~server->vals->exclusive_lock_type;
+
+ rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
+ type | server->vals->shared_lock_type,
+ 1, 0, false);
+ if (rc == 0) {
+ rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
+ type | server->vals->shared_lock_type, 0, 1, false);
+ flock->fl_type = F_RDLCK;
+ if (rc != 0)
+ cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
+ rc);
+ } else
+ flock->fl_type = F_WRLCK;
+
+ return 0;
}
-/* update the file size (if needed) after a write */
void
-cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
- unsigned int bytes_written)
+cifs_move_llist(struct list_head *source, struct list_head *dest)
{
- loff_t end_of_write = offset + bytes_written;
+ struct list_head *li, *tmp;
+ list_for_each_safe(li, tmp, source)
+ list_move(li, dest);
+}
- if (end_of_write > cifsi->server_eof)
- cifsi->server_eof = end_of_write;
+void
+cifs_free_llist(struct list_head *llist)
+{
+ struct cifsLockInfo *li, *tmp;
+ list_for_each_entry_safe(li, tmp, llist, llist) {
+ cifs_del_lock_waiters(li);
+ list_del(&li->llist);
+ kfree(li);
+ }
+}
+
+int
+cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
+ unsigned int xid)
+{
+ int rc = 0, stored_rc;
+ int types[] = {LOCKING_ANDX_LARGE_FILES,
+ LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
+ unsigned int i;
+ unsigned int max_num, num, max_buf;
+ LOCKING_ANDX_RANGE *buf, *cur;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsLockInfo *li, *tmp;
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct list_head tmp_llist;
+
+ INIT_LIST_HEAD(&tmp_llist);
+
+ /*
+ * Accessing maxBuf is racy with cifs_reconnect - need to store value
+ * and check it for zero before using.
+ */
+ max_buf = tcon->ses->server->maxBuf;
+ if (!max_buf)
+ return -EINVAL;
+
+ max_num = (max_buf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
+ buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ down_write(&cinode->lock_sem);
+ for (i = 0; i < 2; i++) {
+ cur = buf;
+ num = 0;
+ list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
+ if (flock->fl_start > li->offset ||
+ (flock->fl_start + length) <
+ (li->offset + li->length))
+ continue;
+ if (current->tgid != li->pid)
+ continue;
+ if (types[i] != li->type)
+ continue;
+ if (cinode->can_cache_brlcks) {
+ /*
+ * We can cache brlock requests - simply remove
+ * a lock from the file's list.
+ */
+ list_del(&li->llist);
+ cifs_del_lock_waiters(li);
+ kfree(li);
+ continue;
+ }
+ cur->Pid = cpu_to_le16(li->pid);
+ cur->LengthLow = cpu_to_le32((u32)li->length);
+ cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
+ cur->OffsetLow = cpu_to_le32((u32)li->offset);
+ cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
+ /*
+ * Save the lock here so that we can add it back to the
+ * file's list if the unlock range request fails on the
+ * server.
+ */
+ list_move(&li->llist, &tmp_llist);
+ if (++num == max_num) {
+ stored_rc = cifs_lockv(xid, tcon,
+ cfile->fid.netfid,
+ li->type, num, 0, buf);
+ if (stored_rc) {
+ /*
+ * We failed on the unlock range
+ * request - add all locks from the tmp
+ * list to the head of the file's list.
+ */
+ cifs_move_llist(&tmp_llist,
+ &cfile->llist->locks);
+ rc = stored_rc;
+ } else
+ /*
+ * The unlock range request succeeded -
+ * free the tmp list.
+ */
+ cifs_free_llist(&tmp_llist);
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
+ }
+ if (num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->fid.netfid,
+ types[i], num, 0, buf);
+ if (stored_rc) {
+ cifs_move_llist(&tmp_llist,
+ &cfile->llist->locks);
+ rc = stored_rc;
+ } else
+ cifs_free_llist(&tmp_llist);
+ }
+ }
+
+ up_write(&cinode->lock_sem);
+ kfree(buf);
+ return rc;
}
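
Note that cifs_unlock_range() only drops stored locks that are completely contained in the requested unlock range; partially overlapping locks are left alone. The containment test in isolation:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* A stored lock [l_off, l_off+l_len) is unlocked only if the request
 * [u_off, u_off+u_len) covers it entirely. */
static bool fully_contained(uint64_t u_off, uint64_t u_len,
			    uint64_t l_off, uint64_t l_len)
{
	return u_off <= l_off && u_off + u_len >= l_off + l_len;
}

int main(void)
{
	printf("%d\n", fully_contained(0, 100, 10, 20));	/* 1: covered */
	printf("%d\n", fully_contained(0, 100, 90, 20));	/* 0: sticks out past the range */
	return 0;
}
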
-ssize_t cifs_user_write(struct file *file, const char __user *write_data,
- size_t write_size, loff_t *poffset)
+static int
+cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
+ bool wait_flag, bool posix_lck, int lock, int unlock,
+ unsigned int xid)
{
- struct inode *inode = file->f_path.dentry->d_inode;
int rc = 0;
- unsigned int bytes_written = 0;
- unsigned int total_written;
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
- int xid;
- struct cifsFileInfo *open_file;
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ struct inode *inode = cfile->dentry->d_inode;
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ if (posix_lck) {
+ int posix_lock_type;
- /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
- *poffset, file->f_path.dentry->d_name.name); */
+ rc = cifs_posix_lock_set(file, flock);
+ if (!rc || rc < 0)
+ return rc;
- if (file->private_data == NULL)
- return -EBADF;
+ if (type & server->vals->shared_lock_type)
+ posix_lock_type = CIFS_RDLCK;
+ else
+ posix_lock_type = CIFS_WRLCK;
- open_file = file->private_data;
- pTcon = tlink_tcon(open_file->tlink);
+ if (unlock == 1)
+ posix_lock_type = CIFS_UNLCK;
- rc = generic_write_checks(file, poffset, &write_size, 0);
- if (rc)
- return rc;
+ rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid,
+ current->tgid, flock->fl_start, length,
+ NULL, posix_lock_type, wait_flag);
+ goto out;
+ }
- xid = GetXid();
+ if (lock) {
+ struct cifsLockInfo *lock;
- for (total_written = 0; write_size > total_written;
- total_written += bytes_written) {
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- if (file->private_data == NULL) {
- /* file has been closed on us */
- FreeXid(xid);
- /* if we have gotten here we have written some data
- and blocked, and the file has been freed on us while
- we blocked so return what we managed to write */
- return total_written;
- }
- if (open_file->invalidHandle) {
- /* we could deadlock if we called
- filemap_fdatawait from here so tell
- reopen_file not to flush data to server
- now */
- rc = cifs_reopen_file(open_file, false);
- if (rc != 0)
- break;
- }
+ lock = cifs_lock_init(flock->fl_start, length, type);
+ if (!lock)
+ return -ENOMEM;
- rc = CIFSSMBWrite(xid, pTcon,
- open_file->netfid,
- min_t(const int, cifs_sb->wsize,
- write_size - total_written),
- *poffset, &bytes_written,
- NULL, write_data + total_written, 0);
+ rc = cifs_lock_add_if(cfile, lock, wait_flag);
+ if (rc < 0) {
+ kfree(lock);
+ return rc;
}
- if (rc || (bytes_written == 0)) {
- if (total_written)
- break;
- else {
- FreeXid(xid);
- return rc;
- }
- } else {
- cifs_update_eof(cifsi, *poffset, bytes_written);
- *poffset += bytes_written;
+ if (!rc)
+ goto out;
+
+ /*
+ * A Windows 7 server can delay breaking a lease from read to None
+ * if we set a byte-range lock on a file - break it explicitly
+ * before sending the lock to the server to be sure the next
+ * read won't conflict with non-overlapped locks due to
+ * page reading.
+ */
+ if (!CIFS_CACHE_WRITE(CIFS_I(inode)) &&
+ CIFS_CACHE_READ(CIFS_I(inode))) {
+ cifs_zap_mapping(inode);
+ cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n",
+ inode);
+ CIFS_I(inode)->oplock = 0;
+ }
+
+ rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
+ type, 1, 0, wait_flag);
+ if (rc) {
+ kfree(lock);
+ return rc;
}
- }
- cifs_stats_bytes_written(pTcon, total_written);
+ cifs_lock_add(cfile, lock);
+ } else if (unlock)
+ rc = server->ops->mand_unlock_range(cfile, flock, xid);
-/* Do not update local mtime - server will set its actual value on write
- * inode->i_ctime = inode->i_mtime =
- * current_fs_time(inode->i_sb);*/
- if (total_written > 0) {
- spin_lock(&inode->i_lock);
- if (*poffset > inode->i_size)
- i_size_write(inode, *poffset);
- spin_unlock(&inode->i_lock);
+out:
+ if (flock->fl_flags & FL_POSIX)
+ posix_lock_file_wait(file, flock);
+ return rc;
+}
+
+int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
+{
+ int rc, xid;
+ int lock = 0, unlock = 0;
+ bool wait_flag = false;
+ bool posix_lck = false;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsInodeInfo *cinode;
+ struct cifsFileInfo *cfile;
+ __u16 netfid;
+ __u32 type;
+
+ rc = -EACCES;
+ xid = get_xid();
+
+ cifs_dbg(FYI, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld end: %lld\n",
+ cmd, flock->fl_flags, flock->fl_type,
+ flock->fl_start, flock->fl_end);
+
+ cfile = (struct cifsFileInfo *)file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+
+ cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag,
+ tcon->ses->server);
+
+ cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ netfid = cfile->fid.netfid;
+ cinode = CIFS_I(file_inode(file));
+
+ if (cap_unix(tcon->ses) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ posix_lck = true;
+ /*
+ * BB add code here to normalize offset and length to account for
+ * negative length which we can not accept over the wire.
+ */
+ if (IS_GETLK(cmd)) {
+ rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid);
+ free_xid(xid);
+ return rc;
}
- mark_inode_dirty_sync(inode);
- FreeXid(xid);
- return total_written;
+ if (!lock && !unlock) {
+ /*
+ * if no lock or unlock then nothing to do since we do not
+ * know what it is
+ */
+ free_xid(xid);
+ return -EOPNOTSUPP;
+ }
+
+ rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock,
+ xid);
+ free_xid(xid);
+ return rc;
}
-static ssize_t cifs_write(struct cifsFileInfo *open_file,
- const char *write_data, size_t write_size,
- loff_t *poffset)
+/*
+ * update the file size (if needed) after a write. Should be called with
+ * the inode->i_lock held
+ */
+void
+cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
+ unsigned int bytes_written)
+{
+ loff_t end_of_write = offset + bytes_written;
+
+ if (end_of_write > cifsi->server_eof)
+ cifsi->server_eof = end_of_write;
+}
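
cifs_update_eof() only ever advances the cached server EOF; a shorter or earlier write never moves it backwards. The same rule with plain types:

#include <stdio.h>

/* Same monotonic-advance rule as cifs_update_eof(), outside the kernel types. */
static void update_eof(long long *server_eof, long long offset, unsigned int written)
{
	long long end = offset + written;

	if (end > *server_eof)
		*server_eof = end;
}

int main(void)
{
	long long eof = 0;

	update_eof(&eof, 0, 4096);	/* eof -> 4096 */
	update_eof(&eof, 1024, 512);	/* earlier, shorter write: eof stays 4096 */
	printf("%lld\n", eof);
	return 0;
}
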
+
+static ssize_t
+cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
+ size_t write_size, loff_t *offset)
{
int rc = 0;
unsigned int bytes_written = 0;
unsigned int total_written;
struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
- int xid;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ unsigned int xid;
struct dentry *dentry = open_file->dentry;
struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
+ struct cifs_io_parms io_parms;
cifs_sb = CIFS_SB(dentry->d_sb);
- cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
- *poffset, dentry->d_name.name);
+ cifs_dbg(FYI, "write %zd bytes to offset %lld of %s\n",
+ write_size, *offset, dentry->d_name.name);
+
+ tcon = tlink_tcon(open_file->tlink);
+ server = tcon->ses->server;
- pTcon = tlink_tcon(open_file->tlink);
+ if (!server->ops->sync_write)
+ return -ENOSYS;
- xid = GetXid();
+ xid = get_xid();
for (total_written = 0; write_size > total_written;
total_written += bytes_written) {
rc = -EAGAIN;
while (rc == -EAGAIN) {
+ struct kvec iov[2];
+ unsigned int len;
+
if (open_file->invalidHandle) {
/* we could deadlock if we called
filemap_fdatawait from here so tell
@@ -979,55 +1669,44 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
if (rc != 0)
break;
}
- if (experimEnabled || (pTcon->ses->server &&
- ((pTcon->ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- == 0))) {
- struct kvec iov[2];
- unsigned int len;
-
- len = min((size_t)cifs_sb->wsize,
- write_size - total_written);
- /* iov[0] is reserved for smb header */
- iov[1].iov_base = (char *)write_data +
- total_written;
- iov[1].iov_len = len;
- rc = CIFSSMBWrite2(xid, pTcon,
- open_file->netfid, len,
- *poffset, &bytes_written,
- iov, 1, 0);
- } else
- rc = CIFSSMBWrite(xid, pTcon,
- open_file->netfid,
- min_t(const int, cifs_sb->wsize,
- write_size - total_written),
- *poffset, &bytes_written,
- write_data + total_written,
- NULL, 0);
+
+ len = min((size_t)cifs_sb->wsize,
+ write_size - total_written);
+ /* iov[0] is reserved for smb header */
+ iov[1].iov_base = (char *)write_data + total_written;
+ iov[1].iov_len = len;
+ io_parms.pid = pid;
+ io_parms.tcon = tcon;
+ io_parms.offset = *offset;
+ io_parms.length = len;
+ rc = server->ops->sync_write(xid, open_file, &io_parms,
+ &bytes_written, iov, 1);
}
if (rc || (bytes_written == 0)) {
if (total_written)
break;
else {
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
} else {
- cifs_update_eof(cifsi, *poffset, bytes_written);
- *poffset += bytes_written;
+ spin_lock(&dentry->d_inode->i_lock);
+ cifs_update_eof(cifsi, *offset, bytes_written);
+ spin_unlock(&dentry->d_inode->i_lock);
+ *offset += bytes_written;
}
}
- cifs_stats_bytes_written(pTcon, total_written);
+ cifs_stats_bytes_written(tcon, total_written);
if (total_written > 0) {
spin_lock(&dentry->d_inode->i_lock);
- if (*poffset > dentry->d_inode->i_size)
- i_size_write(dentry->d_inode, *poffset);
+ if (*offset > dentry->d_inode->i_size)
+ i_size_write(dentry->d_inode, *offset);
spin_unlock(&dentry->d_inode->i_lock);
}
mark_inode_dirty_sync(dentry->d_inode);
- FreeXid(xid);
+ free_xid(xid);
return total_written;
}
@@ -1046,13 +1725,13 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
are always at the end of the list but since the first entry might
have a close pending, we go through the whole list */
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
- if (fsuid_only && open_file->uid != current_fsuid())
+ if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
if (!open_file->invalidHandle) {
/* found a good file */
/* lock it so it will not be closed on us */
- cifsFileInfo_get(open_file);
+ cifsFileInfo_get_locked(open_file);
spin_unlock(&cifs_file_list_lock);
return open_file;
} /* else might as well continue, and look for
@@ -1068,17 +1747,18 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
bool fsuid_only)
{
- struct cifsFileInfo *open_file;
+ struct cifsFileInfo *open_file, *inv_file = NULL;
struct cifs_sb_info *cifs_sb;
bool any_available = false;
int rc;
+ unsigned int refind = 0;
/* Having a null inode here (because mapping->host was set to zero by
the VFS or MM) should not happen but we had reports of an oops (due to
it being zero) during stress testcases so we need to check for it */
if (cifs_inode == NULL) {
- cERROR(1, "Null inode passed to cifs_writeable_file");
+ cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n");
dump_stack();
return NULL;
}
@@ -1091,40 +1771,25 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
spin_lock(&cifs_file_list_lock);
refind_writable:
+ if (refind > MAX_REOPEN_ATT) {
+ spin_unlock(&cifs_file_list_lock);
+ return NULL;
+ }
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (!any_available && open_file->pid != current->tgid)
continue;
- if (fsuid_only && open_file->uid != current_fsuid())
+ if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
- cifsFileInfo_get(open_file);
-
if (!open_file->invalidHandle) {
/* found a good writable file */
+ cifsFileInfo_get_locked(open_file);
spin_unlock(&cifs_file_list_lock);
return open_file;
+ } else {
+ if (!inv_file)
+ inv_file = open_file;
}
-
- spin_unlock(&cifs_file_list_lock);
-
- /* Had to unlock since following call can block */
- rc = cifs_reopen_file(open_file, false);
- if (!rc)
- return open_file;
-
- /* if it fails, try another handle if possible */
- cFYI(1, "wp failed on reopen file");
- cifsFileInfo_put(open_file);
-
- spin_lock(&cifs_file_list_lock);
-
- /* else we simply continue to the next entry. Thus
- we do not loop on reopen errors. If we
- can not reopen the file, for example if we
- reconnected to a server with another client
- racing to delete or lock the file we would not
- make progress if we restarted before the beginning
- of the loop here. */
}
}
/* couldn't find useable FH with same pid, try any available */
@@ -1132,7 +1797,30 @@ refind_writable:
any_available = true;
goto refind_writable;
}
+
+ if (inv_file) {
+ any_available = false;
+ cifsFileInfo_get_locked(inv_file);
+ }
+
spin_unlock(&cifs_file_list_lock);
+
+ if (inv_file) {
+ rc = cifs_reopen_file(inv_file, false);
+ if (!rc)
+ return inv_file;
+ else {
+ spin_lock(&cifs_file_list_lock);
+ list_move_tail(&inv_file->flist,
+ &cifs_inode->openFileList);
+ spin_unlock(&cifs_file_list_lock);
+ cifsFileInfo_put(inv_file);
+ spin_lock(&cifs_file_list_lock);
+ ++refind;
+ goto refind_writable;
+ }
+ }
+
return NULL;
}
@@ -1172,8 +1860,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
open_file = find_writable_file(CIFS_I(mapping->host), false);
if (open_file) {
- bytes_written = cifs_write(open_file, write_data,
- to - from, &offset);
+ bytes_written = cifs_write(open_file, open_file->pid,
+ write_data, to - from, &offset);
cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1182,7 +1870,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
else if (bytes_written < 0)
rc = bytes_written;
} else {
- cFYI(1, "No writeable filehandles for inode");
+ cifs_dbg(FYI, "No writeable filehandles for inode\n");
rc = -EIO;
}
@@ -1193,64 +1881,21 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- unsigned int bytes_to_write;
- unsigned int bytes_written;
- struct cifs_sb_info *cifs_sb;
- int done = 0;
- pgoff_t end;
- pgoff_t index;
- int range_whole = 0;
- struct kvec *iov;
- int len;
- int n_iov = 0;
- pgoff_t next;
- int nr_pages;
- __u64 offset = 0;
- struct cifsFileInfo *open_file;
- struct cifsTconInfo *tcon;
- struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+ bool done = false, scanned = false, range_whole = false;
+ pgoff_t end, index;
+ struct cifs_writedata *wdata;
+ struct TCP_Server_Info *server;
struct page *page;
- struct pagevec pvec;
int rc = 0;
- int scanned = 0;
- int xid;
-
- cifs_sb = CIFS_SB(mapping->host->i_sb);
/*
- * If wsize is smaller that the page cache size, default to writing
+ * If wsize is smaller than the page cache size, default to writing
* one page at a time via cifs_writepage
*/
if (cifs_sb->wsize < PAGE_CACHE_SIZE)
return generic_writepages(mapping, wbc);
- iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
- if (iov == NULL)
- return generic_writepages(mapping, wbc);
-
- /*
- * if there's no open file, then this is likely to fail too,
- * but it'll at least handle the return. Maybe it should be
- * a BUG() instead?
- */
- open_file = find_writable_file(CIFS_I(mapping->host), false);
- if (!open_file) {
- kfree(iov);
- return generic_writepages(mapping, wbc);
- }
-
- tcon = tlink_tcon(open_file->tlink);
- if (!experimEnabled && tcon->ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
- cifsFileInfo_put(open_file);
- kfree(iov);
- return generic_writepages(mapping, wbc);
- }
- cifsFileInfo_put(open_file);
-
- xid = GetXid();
-
- pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
@@ -1258,24 +1903,50 @@ static int cifs_writepages(struct address_space *mapping,
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
- scanned = 1;
+ range_whole = true;
+ scanned = true;
}
retry:
- while (!done && (index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1))) {
- int first;
- unsigned int i;
+ while (!done && index <= end) {
+ unsigned int i, nr_pages, found_pages;
+ pgoff_t next = 0, tofind;
+ struct page **pages;
- first = -1;
- next = 0;
- n_iov = 0;
- bytes_to_write = 0;
+ tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1,
+ end - index) + 1;
- for (i = 0; i < nr_pages; i++) {
- page = pvec.pages[i];
+ wdata = cifs_writedata_alloc((unsigned int)tofind,
+ cifs_writev_complete);
+ if (!wdata) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ /*
+ * find_get_pages_tag seems to return a max of 256 on each
+ * iteration, so we must call it several times in order to
+ * fill the array or the wsize is effectively limited to
+ * 256 * PAGE_CACHE_SIZE.
+ */
+ found_pages = 0;
+ pages = wdata->pages;
+ do {
+ nr_pages = find_get_pages_tag(mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ tofind, pages);
+ found_pages += nr_pages;
+ tofind -= nr_pages;
+ pages += nr_pages;
+ } while (nr_pages && tofind && index <= end);
+
+ if (found_pages == 0) {
+ kref_put(&wdata->refcount, cifs_writedata_release);
+ break;
+ }
+
+ nr_pages = 0;
+ for (i = 0; i < found_pages; i++) {
+ page = wdata->pages[i];
/*
* At this point we hold neither mapping->tree_lock nor
* lock on the page itself: the page may be truncated or
@@ -1284,7 +1955,7 @@ retry:
* mapping
*/
- if (first < 0)
+ if (nr_pages == 0)
lock_page(page);
else if (!trylock_page(page))
break;
@@ -1295,7 +1966,7 @@ retry:
}
if (!wbc->range_cyclic && page->index > end) {
- done = 1;
+ done = true;
unlock_page(page);
break;
}
@@ -1321,133 +1992,114 @@ retry:
*/
set_page_writeback(page);
- if (page_offset(page) >= mapping->host->i_size) {
- done = 1;
+ if (page_offset(page) >= i_size_read(mapping->host)) {
+ done = true;
unlock_page(page);
end_page_writeback(page);
break;
}
- /*
- * BB can we get rid of this? pages are held by pvec
- */
- page_cache_get(page);
-
- len = min(mapping->host->i_size - page_offset(page),
- (loff_t)PAGE_CACHE_SIZE);
-
- /* reserve iov[0] for the smb header */
- n_iov++;
- iov[n_iov].iov_base = kmap(page);
- iov[n_iov].iov_len = len;
- bytes_to_write += len;
-
- if (first < 0) {
- first = i;
- offset = page_offset(page);
- }
+ wdata->pages[i] = page;
next = page->index + 1;
- if (bytes_to_write + PAGE_CACHE_SIZE > cifs_sb->wsize)
- break;
+ ++nr_pages;
}
- if (n_iov) {
-retry_write:
- open_file = find_writable_file(CIFS_I(mapping->host),
- false);
- if (!open_file) {
- cERROR(1, "No writable handles for inode");
- rc = -EBADF;
- } else {
- rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
- bytes_to_write, offset,
- &bytes_written, iov, n_iov,
- 0);
- cifsFileInfo_put(open_file);
- }
- cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
+ /* reset index to refind any pages skipped */
+ if (nr_pages == 0)
+ index = wdata->pages[0]->index + 1;
- /*
- * For now, treat a short write as if nothing got
- * written. A zero length write however indicates
- * ENOSPC or EFBIG. We have no way to know which
- * though, so call it ENOSPC for now. EFBIG would
- * get translated to AS_EIO anyway.
- *
- * FIXME: make it take into account the data that did
- * get written
- */
- if (rc == 0) {
- if (bytes_written == 0)
- rc = -ENOSPC;
- else if (bytes_written < bytes_to_write)
- rc = -EAGAIN;
- }
+ /* put any pages we aren't going to use */
+ for (i = nr_pages; i < found_pages; i++) {
+ page_cache_release(wdata->pages[i]);
+ wdata->pages[i] = NULL;
+ }
- /* retry on data-integrity flush */
- if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
- goto retry_write;
+ /* nothing to write? */
+ if (nr_pages == 0) {
+ kref_put(&wdata->refcount, cifs_writedata_release);
+ continue;
+ }
- /* fix the stats and EOF */
- if (bytes_written > 0) {
- cifs_stats_bytes_written(tcon, bytes_written);
- cifs_update_eof(cifsi, offset, bytes_written);
- }
+ wdata->sync_mode = wbc->sync_mode;
+ wdata->nr_pages = nr_pages;
+ wdata->offset = page_offset(wdata->pages[0]);
+ wdata->pagesz = PAGE_CACHE_SIZE;
+ wdata->tailsz =
+ min(i_size_read(mapping->host) -
+ page_offset(wdata->pages[nr_pages - 1]),
+ (loff_t)PAGE_CACHE_SIZE);
+ wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
+ wdata->tailsz;
- for (i = 0; i < n_iov; i++) {
- page = pvec.pages[first + i];
- /* on retryable write error, redirty page */
+ do {
+ if (wdata->cfile != NULL)
+ cifsFileInfo_put(wdata->cfile);
+ wdata->cfile = find_writable_file(CIFS_I(mapping->host),
+ false);
+ if (!wdata->cfile) {
+ cifs_dbg(VFS, "No writable handles for inode\n");
+ rc = -EBADF;
+ break;
+ }
+ wdata->pid = wdata->cfile->pid;
+ server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ rc = server->ops->async_writev(wdata,
+ cifs_writedata_release);
+ } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
+
+ for (i = 0; i < nr_pages; ++i)
+ unlock_page(wdata->pages[i]);
+
+ /* send failure -- clean up the mess */
+ if (rc != 0) {
+ for (i = 0; i < nr_pages; ++i) {
if (rc == -EAGAIN)
- redirty_page_for_writepage(wbc, page);
- else if (rc != 0)
- SetPageError(page);
- kunmap(page);
- unlock_page(page);
- end_page_writeback(page);
- page_cache_release(page);
+ redirty_page_for_writepage(wbc,
+ wdata->pages[i]);
+ else
+ SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ page_cache_release(wdata->pages[i]);
}
-
if (rc != -EAGAIN)
mapping_set_error(mapping, rc);
- else
- rc = 0;
+ }
+ kref_put(&wdata->refcount, cifs_writedata_release);
- if ((wbc->nr_to_write -= n_iov) <= 0)
- done = 1;
- index = next;
- } else
- /* Need to re-find the pages we skipped */
- index = pvec.pages[0]->index + 1;
+ wbc->nr_to_write -= nr_pages;
+ if (wbc->nr_to_write <= 0)
+ done = true;
- pagevec_release(&pvec);
+ index = next;
}
+
if (!scanned && !done) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
*/
- scanned = 1;
+ scanned = true;
index = 0;
goto retry;
}
+
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
- FreeXid(xid);
- kfree(iov);
return rc;
}
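
The rewritten cifs_writepages() sizes each request from wsize and gathers up to that many dirty pages per pass, calling the page-cache lookup repeatedly because one call may return fewer pages than requested. A userspace sketch of the accumulation loop (fake lookup, illustrative sizes, end-of-range cap ignored):

#include <stdio.h>

#define PAGE_SZ		4096u
#define LOOKUP_BATCH	256u	/* find_get_pages_tag returns at most this many per call */

/* Pretend page-cache lookup: returns how many dirty pages it handed back. */
static unsigned int fake_lookup(unsigned int want)
{
	return want > LOOKUP_BATCH ? LOOKUP_BATCH : want;
}

int main(void)
{
	unsigned int wsize = 2u * 1024u * 1024u;	/* example 2M write size */
	unsigned int tofind = wsize / PAGE_SZ;		/* pages per request */
	unsigned int found = 0, got;

	do {						/* keep calling until the array is full */
		got = fake_lookup(tofind);
		found += got;
		tofind -= got;
	} while (got && tofind);

	printf("gathered %u pages for one wdata\n", found);
	return 0;
}
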
-static int cifs_writepage(struct page *page, struct writeback_control *wbc)
+static int
+cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
{
- int rc = -EFAULT;
- int xid;
+ int rc;
+ unsigned int xid;
- xid = GetXid();
+ xid = get_xid();
/* BB add check for wbc flags */
page_cache_get(page);
if (!PageUptodate(page))
- cFYI(1, "ppw - page not up to date");
+ cifs_dbg(FYI, "ppw - page not up to date\n");
/*
* Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1460,12 +2112,26 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
* to fail to update with the state of the page correctly.
*/
set_page_writeback(page);
+retry_write:
rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE);
- SetPageUptodate(page); /* BB add check for error and Clearuptodate? */
- unlock_page(page);
+ if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
+ goto retry_write;
+ else if (rc == -EAGAIN)
+ redirty_page_for_writepage(wbc, page);
+ else if (rc != 0)
+ SetPageError(page);
+ else
+ SetPageUptodate(page);
end_page_writeback(page);
page_cache_release(page);
- FreeXid(xid);
+ free_xid(xid);
+ return rc;
+}
+
+static int cifs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int rc = cifs_writepage_locked(page, wbc);
+ unlock_page(page);
return rc;
}
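
cifs_writepage_locked() retries an -EAGAIN only for data-integrity writeback (WB_SYNC_ALL); otherwise the page is simply redirtied for a later pass. The retry policy in isolation (hypothetical write helper that fails once, then succeeds):

#include <errno.h>
#include <stdio.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

/* Hypothetical stand-in for the partial-page write. */
static int try_write(void)
{
	static int attempts;

	return attempts++ ? 0 : -EAGAIN;
}

static int writepage(enum sync_mode mode)
{
	int rc;

retry_write:
	rc = try_write();
	if (rc == -EAGAIN && mode == WB_SYNC_ALL)
		goto retry_write;	/* data-integrity sync: keep trying */
	else if (rc == -EAGAIN)
		return 1;		/* would just redirty the page instead */
	return rc;
}

int main(void)
{
	printf("%d\n", writepage(WB_SYNC_ALL));	/* 0: retried once, then succeeded */
	return 0;
}
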
@@ -1475,8 +2141,16 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
{
int rc;
struct inode *inode = mapping->host;
+ struct cifsFileInfo *cfile = file->private_data;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+ __u32 pid;
- cFYI(1, "write_end for page %p from pos %lld with %d bytes",
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ pid = cfile->pid;
+ else
+ pid = current->tgid;
+
+ cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n",
page, pos, copied);
if (PageChecked(page)) {
@@ -1489,21 +2163,20 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
if (!PageUptodate(page)) {
char *page_data;
unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
- int xid;
+ unsigned int xid;
- xid = GetXid();
+ xid = get_xid();
/* this is probably better than directly calling
partialpage_write since in this function the file handle is
known which we might as well leverage */
/* BB check if anything else missing out of ppw
such as updating last write time */
page_data = kmap(page);
- rc = cifs_write(file->private_data, page_data + offset,
- copied, &pos);
+ rc = cifs_write(cfile, pid, page_data + offset, copied, &pos);
/* if (rc < 0) should we set writebehind rc? */
kunmap(page);
- FreeXid(xid);
+ free_xid(xid);
} else {
rc = copied;
pos += copied;
@@ -1523,93 +2196,96 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
return rc;
}
-int cifs_strict_fsync(struct file *file, int datasync)
+int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
{
- int xid;
+ unsigned int xid;
int rc = 0;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
struct cifsFileInfo *smbfile = file->private_data;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- xid = GetXid();
+ rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (rc)
+ return rc;
+ mutex_lock(&inode->i_mutex);
+
+ xid = get_xid();
- cFYI(1, "Sync file - name: %s datasync: 0x%x",
- file->f_path.dentry->d_name.name, datasync);
+ cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
+ file->f_path.dentry->d_name.name, datasync);
- if (!CIFS_I(inode)->clientCanCacheRead)
- cifs_invalidate_mapping(inode);
+ if (!CIFS_CACHE_READ(CIFS_I(inode))) {
+ rc = cifs_zap_mapping(inode);
+ if (rc) {
+ cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc);
+ rc = 0; /* don't care about it in fsync */
+ }
+ }
tcon = tlink_tcon(smbfile->tlink);
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
- rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
+ server = tcon->ses->server;
+ if (server->ops->flush)
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
+ else
+ rc = -ENOSYS;
+ }
- FreeXid(xid);
+ free_xid(xid);
+ mutex_unlock(&inode->i_mutex);
return rc;
}
-int cifs_fsync(struct file *file, int datasync)
+int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- int xid;
+ unsigned int xid;
int rc = 0;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
struct cifsFileInfo *smbfile = file->private_data;
struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ struct inode *inode = file->f_mapping->host;
+
+ rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (rc)
+ return rc;
+ mutex_lock(&inode->i_mutex);
- xid = GetXid();
+ xid = get_xid();
- cFYI(1, "Sync file - name: %s datasync: 0x%x",
- file->f_path.dentry->d_name.name, datasync);
+ cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
+ file->f_path.dentry->d_name.name, datasync);
tcon = tlink_tcon(smbfile->tlink);
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
- rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
+ server = tcon->ses->server;
+ if (server->ops->flush)
+ rc = server->ops->flush(xid, tcon, &smbfile->fid);
+ else
+ rc = -ENOSYS;
+ }
- FreeXid(xid);
+ free_xid(xid);
+ mutex_unlock(&inode->i_mutex);
return rc;
}
-/* static void cifs_sync_page(struct page *page)
-{
- struct address_space *mapping;
- struct inode *inode;
- unsigned long index = page->index;
- unsigned int rpages = 0;
- int rc = 0;
-
- cFYI(1, "sync page %p", page);
- mapping = page->mapping;
- if (!mapping)
- return 0;
- inode = mapping->host;
- if (!inode)
- return; */
-
-/* fill in rpages then
- result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
-
-/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
-
-#if 0
- if (rc < 0)
- return rc;
- return 0;
-#endif
-} */
-
/*
* As file closes, flush all cached write data for this inode checking
* for write behind errors.
*/
int cifs_flush(struct file *file, fl_owner_t id)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
int rc = 0;
if (file->f_mode & FMODE_WRITE)
rc = filemap_write_and_wait(inode->i_mapping);
- cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
+ cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc);
return rc;
}
@@ -1621,7 +2297,7 @@ cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
unsigned long i;
for (i = 0; i < num_pages; i++) {
- pages[i] = alloc_page(__GFP_HIGHMEM);
+ pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
if (!pages[i]) {
/*
* save number of pages we have already allocated and
@@ -1629,15 +2305,14 @@ cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
*/
num_pages = i;
rc = -ENOMEM;
- goto error;
+ break;
}
}
- return rc;
-
-error:
- for (i = 0; i < num_pages; i++)
- put_page(pages[i]);
+ if (rc) {
+ for (i = 0; i < num_pages; i++)
+ put_page(pages[i]);
+ }
return rc;
}
@@ -1648,9 +2323,7 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
size_t clen;
clen = min_t(const size_t, len, wsize);
- num_pages = clen / PAGE_CACHE_SIZE;
- if (clen % PAGE_CACHE_SIZE)
- num_pages++;
+ num_pages = DIV_ROUND_UP(clen, PAGE_SIZE);
if (cur_len)
*cur_len = clen;
@@ -1658,127 +2331,226 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
return num_pages;
}
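
get_numpages() now uses DIV_ROUND_UP, which computes the same page count as the old divide-plus-remainder code. Equivalent arithmetic outside the kernel:

#include <stdio.h>

#define PAGE_SZ 4096u
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	size_t clen = 10000;	/* example request length in bytes */

	/* old style: divide, then add one if there is a remainder */
	size_t pages_old = clen / PAGE_SZ + (clen % PAGE_SZ ? 1 : 0);
	/* new style */
	size_t pages_new = DIV_ROUND_UP(clen, PAGE_SZ);

	printf("%zu %zu\n", pages_old, pages_new);	/* both print 3 */
	return 0;
}
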
+static void
+cifs_uncached_writedata_release(struct kref *refcount)
+{
+ int i;
+ struct cifs_writedata *wdata = container_of(refcount,
+ struct cifs_writedata, refcount);
+
+ for (i = 0; i < wdata->nr_pages; i++)
+ put_page(wdata->pages[i]);
+ cifs_writedata_release(refcount);
+}
+
+static void
+cifs_uncached_writev_complete(struct work_struct *work)
+{
+ struct cifs_writedata *wdata = container_of(work,
+ struct cifs_writedata, work);
+ struct inode *inode = wdata->cfile->dentry->d_inode;
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+ spin_lock(&inode->i_lock);
+ cifs_update_eof(cifsi, wdata->offset, wdata->bytes);
+ if (cifsi->server_eof > inode->i_size)
+ i_size_write(inode, cifsi->server_eof);
+ spin_unlock(&inode->i_lock);
+
+ complete(&wdata->done);
+
+ kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+}
+
+/* attempt to send write to server, retry on any -EAGAIN errors */
+static int
+cifs_uncached_retry_writev(struct cifs_writedata *wdata)
+{
+ int rc;
+ struct TCP_Server_Info *server;
+
+ server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+
+ do {
+ if (wdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(wdata->cfile, false);
+ if (rc != 0)
+ continue;
+ }
+ rc = server->ops->async_writev(wdata,
+ cifs_uncached_writedata_release);
+ } while (rc == -EAGAIN);
+
+ return rc;
+}
+
static ssize_t
-cifs_iovec_write(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *poffset)
+cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
{
- unsigned int written;
- unsigned long num_pages, npages, i;
- size_t copied, len, cur_len;
+ unsigned long nr_pages, i;
+ size_t bytes, copied, len, cur_len;
ssize_t total_written = 0;
- struct kvec *to_send;
- struct page **pages;
- struct iov_iter it;
- struct inode *inode;
+ loff_t offset;
struct cifsFileInfo *open_file;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
- int xid, rc;
-
- len = iov_length(iov, nr_segs);
- if (!len)
- return 0;
+ struct cifs_writedata *wdata, *tmp;
+ struct list_head wdata_list;
+ int rc;
+ pid_t pid;
+ len = iov_iter_count(from);
rc = generic_write_checks(file, poffset, &len, 0);
if (rc)
return rc;
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+ if (!len)
+ return 0;
- pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
+ iov_iter_truncate(from, len);
- to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
- if (!to_send) {
- kfree(pages);
- return -ENOMEM;
- }
+ INIT_LIST_HEAD(&wdata_list);
+ cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ open_file = file->private_data;
+ tcon = tlink_tcon(open_file->tlink);
- rc = cifs_write_allocate_pages(pages, num_pages);
- if (rc) {
- kfree(pages);
- kfree(to_send);
- return rc;
- }
+ if (!tcon->ses->server->ops->async_writev)
+ return -ENOSYS;
- xid = GetXid();
- open_file = file->private_data;
- pTcon = tlink_tcon(open_file->tlink);
- inode = file->f_path.dentry->d_inode;
+ offset = *poffset;
- iov_iter_init(&it, iov, nr_segs, len, 0);
- npages = num_pages;
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ pid = open_file->pid;
+ else
+ pid = current->tgid;
do {
- size_t save_len = cur_len;
- for (i = 0; i < npages; i++) {
- copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
- copied = iov_iter_copy_from_user(pages[i], &it, 0,
- copied);
- cur_len -= copied;
- iov_iter_advance(&it, copied);
- to_send[i+1].iov_base = kmap(pages[i]);
- to_send[i+1].iov_len = copied;
+ size_t save_len;
+
+ nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+ wdata = cifs_writedata_alloc(nr_pages,
+ cifs_uncached_writev_complete);
+ if (!wdata) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
+ if (rc) {
+ kfree(wdata);
+ break;
}
+ save_len = cur_len;
+ for (i = 0; i < nr_pages; i++) {
+ bytes = min_t(size_t, cur_len, PAGE_SIZE);
+ copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
+ from);
+ cur_len -= copied;
+ /*
+ * If we didn't copy as much as we expected, then that
+ * may mean we trod into an unmapped area. Stop copying
+ * at that point. On the next pass through the big
+ * loop, we'll likely end up getting a zero-length
+ * write and bailing out of it.
+ */
+ if (copied < bytes)
+ break;
+ }
cur_len = save_len - cur_len;
- do {
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, false);
- if (rc != 0)
- break;
- }
- rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
- cur_len, *poffset, &written,
- to_send, npages, 0);
- } while (rc == -EAGAIN);
-
- for (i = 0; i < npages; i++)
- kunmap(pages[i]);
-
- if (written) {
- len -= written;
- total_written += written;
- cifs_update_eof(CIFS_I(inode), *poffset, written);
- *poffset += written;
- } else if (rc < 0) {
- if (!total_written)
- total_written = rc;
+ /*
+ * If we have no data to send, then that probably means that
+ * the copy above failed altogether. That's most likely because
+ * the address in the iovec was bogus. Set the rc to -EFAULT,
+ * free anything we allocated and bail out.
+ */
+ if (!cur_len) {
+ for (i = 0; i < nr_pages; i++)
+ put_page(wdata->pages[i]);
+ kfree(wdata);
+ rc = -EFAULT;
break;
}
- /* get length and number of kvecs of the next write */
- npages = get_numpages(cifs_sb->wsize, len, &cur_len);
+ /*
+ * i + 1 now represents the number of pages we actually used in
+ * the copy phase above. Bring nr_pages down to that, and free
+ * any pages that we didn't use.
+ */
+ for ( ; nr_pages > i + 1; nr_pages--)
+ put_page(wdata->pages[nr_pages - 1]);
+
+ wdata->sync_mode = WB_SYNC_ALL;
+ wdata->nr_pages = nr_pages;
+ wdata->offset = (__u64)offset;
+ wdata->cfile = cifsFileInfo_get(open_file);
+ wdata->pid = pid;
+ wdata->bytes = cur_len;
+ wdata->pagesz = PAGE_SIZE;
+ wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
+ rc = cifs_uncached_retry_writev(wdata);
+ if (rc) {
+ kref_put(&wdata->refcount,
+ cifs_uncached_writedata_release);
+ break;
+ }
+
+ list_add_tail(&wdata->list, &wdata_list);
+ offset += cur_len;
+ len -= cur_len;
} while (len > 0);
- if (total_written > 0) {
- spin_lock(&inode->i_lock);
- if (*poffset > inode->i_size)
- i_size_write(inode, *poffset);
- spin_unlock(&inode->i_lock);
+ /*
+ * If at least one write was successfully sent, then discard any rc
+ * value from the later writes. If the other write succeeds, then
+ * we'll end up returning whatever was written. If it fails, then
+ * we'll get a new rc value from that.
+ */
+ if (!list_empty(&wdata_list))
+ rc = 0;
+
+ /*
+ * Wait for and collect replies for any successful sends in order of
+ * increasing offset. Once an error is hit or we get a fatal signal
+ * while waiting, then return without waiting for any more replies.
+ */
+restart_loop:
+ list_for_each_entry_safe(wdata, tmp, &wdata_list, list) {
+ if (!rc) {
+ /* FIXME: freezable too? */
+ rc = wait_for_completion_killable(&wdata->done);
+ if (rc)
+ rc = -EINTR;
+ else if (wdata->result)
+ rc = wdata->result;
+ else
+ total_written += wdata->bytes;
+
+ /* resend call if it's a retryable error */
+ if (rc == -EAGAIN) {
+ rc = cifs_uncached_retry_writev(wdata);
+ goto restart_loop;
+ }
+ }
+ list_del_init(&wdata->list);
+ kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
- cifs_stats_bytes_written(pTcon, total_written);
- mark_inode_dirty_sync(inode);
+ if (total_written > 0)
+ *poffset += total_written;
- for (i = 0; i < num_pages; i++)
- put_page(pages[i]);
- kfree(to_send);
- kfree(pages);
- FreeXid(xid);
- return total_written;
+ cifs_stats_bytes_written(tcon, total_written);
+ return total_written ? total_written : (ssize_t)rc;
}
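
The per-request sizing in the loop above reduces to a little arithmetic: clamp the remaining length to the negotiated write size, round up to whole pages, and record how much of the final page is actually used (tailsz). A standalone sketch of that calculation follows; the wsize/len values are made up and a 4 KiB page is assumed, so treat it as an illustration rather than the helper the patch calls (get_numpages() is not shown here).

/*
 * Sketch of the request sizing used by the uncached write loop.
 * wsize, len and PAGE_SIZE are example values, not taken from a live mount.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long wsize = 65536;	/* assumed negotiated write size */
	unsigned long len = 150000;	/* bytes still to send */

	while (len > 0) {
		unsigned long cur_len = len < wsize ? len : wsize;
		unsigned long nr_pages = (cur_len + PAGE_SIZE - 1) / PAGE_SIZE;
		unsigned long tailsz = cur_len - (nr_pages - 1) * PAGE_SIZE;

		printf("cur_len=%lu nr_pages=%lu tailsz=%lu\n",
		       cur_len, nr_pages, tailsz);
		len -= cur_len;
	}
	return 0;
}

Running it shows three requests for a 150000-byte write with a 64 KiB wsize: two full 16-page requests and a final 5-page request whose last page carries only 2544 bytes, which is exactly what wdata->tailsz records.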
-static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t written;
struct inode *inode;
+ loff_t pos = iocb->ki_pos;
- inode = iocb->ki_filp->f_path.dentry->d_inode;
+ inode = file_inode(iocb->ki_filp);
/*
* BB - optimize the way when signing is disabled. We can drop this
@@ -1786,146 +2558,400 @@ static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
* write request.
*/
- written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
+ written = cifs_iovec_write(iocb->ki_filp, from, &pos);
if (written > 0) {
- CIFS_I(inode)->invalid_mapping = true;
+ set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
iocb->ki_pos = pos;
}
return written;
}
-ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t
+cifs_writev(struct kiocb *iocb, struct iov_iter *from)
{
- struct inode *inode;
+ struct file *file = iocb->ki_filp;
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
+ struct inode *inode = file->f_mapping->host;
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+ ssize_t rc = -EACCES;
+ loff_t lock_pos = iocb->ki_pos;
- inode = iocb->ki_filp->f_path.dentry->d_inode;
+ /*
+ * We need to hold the sem to be sure nobody modifies lock list
+ * with a brlock that prevents writing.
+ */
+ down_read(&cinode->lock_sem);
+ mutex_lock(&inode->i_mutex);
+ if (file->f_flags & O_APPEND)
+ lock_pos = i_size_read(inode);
+ if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from),
+ server->vals->exclusive_lock_type, NULL,
+ CIFS_WRITE_OP)) {
+ rc = __generic_file_write_iter(iocb, from);
+ mutex_unlock(&inode->i_mutex);
+
+ if (rc > 0) {
+ ssize_t err;
+
+ err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+ if (err < 0)
+ rc = err;
+ }
+ } else {
+ mutex_unlock(&inode->i_mutex);
+ }
+ up_read(&cinode->lock_sem);
+ return rc;
+}
- if (CIFS_I(inode)->clientCanCacheAll)
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
+ssize_t
+cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *)
+ iocb->ki_filp->private_data;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ ssize_t written;
+
+ written = cifs_get_writer(cinode);
+ if (written)
+ return written;
+ if (CIFS_CACHE_WRITE(cinode)) {
+ if (cap_unix(tcon->ses) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
+ && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+ written = generic_file_write_iter(iocb, from);
+ goto out;
+ }
+ written = cifs_writev(iocb, from);
+ goto out;
+ }
/*
- * In strict cache mode we need to write the data to the server exactly
- * from the pos to pos+len-1 rather than flush all affected pages
- * because it may cause a error with mandatory locks on these pages but
- * not on the region from pos to ppos+len-1.
+ * For non-oplocked files in strict cache mode we need to write the data
+ * to the server exactly from the pos to pos+len-1 rather than flush all
+	 * affected pages because it may cause an error with mandatory locks on
+	 * these pages but not on the region from pos to pos+len-1.
*/
+ written = cifs_user_writev(iocb, from);
+ if (written > 0 && CIFS_CACHE_READ(cinode)) {
+ /*
+ * Windows 7 server can delay breaking level2 oplock if a write
+ * request comes - break it on the client to prevent reading
+		 * stale data.
+ */
+ cifs_zap_mapping(inode);
+ cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+ inode);
+ cinode->oplock = 0;
+ }
+out:
+ cifs_put_writer(cinode);
+ return written;
+}
- return cifs_user_writev(iocb, iov, nr_segs, pos);
+static struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
+{
+ struct cifs_readdata *rdata;
+
+ rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages),
+ GFP_KERNEL);
+ if (rdata != NULL) {
+ kref_init(&rdata->refcount);
+ INIT_LIST_HEAD(&rdata->list);
+ init_completion(&rdata->done);
+ INIT_WORK(&rdata->work, complete);
+ }
+
+ return rdata;
}
-static ssize_t
-cifs_iovec_read(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *poffset)
+void
+cifs_readdata_release(struct kref *refcount)
+{
+ struct cifs_readdata *rdata = container_of(refcount,
+ struct cifs_readdata, refcount);
+
+ if (rdata->cfile)
+ cifsFileInfo_put(rdata->cfile);
+
+ kfree(rdata);
+}
+
+static int
+cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages)
+{
+ int rc = 0;
+ struct page *page;
+ unsigned int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+ if (!page) {
+ rc = -ENOMEM;
+ break;
+ }
+ rdata->pages[i] = page;
+ }
+
+ if (rc) {
+ for (i = 0; i < nr_pages; i++) {
+ put_page(rdata->pages[i]);
+ rdata->pages[i] = NULL;
+ }
+ }
+ return rc;
+}
+
+static void
+cifs_uncached_readdata_release(struct kref *refcount)
+{
+ struct cifs_readdata *rdata = container_of(refcount,
+ struct cifs_readdata, refcount);
+ unsigned int i;
+
+ for (i = 0; i < rdata->nr_pages; i++) {
+ put_page(rdata->pages[i]);
+ rdata->pages[i] = NULL;
+ }
+ cifs_readdata_release(refcount);
+}
+
+static int
+cifs_retry_async_readv(struct cifs_readdata *rdata)
{
int rc;
- int xid;
- ssize_t total_read;
- unsigned int bytes_read = 0;
+ struct TCP_Server_Info *server;
+
+ server = tlink_tcon(rdata->cfile->tlink)->ses->server;
+
+ do {
+ if (rdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(rdata->cfile, true);
+ if (rc != 0)
+ continue;
+ }
+ rc = server->ops->async_readv(rdata);
+ } while (rc == -EAGAIN);
+
+ return rc;
+}
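
cifs_retry_async_readv() (like the write-side retry helper used earlier) wraps submission in a small loop: a stale handle is reopened first, and the request is resubmitted for as long as the result is -EAGAIN, while any other error ends the loop. A userspace sketch of the same control flow follows; reopen_handle() and submit_read() are hypothetical stand-ins for cifs_reopen_file() and the ->async_readv() op.

/*
 * Sketch of the retry-on-EAGAIN loop. reopen_handle() and submit_read()
 * are invented helpers standing in for cifs_reopen_file() / ->async_readv().
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct handle {
	bool invalid;
	int transient_failures;		/* how many -EAGAINs to simulate */
};

static int reopen_handle(struct handle *h)
{
	h->invalid = false;
	return 0;
}

static int submit_read(struct handle *h)
{
	if (h->transient_failures > 0) {
		h->transient_failures--;
		return -EAGAIN;
	}
	return 0;
}

static int retry_async_read(struct handle *h)
{
	int rc;

	do {
		if (h->invalid) {
			rc = reopen_handle(h);
			if (rc != 0)
				continue;	/* -EAGAIN loops again, a hard error ends the loop */
		}
		rc = submit_read(h);
	} while (rc == -EAGAIN);

	return rc;
}

int main(void)
{
	struct handle h = { .invalid = true, .transient_failures = 2 };

	printf("rc=%d\n", retry_async_read(&h));
	return 0;
}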
+
+/**
+ * cifs_readdata_to_iov - copy data from pages in response to an iovec
+ * @rdata: the readdata response with list of pages holding data
+ * @iter: destination for our data
+ *
+ * This function copies data from a list of pages in a readdata response into
+ * an array of iovecs. It will first calculate where the data should go
+ * based on the info in the readdata and then copy the data into that spot.
+ */
+static int
+cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
+{
+ size_t remaining = rdata->bytes;
+ unsigned int i;
+
+ for (i = 0; i < rdata->nr_pages; i++) {
+ struct page *page = rdata->pages[i];
+ size_t copy = min_t(size_t, remaining, PAGE_SIZE);
+ size_t written = copy_page_to_iter(page, 0, copy, iter);
+ remaining -= written;
+ if (written < copy && iov_iter_count(iter) > 0)
+ break;
+ }
+ return remaining ? -EFAULT : 0;
+}
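
A simplified userspace analogue of the copy loop in cifs_readdata_to_iov() is sketched below: memcpy() stands in for copy_page_to_iter(), a fixed-size destination plays the role of the iov_iter, and any bytes left over after a short copy are reported as -EFAULT. Buffer sizes are invented for the example.

/* Sketch only: sizes and the short-copy trigger are made up. */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096u

int main(void)
{
	char pages[3][PAGE_SIZE];		/* data received from the "server" */
	char dst[10000];			/* caller's buffer, smaller than 3 pages */
	size_t remaining = 3 * PAGE_SIZE;	/* bytes the response claims to hold */
	size_t space = sizeof(dst), off = 0;
	unsigned int i;

	memset(pages, 'x', sizeof(pages));

	for (i = 0; i < 3; i++) {
		size_t want = remaining < PAGE_SIZE ? remaining : PAGE_SIZE;
		size_t got = want < space ? want : space;	/* short copy possible */

		memcpy(dst + off, pages[i], got);
		off += got;
		space -= got;
		remaining -= got;
		if (got < want)			/* destination exhausted: stop copying */
			break;
	}

	printf("copied=%zu leftover=%zu -> %s\n", off, remaining,
	       remaining ? "-EFAULT" : "0");
	return 0;
}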
+
+static void
+cifs_uncached_readv_complete(struct work_struct *work)
+{
+ struct cifs_readdata *rdata = container_of(work,
+ struct cifs_readdata, work);
+
+ complete(&rdata->done);
+ kref_put(&rdata->refcount, cifs_uncached_readdata_release);
+}
+
+static int
+cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
+ struct cifs_readdata *rdata, unsigned int len)
+{
+ int total_read = 0, result = 0;
+ unsigned int i;
+ unsigned int nr_pages = rdata->nr_pages;
+ struct kvec iov;
+
+ rdata->tailsz = PAGE_SIZE;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = rdata->pages[i];
+
+ if (len >= PAGE_SIZE) {
+ /* enough data to fill the page */
+ iov.iov_base = kmap(page);
+ iov.iov_len = PAGE_SIZE;
+ cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
+ i, iov.iov_base, iov.iov_len);
+ len -= PAGE_SIZE;
+ } else if (len > 0) {
+ /* enough for partial page, fill and zero the rest */
+ iov.iov_base = kmap(page);
+ iov.iov_len = len;
+ cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
+ i, iov.iov_base, iov.iov_len);
+ memset(iov.iov_base + len, '\0', PAGE_SIZE - len);
+ rdata->tailsz = len;
+ len = 0;
+ } else {
+ /* no need to hold page hostage */
+ rdata->pages[i] = NULL;
+ rdata->nr_pages--;
+ put_page(page);
+ continue;
+ }
+
+ result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len);
+ kunmap(page);
+ if (result < 0)
+ break;
+
+ total_read += result;
+ }
+
+ return total_read > 0 ? total_read : result;
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t rc;
size_t len, cur_len;
- int iov_offset = 0;
+ ssize_t total_read = 0;
+ loff_t offset = iocb->ki_pos;
+ unsigned int npages;
struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *tcon;
struct cifsFileInfo *open_file;
- struct smb_com_read_rsp *pSMBr;
- char *read_data;
-
- if (!nr_segs)
- return 0;
+ struct cifs_readdata *rdata, *tmp;
+ struct list_head rdata_list;
+ pid_t pid;
- len = iov_length(iov, nr_segs);
+ len = iov_iter_count(to);
if (!len)
return 0;
- xid = GetXid();
+ INIT_LIST_HEAD(&rdata_list);
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-
open_file = file->private_data;
- pTcon = tlink_tcon(open_file->tlink);
+ tcon = tlink_tcon(open_file->tlink);
+
+ if (!tcon->ses->server->ops->async_readv)
+ return -ENOSYS;
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ pid = open_file->pid;
+ else
+ pid = current->tgid;
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cFYI(1, "attempting read on write only file instance");
+ cifs_dbg(FYI, "attempting read on write only file instance\n");
- for (total_read = 0; total_read < len; total_read += bytes_read) {
+ do {
cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
- rc = -EAGAIN;
- read_data = NULL;
+ npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
- while (rc == -EAGAIN) {
- int buf_type = CIFS_NO_BUFFER;
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc != 0)
- break;
- }
- rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
- cur_len, *poffset, &bytes_read,
- &read_data, &buf_type);
- pSMBr = (struct smb_com_read_rsp *)read_data;
- if (read_data) {
- char *data_offset = read_data + 4 +
- le16_to_cpu(pSMBr->DataOffset);
- if (memcpy_toiovecend(iov, data_offset,
- iov_offset, bytes_read))
- rc = -EFAULT;
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(read_data);
- read_data = NULL;
- iov_offset += bytes_read;
- }
+ /* allocate a readdata struct */
+ rdata = cifs_readdata_alloc(npages,
+ cifs_uncached_readv_complete);
+ if (!rdata) {
+ rc = -ENOMEM;
+ break;
}
- if (rc || (bytes_read == 0)) {
- if (total_read) {
- break;
+ rc = cifs_read_allocate_pages(rdata, npages);
+ if (rc)
+ goto error;
+
+ rdata->cfile = cifsFileInfo_get(open_file);
+ rdata->nr_pages = npages;
+ rdata->offset = offset;
+ rdata->bytes = cur_len;
+ rdata->pid = pid;
+ rdata->pagesz = PAGE_SIZE;
+ rdata->read_into_pages = cifs_uncached_read_into_pages;
+
+ rc = cifs_retry_async_readv(rdata);
+error:
+ if (rc) {
+ kref_put(&rdata->refcount,
+ cifs_uncached_readdata_release);
+ break;
+ }
+
+ list_add_tail(&rdata->list, &rdata_list);
+ offset += cur_len;
+ len -= cur_len;
+ } while (len > 0);
+
+ /* if at least one read request send succeeded, then reset rc */
+ if (!list_empty(&rdata_list))
+ rc = 0;
+
+ len = iov_iter_count(to);
+ /* the loop below should proceed in the order of increasing offsets */
+ list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+ again:
+ if (!rc) {
+ /* FIXME: freezable sleep too? */
+ rc = wait_for_completion_killable(&rdata->done);
+ if (rc)
+ rc = -EINTR;
+ else if (rdata->result) {
+ rc = rdata->result;
+ /* resend call if it's a retryable error */
+ if (rc == -EAGAIN) {
+ rc = cifs_retry_async_readv(rdata);
+ goto again;
+ }
} else {
- FreeXid(xid);
- return rc;
+ rc = cifs_readdata_to_iov(rdata, to);
}
- } else {
- cifs_stats_bytes_read(pTcon, bytes_read);
- *poffset += bytes_read;
+
}
+ list_del_init(&rdata->list);
+ kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
- FreeXid(xid);
- return total_read;
-}
+ total_read = len - iov_iter_count(to);
-ssize_t cifs_user_read(struct file *file, char __user *read_data,
- size_t read_size, loff_t *poffset)
-{
- struct iovec iov;
- iov.iov_base = read_data;
- iov.iov_len = read_size;
-
- return cifs_iovec_read(file, &iov, 1, poffset);
-}
+ cifs_stats_bytes_read(tcon, total_read);
-static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- ssize_t read;
-
- read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
- if (read > 0)
- iocb->ki_pos = pos;
+ /* mask nodata case */
+ if (rc == -ENODATA)
+ rc = 0;
- return read;
+ if (total_read) {
+ iocb->ki_pos += total_read;
+ return total_read;
+ }
+ return rc;
}
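
Both this uncached read path and the uncached write path earlier in the patch follow the same shape: issue every request asynchronously, chain the tracking structures on a local list, and only then wait for completions in increasing offset order. A much-reduced userspace sketch of that issue-then-reap pattern follows, with pthreads playing the part of the kernel work queue and struct completion; the resend-on-EAGAIN handling is omitted and all values are invented.

/* Sketch of "issue everything, then reap in offset order". */
#include <pthread.h>
#include <stdio.h>

struct req {
	long offset;		/* where this chunk lands in the file */
	long bytes;		/* how much the "server" accepted */
	int result;		/* 0 on success, negative errno otherwise */
	pthread_t worker;
};

static void *do_io(void *arg)
{
	struct req *r = arg;

	r->result = 0;		/* pretend the I/O succeeded */
	return NULL;
}

int main(void)
{
	struct req reqs[4];
	long total = 0;
	int rc = 0, i;

	/* issue phase: kick off every request without waiting */
	for (i = 0; i < 4; i++) {
		reqs[i].offset = i * 4096L;
		reqs[i].bytes = 4096;
		pthread_create(&reqs[i].worker, NULL, do_io, &reqs[i]);
	}

	/* reap phase: wait for each request in increasing offset order */
	for (i = 0; i < 4; i++) {
		pthread_join(reqs[i].worker, NULL);
		if (!rc) {
			if (reqs[i].result)
				rc = reqs[i].result;	/* first failure wins */
			else
				total += reqs[i].bytes;
		}
	}

	printf("total=%ld rc=%d\n", total, rc);
	return 0;
}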
-ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t
+cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
{
- struct inode *inode;
-
- inode = iocb->ki_filp->f_path.dentry->d_inode;
-
- if (CIFS_I(inode)->clientCanCacheRead)
- return generic_file_aio_read(iocb, iov, nr_segs, pos);
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *)
+ iocb->ki_filp->private_data;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ int rc = -EACCES;
/*
* In strict cache mode we need to read from the server all the time
@@ -1935,49 +2961,84 @@ ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
* on pages affected by this read but not on the region from pos to
* pos+len-1.
*/
+ if (!CIFS_CACHE_READ(cinode))
+ return cifs_user_readv(iocb, to);
+
+ if (cap_unix(tcon->ses) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ return generic_file_read_iter(iocb, to);
- return cifs_user_readv(iocb, iov, nr_segs, pos);
+ /*
+ * We need to hold the sem to be sure nobody modifies lock list
+ * with a brlock that prevents reading.
+ */
+ down_read(&cinode->lock_sem);
+ if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
+ tcon->ses->server->vals->shared_lock_type,
+ NULL, CIFS_READ_OP))
+ rc = generic_file_read_iter(iocb, to);
+ up_read(&cinode->lock_sem);
+ return rc;
}
-static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
- loff_t *poffset)
+static ssize_t
+cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
{
int rc = -EACCES;
unsigned int bytes_read = 0;
unsigned int total_read;
unsigned int current_read_size;
+ unsigned int rsize;
struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
- int xid;
- char *current_offset;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ unsigned int xid;
+ char *cur_offset;
struct cifsFileInfo *open_file;
+ struct cifs_io_parms io_parms;
int buf_type = CIFS_NO_BUFFER;
+ __u32 pid;
- xid = GetXid();
+ xid = get_xid();
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ /* FIXME: set up handlers for larger reads and/or convert to async */
+ rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
+
if (file->private_data == NULL) {
rc = -EBADF;
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
open_file = file->private_data;
- pTcon = tlink_tcon(open_file->tlink);
+ tcon = tlink_tcon(open_file->tlink);
+ server = tcon->ses->server;
+
+ if (!server->ops->sync_read) {
+ free_xid(xid);
+ return -ENOSYS;
+ }
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ pid = open_file->pid;
+ else
+ pid = current->tgid;
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cFYI(1, "attempting read on write only file instance");
-
- for (total_read = 0, current_offset = read_data;
- read_size > total_read;
- total_read += bytes_read, current_offset += bytes_read) {
- current_read_size = min_t(const int, read_size - total_read,
- cifs_sb->rsize);
- /* For windows me and 9x we do not want to request more
- than it negotiated since it will refuse the read then */
- if ((pTcon->ses) &&
- !(pTcon->ses->capabilities & CAP_LARGE_FILES)) {
- current_read_size = min_t(const int, current_read_size,
- pTcon->ses->server->maxBuf - 128);
+ cifs_dbg(FYI, "attempting read on write only file instance\n");
+
+ for (total_read = 0, cur_offset = read_data; read_size > total_read;
+ total_read += bytes_read, cur_offset += bytes_read) {
+ current_read_size = min_t(uint, read_size - total_read, rsize);
+ /*
+ * For windows me and 9x we do not want to request more than it
+ * negotiated since it will refuse the read then.
+ */
+ if ((tcon->ses) && !(tcon->ses->capabilities &
+ tcon->ses->server->vals->cap_large_files)) {
+ current_read_size = min_t(uint, current_read_size,
+ CIFSMaxBufSize);
}
rc = -EAGAIN;
while (rc == -EAGAIN) {
@@ -1986,40 +3047,67 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
if (rc != 0)
break;
}
- rc = CIFSSMBRead(xid, pTcon,
- open_file->netfid,
- current_read_size, *poffset,
- &bytes_read, &current_offset,
- &buf_type);
+ io_parms.pid = pid;
+ io_parms.tcon = tcon;
+ io_parms.offset = *offset;
+ io_parms.length = current_read_size;
+ rc = server->ops->sync_read(xid, open_file, &io_parms,
+ &bytes_read, &cur_offset,
+ &buf_type);
}
if (rc || (bytes_read == 0)) {
if (total_read) {
break;
} else {
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
} else {
- cifs_stats_bytes_read(pTcon, total_read);
- *poffset += bytes_read;
+ cifs_stats_bytes_read(tcon, total_read);
+ *offset += bytes_read;
}
}
- FreeXid(xid);
+ free_xid(xid);
return total_read;
}
+/*
+ * If the page is mmap'ed into a process' page tables, then we need to make
+ * sure that it doesn't change while being written back.
+ */
+static int
+cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+
+ lock_page(page);
+ return VM_FAULT_LOCKED;
+}
+
+static struct vm_operations_struct cifs_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = cifs_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
+};
+
int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
{
int rc, xid;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
- xid = GetXid();
+ xid = get_xid();
- if (!CIFS_I(inode)->clientCanCacheRead)
- cifs_invalidate_mapping(inode);
+ if (!CIFS_CACHE_READ(CIFS_I(inode))) {
+ rc = cifs_zap_mapping(inode);
+ if (rc)
+ return rc;
+ }
rc = generic_file_mmap(file, vma);
- FreeXid(xid);
+ if (rc == 0)
+ vma->vm_ops = &cifs_file_vm_ops;
+ free_xid(xid);
return rc;
}
@@ -2027,216 +3115,283 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
int rc, xid;
- xid = GetXid();
+ xid = get_xid();
rc = cifs_revalidate_file(file);
if (rc) {
- cFYI(1, "Validation prior to mmap failed, error=%d", rc);
- FreeXid(xid);
+ cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n",
+ rc);
+ free_xid(xid);
return rc;
}
rc = generic_file_mmap(file, vma);
- FreeXid(xid);
+ if (rc == 0)
+ vma->vm_ops = &cifs_file_vm_ops;
+ free_xid(xid);
return rc;
}
-
-static void cifs_copy_cache_pages(struct address_space *mapping,
- struct list_head *pages, int bytes_read, char *data)
+static void
+cifs_readv_complete(struct work_struct *work)
{
- struct page *page;
- char *target;
+ unsigned int i;
+ struct cifs_readdata *rdata = container_of(work,
+ struct cifs_readdata, work);
- while (bytes_read > 0) {
- if (list_empty(pages))
- break;
+ for (i = 0; i < rdata->nr_pages; i++) {
+ struct page *page = rdata->pages[i];
- page = list_entry(pages->prev, struct page, lru);
- list_del(&page->lru);
+ lru_cache_add_file(page);
- if (add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL)) {
- page_cache_release(page);
- cFYI(1, "Add page cache failed");
- data += PAGE_CACHE_SIZE;
- bytes_read -= PAGE_CACHE_SIZE;
- continue;
+ if (rdata->result == 0) {
+ flush_dcache_page(page);
+ SetPageUptodate(page);
}
- page_cache_release(page);
- target = kmap_atomic(page, KM_USER0);
+ unlock_page(page);
+
+ if (rdata->result == 0)
+ cifs_readpage_to_fscache(rdata->mapping->host, page);
+
+ page_cache_release(page);
+ rdata->pages[i] = NULL;
+ }
+ kref_put(&rdata->refcount, cifs_readdata_release);
+}
- if (PAGE_CACHE_SIZE > bytes_read) {
- memcpy(target, data, bytes_read);
- /* zero the tail end of this partial page */
- memset(target + bytes_read, 0,
- PAGE_CACHE_SIZE - bytes_read);
- bytes_read = 0;
+static int
+cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
+ struct cifs_readdata *rdata, unsigned int len)
+{
+ int total_read = 0, result = 0;
+ unsigned int i;
+ u64 eof;
+ pgoff_t eof_index;
+ unsigned int nr_pages = rdata->nr_pages;
+ struct kvec iov;
+
+ /* determine the eof that the server (probably) has */
+ eof = CIFS_I(rdata->mapping->host)->server_eof;
+ eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+ cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
+
+ rdata->tailsz = PAGE_CACHE_SIZE;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = rdata->pages[i];
+
+ if (len >= PAGE_CACHE_SIZE) {
+ /* enough data to fill the page */
+ iov.iov_base = kmap(page);
+ iov.iov_len = PAGE_CACHE_SIZE;
+ cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
+ i, page->index, iov.iov_base, iov.iov_len);
+ len -= PAGE_CACHE_SIZE;
+ } else if (len > 0) {
+ /* enough for partial page, fill and zero the rest */
+ iov.iov_base = kmap(page);
+ iov.iov_len = len;
+ cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
+ i, page->index, iov.iov_base, iov.iov_len);
+ memset(iov.iov_base + len,
+ '\0', PAGE_CACHE_SIZE - len);
+ rdata->tailsz = len;
+ len = 0;
+ } else if (page->index > eof_index) {
+ /*
+ * The VFS will not try to do readahead past the
+ * i_size, but it's possible that we have outstanding
+ * writes with gaps in the middle and the i_size hasn't
+ * caught up yet. Populate those with zeroed out pages
+ * to prevent the VFS from repeatedly attempting to
+ * fill them until the writes are flushed.
+ */
+ zero_user(page, 0, PAGE_CACHE_SIZE);
+ lru_cache_add_file(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ page_cache_release(page);
+ rdata->pages[i] = NULL;
+ rdata->nr_pages--;
+ continue;
} else {
- memcpy(target, data, PAGE_CACHE_SIZE);
- bytes_read -= PAGE_CACHE_SIZE;
+ /* no need to hold page hostage */
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
+ rdata->pages[i] = NULL;
+ rdata->nr_pages--;
+ continue;
}
- kunmap_atomic(target, KM_USER0);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- data += PAGE_CACHE_SIZE;
+ result = cifs_readv_from_socket(server, &iov, 1, iov.iov_len);
+ kunmap(page);
+ if (result < 0)
+ break;
- /* add page to FS-Cache */
- cifs_readpage_to_fscache(mapping->host, page);
+ total_read += result;
}
- return;
+
+ return total_read > 0 ? total_read : result;
}
static int cifs_readpages(struct file *file, struct address_space *mapping,
struct list_head *page_list, unsigned num_pages)
{
- int rc = -EACCES;
- int xid;
- loff_t offset;
- struct page *page;
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
- unsigned int bytes_read = 0;
- unsigned int read_size, i;
- char *smb_read_data = NULL;
- struct smb_com_read_rsp *pSMBr;
- struct cifsFileInfo *open_file;
- int buf_type = CIFS_NO_BUFFER;
+ int rc;
+ struct list_head tmplist;
+ struct cifsFileInfo *open_file = file->private_data;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ unsigned int rsize = cifs_sb->rsize;
+ pid_t pid;
- xid = GetXid();
- if (file->private_data == NULL) {
- rc = -EBADF;
- FreeXid(xid);
- return rc;
- }
- open_file = file->private_data;
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- pTcon = tlink_tcon(open_file->tlink);
+ /*
+ * Give up immediately if rsize is too small to read an entire page.
+ * The VFS will fall back to readpage. We should never reach this
+ * point however since we set ra_pages to 0 when the rsize is smaller
+ * than a cache page.
+ */
+ if (unlikely(rsize < PAGE_CACHE_SIZE))
+ return 0;
/*
* Reads as many pages as possible from fscache. Returns -ENOBUFS
* immediately if the cookie is negative
+ *
+ * After this point, every page in the list might have PG_fscache set,
+ * so we will need to clean that up off of every page we don't use.
*/
rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
&num_pages);
if (rc == 0)
- goto read_complete;
+ return rc;
- cFYI(DBG2, "rpages: num pages %d", num_pages);
- for (i = 0; i < num_pages; ) {
- unsigned contig_pages;
- struct page *tmp_page;
- unsigned long expected_index;
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+ pid = open_file->pid;
+ else
+ pid = current->tgid;
- if (list_empty(page_list))
- break;
+ rc = 0;
+ INIT_LIST_HEAD(&tmplist);
+
+ cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
+ __func__, file, mapping, num_pages);
+
+ /*
+ * Start with the page at end of list and move it to private
+ * list. Do the same with any following pages until we hit
+ * the rsize limit, hit an index discontinuity, or run out of
+ * pages. Issue the async read and then start the loop again
+ * until the list is empty.
+ *
+ * Note that list order is important. The page_list is in
+ * the order of declining indexes. When we put the pages in
+ * the rdata->pages, then we want them in increasing order.
+ */
+ while (!list_empty(page_list)) {
+ unsigned int i;
+ unsigned int bytes = PAGE_CACHE_SIZE;
+ unsigned int expected_index;
+ unsigned int nr_pages = 1;
+ loff_t offset;
+ struct page *page, *tpage;
+ struct cifs_readdata *rdata;
page = list_entry(page_list->prev, struct page, lru);
+
+ /*
+ * Lock the page and put it in the cache. Since no one else
+ * should have access to this page, we're safe to simply set
+ * PG_locked without checking it first.
+ */
+ __set_page_locked(page);
+ rc = add_to_page_cache_locked(page, mapping,
+ page->index, GFP_KERNEL);
+
+ /* give up if we can't stick it in the cache */
+ if (rc) {
+ __clear_page_locked(page);
+ break;
+ }
+
+ /* move first page to the tmplist */
offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ list_move_tail(&page->lru, &tmplist);
- /* count adjacent pages that we will read into */
- contig_pages = 0;
- expected_index =
- list_entry(page_list->prev, struct page, lru)->index;
- list_for_each_entry_reverse(tmp_page, page_list, lru) {
- if (tmp_page->index == expected_index) {
- contig_pages++;
- expected_index++;
- } else
+ /* now try and add more pages onto the request */
+ expected_index = page->index + 1;
+ list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
+ /* discontinuity ? */
+ if (page->index != expected_index)
break;
- }
- if (contig_pages + i > num_pages)
- contig_pages = num_pages - i;
-
- /* for reads over a certain size could initiate async
- read ahead */
-
- read_size = contig_pages * PAGE_CACHE_SIZE;
- /* Read size needs to be in multiples of one page */
- read_size = min_t(const unsigned int, read_size,
- cifs_sb->rsize & PAGE_CACHE_MASK);
- cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d",
- read_size, contig_pages);
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc != 0)
- break;
- }
- rc = CIFSSMBRead(xid, pTcon,
- open_file->netfid,
- read_size, offset,
- &bytes_read, &smb_read_data,
- &buf_type);
- /* BB more RC checks ? */
- if (rc == -EAGAIN) {
- if (smb_read_data) {
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(smb_read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(smb_read_data);
- smb_read_data = NULL;
- }
+ /* would this page push the read over the rsize? */
+ if (bytes + PAGE_CACHE_SIZE > rsize)
+ break;
+
+ __set_page_locked(page);
+ if (add_to_page_cache_locked(page, mapping,
+ page->index, GFP_KERNEL)) {
+ __clear_page_locked(page);
+ break;
}
+ list_move_tail(&page->lru, &tmplist);
+ bytes += PAGE_CACHE_SIZE;
+ expected_index++;
+ nr_pages++;
}
- if ((rc < 0) || (smb_read_data == NULL)) {
- cFYI(1, "Read error in readpages: %d", rc);
- break;
- } else if (bytes_read > 0) {
- task_io_account_read(bytes_read);
- pSMBr = (struct smb_com_read_rsp *)smb_read_data;
- cifs_copy_cache_pages(mapping, page_list, bytes_read,
- smb_read_data + 4 /* RFC1001 hdr */ +
- le16_to_cpu(pSMBr->DataOffset));
-
- i += bytes_read >> PAGE_CACHE_SHIFT;
- cifs_stats_bytes_read(pTcon, bytes_read);
- if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) {
- i++; /* account for partial page */
-
- /* server copy of file can have smaller size
- than client */
- /* BB do we need to verify this common case ?
- this case is ok - if we are at server EOF
- we will hit it on next read */
-
- /* break; */
+
+ rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
+ if (!rdata) {
+ /* best to give up if we're out of mem */
+ list_for_each_entry_safe(page, tpage, &tmplist, lru) {
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
}
- } else {
- cFYI(1, "No bytes read (%d) at offset %lld . "
- "Cleaning remaining pages from readahead list",
- bytes_read, offset);
- /* BB turn off caching and do new lookup on
- file size at server? */
+ rc = -ENOMEM;
break;
}
- if (smb_read_data) {
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(smb_read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(smb_read_data);
- smb_read_data = NULL;
+
+ rdata->cfile = cifsFileInfo_get(open_file);
+ rdata->mapping = mapping;
+ rdata->offset = offset;
+ rdata->bytes = bytes;
+ rdata->pid = pid;
+ rdata->pagesz = PAGE_CACHE_SIZE;
+ rdata->read_into_pages = cifs_readpages_read_into_pages;
+
+ list_for_each_entry_safe(page, tpage, &tmplist, lru) {
+ list_del(&page->lru);
+ rdata->pages[rdata->nr_pages++] = page;
+ }
+
+ rc = cifs_retry_async_readv(rdata);
+ if (rc != 0) {
+ for (i = 0; i < rdata->nr_pages; i++) {
+ page = rdata->pages[i];
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ kref_put(&rdata->refcount, cifs_readdata_release);
+ break;
}
- bytes_read = 0;
- }
-/* need to free smb_read_data buf before exit */
- if (smb_read_data) {
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(smb_read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(smb_read_data);
- smb_read_data = NULL;
+ kref_put(&rdata->refcount, cifs_readdata_release);
}
-read_complete:
- FreeXid(xid);
+ /* Any pages that have been shown to fscache but didn't get added to
+ * the pagecache must be uncached before they get returned to the
+ * allocator.
+ */
+ cifs_fscache_readpages_cancel(mapping->host, page_list);
return rc;
}
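
The loop in cifs_readpages() above builds each request by taking pages with consecutive indexes until the next page would push the request past rsize, then issues one async read per batch. The same grouping rule is shown below reduced to a plain array of indexes; the kernel walks the page list back-to-front, but ascending order is used here for brevity, and the index and rsize values are invented.

/* Sketch of the contiguous-index, rsize-bounded batching rule. */
#include <stdio.h>

#define PAGE_CACHE_SIZE 4096u

int main(void)
{
	unsigned int indexes[] = { 10, 11, 12, 14, 15 };	/* note the gap at 13 */
	unsigned int n = sizeof(indexes) / sizeof(indexes[0]);
	unsigned int rsize = 3 * PAGE_CACHE_SIZE;		/* assumed read size */
	unsigned int i = 0;

	while (i < n) {
		unsigned int bytes = PAGE_CACHE_SIZE;
		unsigned int expected = indexes[i] + 1;
		unsigned int nr_pages = 1;

		/* extend the batch while indexes stay contiguous and within rsize */
		while (i + nr_pages < n &&
		       indexes[i + nr_pages] == expected &&
		       bytes + PAGE_CACHE_SIZE <= rsize) {
			bytes += PAGE_CACHE_SIZE;
			expected++;
			nr_pages++;
		}
		printf("request: first index %u, %u pages, %u bytes\n",
		       indexes[i], nr_pages, bytes);
		i += nr_pages;
	}
	return 0;
}

With these example values the sketch emits two requests: indexes 10-12 capped by rsize, then 14-15 after the discontinuity, mirroring how tmplist batches are formed before cifs_retry_async_readv() is called.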
+/*
+ * cifs_readpage_worker must be called with the page pinned
+ */
static int cifs_readpage_worker(struct file *file, struct page *page,
loff_t *poffset)
{
@@ -2244,11 +3399,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
int rc;
/* Is the page cached? */
- rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
+ rc = cifs_readpage_from_fscache(file_inode(file), page);
if (rc == 0)
goto read_complete;
- page_cache_get(page);
read_data = kmap(page);
/* for reads over a certain size could initiate async read ahead */
@@ -2257,10 +3411,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
if (rc < 0)
goto io_error;
else
- cFYI(1, "Bytes read %d", rc);
+ cifs_dbg(FYI, "Bytes read %d\n", rc);
- file->f_path.dentry->d_inode->i_atime =
- current_fs_time(file->f_path.dentry->d_inode->i_sb);
+ file_inode(file)->i_atime =
+ current_fs_time(file_inode(file)->i_sb);
if (PAGE_CACHE_SIZE > rc)
memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc);
@@ -2269,13 +3423,13 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
SetPageUptodate(page);
/* send this page to the cache */
- cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
+ cifs_readpage_to_fscache(file_inode(file), page);
rc = 0;
io_error:
kunmap(page);
- page_cache_release(page);
+ unlock_page(page);
read_complete:
return rc;
@@ -2285,24 +3439,22 @@ static int cifs_readpage(struct file *file, struct page *page)
{
loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
int rc = -EACCES;
- int xid;
+ unsigned int xid;
- xid = GetXid();
+ xid = get_xid();
if (file->private_data == NULL) {
rc = -EBADF;
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
- cFYI(1, "readpage %p at offset %d 0x%x\n",
+ cifs_dbg(FYI, "readpage %p at offset %d 0x%x\n",
page, (int)offset, (int)offset);
rc = cifs_readpage_worker(file, page, &offset);
- unlock_page(page);
-
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
@@ -2355,6 +3507,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
+ int oncethru = 0;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
loff_t page_start = pos & PAGE_MASK;
@@ -2362,8 +3515,9 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
int rc = 0;
- cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
+ cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
+start:
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
rc = -ENOMEM;
@@ -2387,7 +3541,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
* is, when the page lies beyond the EOF, or straddles the EOF
* and the write will cover all of the existing data.
*/
- if (CIFS_I(mapping->host)->clientCanCacheRead) {
+ if (CIFS_CACHE_READ(CIFS_I(mapping->host))) {
i_size = i_size_read(mapping->host);
if (page_start >= i_size ||
(offset == 0 && (pos + len) >= i_size)) {
@@ -2405,13 +3559,16 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
}
}
- if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ if ((file->f_flags & O_ACCMODE) != O_WRONLY && !oncethru) {
/*
* might as well read a page, it is fast enough. If we get
* an error, we don't need to return it. cifs_write_end will
* do a sync write instead since PG_uptodate isn't set.
*/
cifs_readpage_worker(file, page, &page_start);
+ page_cache_release(page);
+ oncethru = 1;
+ goto start;
} else {
/* we could try using another file handle if there is one -
but how would we lock it to prevent close of that handle
@@ -2431,36 +3588,84 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
return cifs_fscache_release_page(page, gfp);
}
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
}
+static int cifs_launder_page(struct page *page)
+{
+ int rc = 0;
+ loff_t range_start = page_offset(page);
+ loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 0,
+ .range_start = range_start,
+ .range_end = range_end,
+ };
+
+ cifs_dbg(FYI, "Launder page: %p\n", page);
+
+ if (clear_page_dirty_for_io(page))
+ rc = cifs_writepage_locked(page, &wbc);
+
+ cifs_fscache_invalidate_page(page, page->mapping->host);
+ return rc;
+}
+
+static int
+cifs_pending_writers_wait(void *unused)
+{
+ schedule();
+ return 0;
+}
+
void cifs_oplock_break(struct work_struct *work)
{
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
oplock_break);
struct inode *inode = cfile->dentry->d_inode;
struct cifsInodeInfo *cinode = CIFS_I(inode);
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
+ wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
+ cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE);
+
+ server->ops->downgrade_oplock(server, cinode,
+ test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
+
+ if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
+ cifs_has_mand_locks(cinode)) {
+ cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
+ inode);
+ cinode->oplock = 0;
+ }
+
if (inode && S_ISREG(inode->i_mode)) {
- if (cinode->clientCanCacheRead)
+ if (CIFS_CACHE_READ(cinode))
break_lease(inode, O_RDONLY);
else
break_lease(inode, O_WRONLY);
rc = filemap_fdatawrite(inode->i_mapping);
- if (cinode->clientCanCacheRead == 0) {
+ if (!CIFS_CACHE_READ(cinode)) {
rc = filemap_fdatawait(inode->i_mapping);
mapping_set_error(inode->i_mapping, rc);
- invalidate_remote_inode(inode);
+ cifs_zap_mapping(inode);
}
- cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
+ cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc);
}
+ rc = cifs_push_locks(cfile);
+ if (rc)
+ cifs_dbg(VFS, "Push locks rc = %d\n", rc);
+
/*
* releasing stale oplock after recent reconnect of smb session using
* a now incorrect file handle is not a data integrity issue but do
@@ -2468,37 +3673,33 @@ void cifs_oplock_break(struct work_struct *work)
* disconnected since oplock already released by the server
*/
if (!cfile->oplock_break_cancelled) {
- rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
- 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
- cinode->clientCanCacheRead ? 1 : 0);
- cFYI(1, "Oplock release rc = %d", rc);
+ rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid,
+ cinode);
+ cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
-
- /*
- * We might have kicked in before is_valid_oplock_break()
- * finished grabbing reference for us. Make sure it's done by
- * waiting for cifs_file_list_lock.
- */
- spin_lock(&cifs_file_list_lock);
- spin_unlock(&cifs_file_list_lock);
-
- cifs_oplock_break_put(cfile);
+ cifs_done_oplock_break(cinode);
}
-/* must be called while holding cifs_file_list_lock */
-void cifs_oplock_break_get(struct cifsFileInfo *cfile)
+/*
+ * The presence of cifs_direct_io() in the address space ops vector
+ * allows open() O_DIRECT flags which would have failed otherwise.
+ *
+ * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests
+ * so this method should never be called.
+ *
+ * Direct IO is not yet supported in the cached mode.
+ */
+static ssize_t
+cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos)
{
- cifs_sb_active(cfile->dentry->d_sb);
- cifsFileInfo_get(cfile);
+ /*
+ * FIXME
+ * Eventually need to support direct IO for non forcedirectio mounts
+ */
+ return -EINVAL;
}
-void cifs_oplock_break_put(struct cifsFileInfo *cfile)
-{
- struct super_block *sb = cfile->dentry->d_sb;
-
- cifsFileInfo_put(cfile);
- cifs_sb_deactive(sb);
-}
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
@@ -2509,9 +3710,9 @@ const struct address_space_operations cifs_addr_ops = {
.write_end = cifs_write_end,
.set_page_dirty = __set_page_dirty_nobuffers,
.releasepage = cifs_release_page,
+ .direct_IO = cifs_direct_io,
.invalidatepage = cifs_invalidate_page,
- /* .sync_page = cifs_sync_page, */
- /* .direct_IO = */
+ .launder_page = cifs_launder_page,
};
/*
@@ -2528,6 +3729,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.set_page_dirty = __set_page_dirty_nobuffers,
.releasepage = cifs_release_page,
.invalidatepage = cifs_invalidate_page,
- /* .sync_page = cifs_sync_page, */
- /* .direct_IO = */
+ .launder_page = cifs_launder_page,
};
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 297a43d0ff7..8d4b7bc8ae9 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -27,33 +27,33 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
{
server->fscache =
fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
- &cifs_fscache_server_index_def, server);
- cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server,
- server->fscache);
+ &cifs_fscache_server_index_def, server, true);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+ __func__, server, server->fscache);
}
void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
{
- cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server,
- server->fscache);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+ __func__, server, server->fscache);
fscache_relinquish_cookie(server->fscache, 0);
server->fscache = NULL;
}
-void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
+void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
{
struct TCP_Server_Info *server = tcon->ses->server;
tcon->fscache =
fscache_acquire_cookie(server->fscache,
- &cifs_fscache_super_index_def, tcon);
- cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)",
- server->fscache, tcon->fscache);
+ &cifs_fscache_super_index_def, tcon, true);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+ __func__, server->fscache, tcon->fscache);
}
-void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon)
+void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
{
- cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache);
+ cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache);
fscache_relinquish_cookie(tcon->fscache, 0);
tcon->fscache = NULL;
}
@@ -62,16 +62,16 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
if (cifsi->fscache)
return;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
- &cifs_fscache_inode_object_def, cifsi);
- cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
- cifsi->fscache);
+ &cifs_fscache_inode_object_def, cifsi, true);
+ cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
+ __func__, tcon->fscache, cifsi->fscache);
}
}
@@ -80,8 +80,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
if (cifsi->fscache) {
- cFYI(1, "CIFS releasing inode cookie (0x%p)",
- cifsi->fscache);
+ cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
fscache_relinquish_cookie(cifsi->fscache, 0);
cifsi->fscache = NULL;
}
@@ -92,8 +91,8 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
if (cifsi->fscache) {
- cFYI(1, "CIFS disabling inode cookie (0x%p)",
- cifsi->fscache);
+ cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
+ fscache_uncache_all_inode_pages(cifsi->fscache, inode);
fscache_relinquish_cookie(cifsi->fscache, 1);
cifsi->fscache = NULL;
}
@@ -120,9 +119,9 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
cifsi->fscache = fscache_acquire_cookie(
cifs_sb_master_tcon(cifs_sb)->fscache,
&cifs_fscache_inode_object_def,
- cifsi);
- cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
- cifsi->fscache, old);
+ cifsi, true);
+ cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
+ __func__, cifsi->fscache, old);
}
}
@@ -132,8 +131,8 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp)
struct inode *inode = page->mapping->host;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
- cFYI(1, "CIFS: fscache release page (0x%p/0x%p)",
- page, cifsi->fscache);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+ __func__, page, cifsi->fscache);
if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
return 0;
}
@@ -144,8 +143,7 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp)
static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
int error)
{
- cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)",
- page, error);
+ cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error);
if (!error)
SetPageUptodate(page);
unlock_page(page);
@@ -158,8 +156,8 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
{
int ret;
- cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p",
- CIFS_I(inode)->fscache, page, inode);
+ cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
+ __func__, CIFS_I(inode)->fscache, page, inode);
ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
cifs_readpage_from_fscache_complete,
NULL,
@@ -167,15 +165,15 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
switch (ret) {
case 0: /* page found in fscache, read submitted */
- cFYI(1, "CIFS: readpage_from_fscache: submitted");
+ cifs_dbg(FYI, "%s: submitted\n", __func__);
return ret;
case -ENOBUFS: /* page won't be cached */
case -ENODATA: /* page not in cache */
- cFYI(1, "CIFS: readpage_from_fscache %d", ret);
+ cifs_dbg(FYI, "%s: %d\n", __func__, ret);
return 1;
default:
- cERROR(1, "unknown error ret = %d", ret);
+ cifs_dbg(VFS, "unknown error ret = %d\n", ret);
}
return ret;
}
@@ -190,8 +188,8 @@ int __cifs_readpages_from_fscache(struct inode *inode,
{
int ret;
- cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)",
- CIFS_I(inode)->fscache, *nr_pages, inode);
+ cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n",
+ __func__, CIFS_I(inode)->fscache, *nr_pages, inode);
ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
pages, nr_pages,
cifs_readpage_from_fscache_complete,
@@ -199,16 +197,16 @@ int __cifs_readpages_from_fscache(struct inode *inode,
mapping_gfp_mask(mapping));
switch (ret) {
case 0: /* read submitted to the cache for all pages */
- cFYI(1, "CIFS: readpages_from_fscache: submitted");
+ cifs_dbg(FYI, "%s: submitted\n", __func__);
return ret;
case -ENOBUFS: /* some pages are not cached and can't be */
case -ENODATA: /* some pages are not cached */
- cFYI(1, "CIFS: readpages_from_fscache: no page");
+ cifs_dbg(FYI, "%s: no page\n", __func__);
return 1;
default:
- cFYI(1, "unknown error ret = %d", ret);
+ cifs_dbg(FYI, "unknown error ret = %d\n", ret);
}
return ret;
@@ -218,19 +216,26 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
{
int ret;
- cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p",
- CIFS_I(inode)->fscache, page, inode);
+ cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
+ __func__, CIFS_I(inode)->fscache, page, inode);
ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
if (ret != 0)
fscache_uncache_page(CIFS_I(inode)->fscache, page);
}
+void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages)
+{
+ cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n",
+ __func__, CIFS_I(inode)->fscache, inode);
+ fscache_readpages_cancel(CIFS_I(inode)->fscache, pages);
+}
+
void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct fscache_cookie *cookie = cifsi->fscache;
- cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie);
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
fscache_wait_on_page_write(cookie, page);
fscache_uncache_page(cookie, page);
}
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 31b88ec2341..24794b6cd8e 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -40,8 +40,8 @@ extern void cifs_fscache_unregister(void);
*/
extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
-extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *);
-extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *);
+extern void cifs_fscache_get_super_cookie(struct cifs_tcon *);
+extern void cifs_fscache_release_super_cookie(struct cifs_tcon *);
extern void cifs_fscache_release_inode_cookie(struct inode *);
extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
@@ -54,6 +54,7 @@ extern int __cifs_readpages_from_fscache(struct inode *,
struct address_space *,
struct list_head *,
unsigned *);
+extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *);
extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
@@ -91,6 +92,13 @@ static inline void cifs_readpage_to_fscache(struct inode *inode,
__cifs_readpage_to_fscache(inode, page);
}
+static inline void cifs_fscache_readpages_cancel(struct inode *inode,
+ struct list_head *pages)
+{
+ if (CIFS_I(inode)->fscache)
+ return __cifs_fscache_readpages_cancel(inode, pages);
+}
+
#else /* CONFIG_CIFS_FSCACHE */
static inline int cifs_fscache_register(void) { return 0; }
static inline void cifs_fscache_unregister(void) {}
@@ -99,9 +107,9 @@ static inline void
cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
static inline void
cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
-static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {}
+static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {}
static inline void
-cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {}
+cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {}
static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
@@ -131,6 +139,11 @@ static inline int cifs_readpages_from_fscache(struct inode *inode,
static inline void cifs_readpage_to_fscache(struct inode *inode,
struct page *page) {}
+static inline void cifs_fscache_readpages_cancel(struct inode *inode,
+ struct list_head *pages)
+{
+}
+
#endif /* CONFIG_CIFS_FSCACHE */
#endif /* _CIFS_FSCACHE_H */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8852470b4fb..a174605f6af 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -22,6 +22,7 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/freezer.h>
#include <asm/div64.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -91,31 +92,60 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
- cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
+ cifs_dbg(FYI, "%s: revalidating inode %llu\n",
+ __func__, cifs_i->uniqueid);
if (inode->i_state & I_NEW) {
- cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
+ cifs_dbg(FYI, "%s: inode %llu is new\n",
+ __func__, cifs_i->uniqueid);
return;
}
/* don't bother with revalidation if we have an oplock */
- if (cifs_i->clientCanCacheRead) {
- cFYI(1, "%s: inode %llu is oplocked", __func__,
- cifs_i->uniqueid);
+ if (CIFS_CACHE_READ(cifs_i)) {
+ cifs_dbg(FYI, "%s: inode %llu is oplocked\n",
+ __func__, cifs_i->uniqueid);
return;
}
/* revalidate if mtime or size have changed */
if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
cifs_i->server_eof == fattr->cf_eof) {
- cFYI(1, "%s: inode %llu is unchanged", __func__,
- cifs_i->uniqueid);
+ cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
+ __func__, cifs_i->uniqueid);
return;
}
- cFYI(1, "%s: invalidating inode %llu mapping", __func__,
- cifs_i->uniqueid);
- cifs_i->invalid_mapping = true;
+ cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n",
+ __func__, cifs_i->uniqueid);
+ set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags);
+}
+
+/*
+ * copy nlink to the inode, unless it wasn't provided. Provide
+ * sane values if we don't have an existing one and none was provided
+ */
+static void
+cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
+{
+ /*
+ * if we're in a situation where we can't trust what we
+ * got from the server (readdir, some non-unix cases)
+ * fake reasonable values
+ */
+ if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) {
+ /* only provide fake values on a new inode */
+ if (inode->i_state & I_NEW) {
+ if (fattr->cf_cifsattrs & ATTR_DIRECTORY)
+ set_nlink(inode, 2);
+ else
+ set_nlink(inode, 1);
+ }
+ return;
+ }
+
+ /* we trust the server, so update it */
+ set_nlink(inode, fattr->cf_nlink);
}
/* populate an inode with info from a cifs_fattr struct */
@@ -124,15 +154,15 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- unsigned long oldtime = cifs_i->time;
cifs_revalidate_cache(inode, fattr);
+ spin_lock(&inode->i_lock);
inode->i_atime = fattr->cf_atime;
inode->i_mtime = fattr->cf_mtime;
inode->i_ctime = fattr->cf_ctime;
inode->i_rdev = fattr->cf_rdev;
- inode->i_nlink = fattr->cf_nlink;
+ cifs_nlink_fattr_to_inode(inode, fattr);
inode->i_uid = fattr->cf_uid;
inode->i_gid = fattr->cf_gid;
@@ -148,17 +178,16 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
else
cifs_i->time = jiffies;
- cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
- oldtime, cifs_i->time);
-
- cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
+ if (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING)
+ set_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
+ else
+ clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
cifs_i->server_eof = fattr->cf_eof;
/*
* Can't safely change the file size here if the client is writing to
* it due to potential races.
*/
- spin_lock(&inode->i_lock);
if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
i_size_write(inode, fattr->cf_eof);
@@ -173,7 +202,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
inode->i_flags |= S_AUTOMOUNT;
- cifs_set_ops(inode);
+ if (inode->i_state & I_NEW)
+ cifs_set_ops(inode);
}
void
@@ -244,19 +274,29 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
/* safest to call it a file if we do not know */
fattr->cf_mode |= S_IFREG;
fattr->cf_dtype = DT_REG;
- cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
+ cifs_dbg(FYI, "unknown type %d\n", le32_to_cpu(info->Type));
break;
}
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
- fattr->cf_uid = cifs_sb->mnt_uid;
- else
- fattr->cf_uid = le64_to_cpu(info->Uid);
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
- fattr->cf_gid = cifs_sb->mnt_gid;
- else
- fattr->cf_gid = le64_to_cpu(info->Gid);
+ fattr->cf_uid = cifs_sb->mnt_uid;
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) {
+ u64 id = le64_to_cpu(info->Uid);
+ if (id < ((uid_t)-1)) {
+ kuid_t uid = make_kuid(&init_user_ns, id);
+ if (uid_valid(uid))
+ fattr->cf_uid = uid;
+ }
+ }
+
+ fattr->cf_gid = cifs_sb->mnt_gid;
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) {
+ u64 id = le64_to_cpu(info->Gid);
+ if (id < ((gid_t)-1)) {
+ kgid_t gid = make_kgid(&init_user_ns, id);
+ if (gid_valid(gid))
+ fattr->cf_gid = gid;
+ }
+ }
fattr->cf_nlink = le64_to_cpu(info->Nlinks);
}
@@ -273,7 +313,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- cFYI(1, "creating fake fattr for DFS referral");
+ cifs_dbg(FYI, "creating fake fattr for DFS referral\n");
memset(fattr, 0, sizeof(*fattr));
fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -286,19 +326,20 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
}
-int cifs_get_file_info_unix(struct file *filp)
+static int
+cifs_get_file_info_unix(struct file *filp)
{
int rc;
- int xid;
+ unsigned int xid;
FILE_UNIX_BASIC_INFO find_data;
struct cifs_fattr fattr;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = filp->private_data;
- struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
- xid = GetXid();
- rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
+ xid = get_xid();
+ rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data);
if (!rc) {
cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
} else if (rc == -EREMOTE) {
@@ -307,22 +348,22 @@ int cifs_get_file_info_unix(struct file *filp)
}
cifs_fattr_to_inode(inode, &fattr);
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *full_path,
- struct super_block *sb, int xid)
+ struct super_block *sb, unsigned int xid)
{
int rc;
FILE_UNIX_BASIC_INFO find_data;
struct cifs_fattr fattr;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- cFYI(1, "Getting info on %s", full_path);
+ cifs_dbg(FYI, "Getting info on %s\n", full_path);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -346,9 +387,10 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* check for Minshall+French symlinks */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
- int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+ int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
+ full_path);
if (tmprc)
- cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+ cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
if (*pinode == NULL) {
@@ -366,17 +408,20 @@ int cifs_get_inode_info_unix(struct inode **pinode,
}
static int
-cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
- struct cifs_sb_info *cifs_sb, int xid)
+cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
+ struct cifs_sb_info *cifs_sb, unsigned int xid)
{
int rc;
int oplock = 0;
- __u16 netfid;
struct tcon_link *tlink;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ struct cifs_io_parms io_parms;
char buf[24];
unsigned int bytes_read;
char *pbuf;
+ int buf_type = CIFS_NO_BUFFER;
pbuf = buf;
@@ -397,58 +442,69 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- int buf_type = CIFS_NO_BUFFER;
- /* Read header */
- rc = CIFSSMBRead(xid, tcon, netfid,
- 24 /* length */, 0 /* offset */,
- &bytes_read, &pbuf, &buf_type);
- if ((rc == 0) && (bytes_read >= 8)) {
- if (memcmp("IntxBLK", pbuf, 8) == 0) {
- cFYI(1, "Block device");
- fattr->cf_mode |= S_IFBLK;
- fattr->cf_dtype = DT_BLK;
- if (bytes_read == 24) {
- /* we have enough to decode dev num */
- __u64 mjr; /* major */
- __u64 mnr; /* minor */
- mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
- mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
- fattr->cf_rdev = MKDEV(mjr, mnr);
- }
- } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
- cFYI(1, "Char device");
- fattr->cf_mode |= S_IFCHR;
- fattr->cf_dtype = DT_CHR;
- if (bytes_read == 24) {
- /* we have enough to decode dev num */
- __u64 mjr; /* major */
- __u64 mnr; /* minor */
- mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
- mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
- fattr->cf_rdev = MKDEV(mjr, mnr);
- }
- } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
- cFYI(1, "Symlink");
- fattr->cf_mode |= S_IFLNK;
- fattr->cf_dtype = DT_LNK;
- } else {
- fattr->cf_mode |= S_IFREG; /* file? */
- fattr->cf_dtype = DT_REG;
- rc = -EOPNOTSUPP;
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_READ;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (rc) {
+ cifs_put_tlink(tlink);
+ return rc;
+ }
+
+ /* Read header */
+ io_parms.netfid = fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = 24;
+
+ rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
+ if ((rc == 0) && (bytes_read >= 8)) {
+ if (memcmp("IntxBLK", pbuf, 8) == 0) {
+ cifs_dbg(FYI, "Block device\n");
+ fattr->cf_mode |= S_IFBLK;
+ fattr->cf_dtype = DT_BLK;
+ if (bytes_read == 24) {
+ /* we have enough to decode dev num */
+ __u64 mjr; /* major */
+ __u64 mnr; /* minor */
+ mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+ mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+ fattr->cf_rdev = MKDEV(mjr, mnr);
+ }
+ } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
+ cifs_dbg(FYI, "Char device\n");
+ fattr->cf_mode |= S_IFCHR;
+ fattr->cf_dtype = DT_CHR;
+ if (bytes_read == 24) {
+ /* we have enough to decode dev num */
+ __u64 mjr; /* major */
+ __u64 mnr; /* minor */
+ mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+ mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+ fattr->cf_rdev = MKDEV(mjr, mnr);
}
+ } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
+ cifs_dbg(FYI, "Symlink\n");
+ fattr->cf_mode |= S_IFLNK;
+ fattr->cf_dtype = DT_LNK;
} else {
- fattr->cf_mode |= S_IFREG; /* then it is a file */
+ fattr->cf_mode |= S_IFREG; /* file? */
fattr->cf_dtype = DT_REG;
- rc = -EOPNOTSUPP; /* or some unknown SFU type */
+ rc = -EOPNOTSUPP;
}
- CIFSSMBClose(xid, tcon, netfid);
+ } else {
+ fattr->cf_mode |= S_IFREG; /* then it is a file */
+ fattr->cf_dtype = DT_REG;
+ rc = -EOPNOTSUPP; /* or some unknown SFU type */
}
+ CIFSSMBClose(xid, tcon, fid.netfid);
cifs_put_tlink(tlink);
return rc;
}
@@ -461,34 +517,39 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
* FIXME: Doesn't this clobber the type bit we got from cifs_sfu_type ?
*/
static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
- struct cifs_sb_info *cifs_sb, int xid)
+ struct cifs_sb_info *cifs_sb, unsigned int xid)
{
#ifdef CONFIG_CIFS_XATTR
ssize_t rc;
char ea_value[4];
__u32 mode;
struct tcon_link *tlink;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
- rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
- ea_value, 4 /* size of buf */, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (tcon->ses->server->ops->query_all_EAs == NULL) {
+ cifs_put_tlink(tlink);
+ return -EOPNOTSUPP;
+ }
+
+ rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path,
+ "SETFILEBITS", ea_value, 4 /* size of buf */,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
cifs_put_tlink(tlink);
if (rc < 0)
return (int)rc;
else if (rc > 3) {
mode = le32_to_cpu(*((__le32 *)ea_value));
fattr->cf_mode &= ~SFBITS_MASK;
- cFYI(1, "special bits 0%o org mode 0%o", mode,
- fattr->cf_mode);
+ cifs_dbg(FYI, "special bits 0%o org mode 0%o\n",
+ mode, fattr->cf_mode);
fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
- cFYI(1, "special mode bits 0%o", mode);
+ cifs_dbg(FYI, "special mode bits 0%o\n", mode);
}
return 0;
@@ -500,9 +561,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
static void
cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
- struct cifs_sb_info *cifs_sb, bool adjust_tz)
+ struct cifs_sb_info *cifs_sb, bool adjust_tz,
+ bool symlink)
{
- struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
memset(fattr, 0, sizeof(*fattr));
fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
@@ -526,9 +588,20 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
fattr->cf_createtime = le64_to_cpu(info->CreationTime);
- if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+ fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+
+ if (symlink) {
+ fattr->cf_mode = S_IFLNK;
+ fattr->cf_dtype = DT_LNK;
+ } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
fattr->cf_dtype = DT_DIR;
+ /*
+ * Server can return wrong NumberOfLinks value for directories
+ * when Unix extensions are disabled - fake it.
+ */
+ if (!tcon->unix_ext)
+ fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
} else {
fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
fattr->cf_dtype = DT_REG;
@@ -536,28 +609,52 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
/* clear write bits if ATTR_READONLY is set */
if (fattr->cf_cifsattrs & ATTR_READONLY)
fattr->cf_mode &= ~(S_IWUGO);
- }
- fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ /*
+ * Don't accept zero nlink from non-unix servers unless
+ * delete is pending. Instead mark it as unknown.
+ */
+ if ((fattr->cf_nlink < 1) && !tcon->unix_ext &&
+ !info->DeletePending) {
+ cifs_dbg(1, "bogus file nlink value %u\n",
+ fattr->cf_nlink);
+ fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+ }
+ }
fattr->cf_uid = cifs_sb->mnt_uid;
fattr->cf_gid = cifs_sb->mnt_gid;
}
-int cifs_get_file_info(struct file *filp)
+static int
+cifs_get_file_info(struct file *filp)
{
int rc;
- int xid;
+ unsigned int xid;
FILE_ALL_INFO find_data;
struct cifs_fattr fattr;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = filp->private_data;
- struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
-
- xid = GetXid();
- rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
- if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ if (!server->ops->query_file_info)
+ return -ENOSYS;
+
+ xid = get_xid();
+ rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
+ switch (rc) {
+ case 0:
+ cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false,
+ false);
+ break;
+ case -EREMOTE:
+ cifs_create_dfs_fattr(&fattr, inode->i_sb);
+ rc = 0;
+ break;
+ case -EOPNOTSUPP:
+ case -EINVAL:
/*
* FIXME: legacy server -- fall back to path-based call?
* for now, just skip revalidating and mark inode for
@@ -565,87 +662,109 @@ int cifs_get_file_info(struct file *filp)
*/
rc = 0;
CIFS_I(inode)->time = 0;
+ default:
goto cgfi_exit;
- } else if (rc == -EREMOTE) {
- cifs_create_dfs_fattr(&fattr, inode->i_sb);
- rc = 0;
- } else if (rc)
- goto cgfi_exit;
+ }
/*
* don't bother with SFU junk here -- just mark inode as needing
* revalidation.
*/
- cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
cifs_fattr_to_inode(inode, &fattr);
cgfi_exit:
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
-int cifs_get_inode_info(struct inode **pinode,
- const unsigned char *full_path, FILE_ALL_INFO *pfindData,
- struct super_block *sb, int xid, const __u16 *pfid)
+int
+cifs_get_inode_info(struct inode **inode, const char *full_path,
+ FILE_ALL_INFO *data, struct super_block *sb, int xid,
+ const struct cifs_fid *fid)
{
- int rc = 0, tmprc;
- struct cifsTconInfo *pTcon;
+ bool validinum = false;
+ __u16 srchflgs;
+ int rc = 0, tmprc = ENOSYS;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
char *buf = NULL;
- bool adjustTZ = false;
+ bool adjust_tz = false;
struct cifs_fattr fattr;
+ struct cifs_search_info *srchinf = NULL;
+ bool symlink = false;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
+ tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
- cFYI(1, "Getting info on %s", full_path);
+ cifs_dbg(FYI, "Getting info on %s\n", full_path);
- if ((pfindData == NULL) && (*pinode != NULL)) {
- if (CIFS_I(*pinode)->clientCanCacheRead) {
- cFYI(1, "No need to revalidate cached inode sizes");
+ if ((data == NULL) && (*inode != NULL)) {
+ if (CIFS_CACHE_READ(CIFS_I(*inode))) {
+ cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
goto cgii_exit;
}
}
- /* if file info not passed in then get it from server */
- if (pfindData == NULL) {
+ /* if inode info is not passed, get it from server */
+ if (data == NULL) {
+ if (!server->ops->query_path_info) {
+ rc = -ENOSYS;
+ goto cgii_exit;
+ }
buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
if (buf == NULL) {
rc = -ENOMEM;
goto cgii_exit;
}
- pfindData = (FILE_ALL_INFO *)buf;
-
- /* could do find first instead but this returns more info */
- rc = CIFSSMBQPathInfo(xid, pTcon, full_path, pfindData,
- 0 /* not legacy */,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- /* BB optimize code so we do not make the above call
- when server claims no NT SMB support and the above call
- failed at least once - set flag in tcon or mount */
- if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
- rc = SMBQueryInformation(xid, pTcon, full_path,
- pfindData, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- adjustTZ = true;
- }
+ data = (FILE_ALL_INFO *)buf;
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path,
+ data, &adjust_tz, &symlink);
}
if (!rc) {
- cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *) pfindData,
- cifs_sb, adjustTZ);
+ cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz,
+ symlink);
} else if (rc == -EREMOTE) {
cifs_create_dfs_fattr(&fattr, sb);
rc = 0;
- } else {
+ } else if (rc == -EACCES && backup_cred(cifs_sb)) {
+ srchinf = kzalloc(sizeof(struct cifs_search_info),
+ GFP_KERNEL);
+ if (srchinf == NULL) {
+ rc = -ENOMEM;
+ goto cgii_exit;
+ }
+
+ srchinf->endOfSearch = false;
+ srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
+
+ srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
+ CIFS_SEARCH_CLOSE_AT_END |
+ CIFS_SEARCH_BACKUP_SEARCH;
+
+ rc = CIFSFindFirst(xid, tcon, full_path,
+ cifs_sb, NULL, srchflgs, srchinf, false);
+ if (!rc) {
+ data =
+ (FILE_ALL_INFO *)srchinf->srch_entries_start;
+
+ cifs_dir_info_to_fattr(&fattr,
+ (FILE_DIRECTORY_INFO *)data, cifs_sb);
+ fattr.cf_uniqueid = le64_to_cpu(
+ ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
+ validinum = true;
+
+ cifs_buf_release(srchinf->ntwrk_buf_start);
+ }
+ kfree(srchinf);
+ } else
goto cgii_exit;
- }
/*
* If an inode wasn't passed in, then get the inode number
@@ -653,54 +772,41 @@ int cifs_get_inode_info(struct inode **pinode,
* Is an i_ino of zero legal? Can we use that to check if the server
* supports returning inode numbers? Are there other sanity checks we
* can use to ensure that the server is really filling in that field?
- *
- * We can not use the IndexNumber field by default from Windows or
- * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA
- * CIFS spec claims that this value is unique within the scope of a
- * share, and the windows docs hint that it's actually unique
- * per-machine.
- *
- * There may be higher info levels that work but are there Windows
- * server or network appliances for which IndexNumber field is not
- * guaranteed unique?
*/
- if (*pinode == NULL) {
+ if (*inode == NULL) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
- int rc1 = 0;
-
- rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
- full_path, &fattr.cf_uniqueid,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc1 || !fattr.cf_uniqueid) {
- cFYI(1, "GetSrvInodeNum rc %d", rc1);
- fattr.cf_uniqueid = iunique(sb, ROOT_I);
- cifs_autodisable_serverino(cifs_sb);
+ if (validinum == false) {
+ if (server->ops->get_srv_inum)
+ tmprc = server->ops->get_srv_inum(xid,
+ tcon, cifs_sb, full_path,
+ &fattr.cf_uniqueid, data);
+ if (tmprc) {
+ cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
+ tmprc);
+ fattr.cf_uniqueid = iunique(sb, ROOT_I);
+ cifs_autodisable_serverino(cifs_sb);
+ }
}
- } else {
+ } else
fattr.cf_uniqueid = iunique(sb, ROOT_I);
- }
- } else {
- fattr.cf_uniqueid = CIFS_I(*pinode)->uniqueid;
- }
+ } else
+ fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
/* query for SFU type info if supported and needed */
if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
if (tmprc)
- cFYI(1, "cifs_sfu_type failed: %d", tmprc);
+ cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc);
}
#ifdef CONFIG_CIFS_ACL
/* fill in 0777 bits from ACL */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
- pfid);
+ rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid);
if (rc) {
- cFYI(1, "%s: Getting ACL failed with error: %d",
- __func__, rc);
+ cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
+ __func__, rc);
goto cgii_exit;
}
}
@@ -712,17 +818,18 @@ int cifs_get_inode_info(struct inode **pinode,
/* check for Minshall+French symlinks */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
- tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+ tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
+ full_path);
if (tmprc)
- cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+ cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
- if (!*pinode) {
- *pinode = cifs_iget(sb, &fattr);
- if (!*pinode)
+ if (!*inode) {
+ *inode = cifs_iget(sb, &fattr);
+ if (!*inode)
rc = -ENOMEM;
} else {
- cifs_fattr_to_inode(*pinode, &fattr);
+ cifs_fattr_to_inode(*inode, &fattr);
}
cgii_exit:
@@ -735,48 +842,6 @@ static const struct inode_operations cifs_ipc_inode_ops = {
.lookup = cifs_lookup,
};
-char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
- struct cifsTconInfo *tcon)
-{
- int pplen = cifs_sb->prepathlen;
- int dfsplen;
- char *full_path = NULL;
-
- /* if no prefix path, simply set path to the root of share to "" */
- if (pplen == 0) {
- full_path = kmalloc(1, GFP_KERNEL);
- if (full_path)
- full_path[0] = 0;
- return full_path;
- }
-
- if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
- dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
- else
- dfsplen = 0;
-
- full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
- if (full_path == NULL)
- return full_path;
-
- if (dfsplen) {
- strncpy(full_path, tcon->treeName, dfsplen);
- /* switch slash direction in prepath depending on whether
- * windows or posix style path names
- */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
- int i;
- for (i = 0; i < dfsplen; i++) {
- if (full_path[i] == '\\')
- full_path[i] = '/';
- }
- }
- }
- strncpy(full_path + dfsplen, cifs_sb->prepath, pplen);
- full_path[dfsplen + pplen] = 0; /* add trailing null */
- return full_path;
-}
-
static int
cifs_find_inode(struct inode *inode, void *opaque)
{
@@ -795,7 +860,7 @@ cifs_find_inode(struct inode *inode, void *opaque)
return 0;
/* if it's not a directory or has no dentries, then flag it */
- if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry))
+ if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry))
fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
return 1;
@@ -822,7 +887,7 @@ inode_has_hashed_dentries(struct inode *inode)
struct dentry *dentry;
spin_lock(&inode->i_lock);
- list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
spin_unlock(&inode->i_lock);
return true;
@@ -840,7 +905,7 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
struct inode *inode;
retry_iget5_locked:
- cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
+ cifs_dbg(FYI, "looking for uniqueid=%llu\n", fattr->cf_uniqueid);
/* hash down to 32-bits on 32-bit arch */
hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
@@ -878,25 +943,19 @@ retry_iget5_locked:
}
/* gets root inode */
-struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
+struct inode *cifs_root_iget(struct super_block *sb)
{
- int xid;
+ unsigned int xid;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct inode *inode = NULL;
long rc;
- char *full_path;
- struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
-
- full_path = cifs_build_path_to_root(cifs_sb, tcon);
- if (full_path == NULL)
- return ERR_PTR(-ENOMEM);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
- xid = GetXid();
+ xid = get_xid();
if (tcon->unix_ext)
- rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+ rc = cifs_get_inode_info_unix(&inode, "", sb, xid);
else
- rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
- xid, NULL);
+ rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL);
if (!inode) {
inode = ERR_PTR(rc);
@@ -909,46 +968,44 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
#endif
if (rc && tcon->ipc) {
- cFYI(1, "ipc connection - fake read inode");
+ cifs_dbg(FYI, "ipc connection - fake read inode\n");
+ spin_lock(&inode->i_lock);
inode->i_mode |= S_IFDIR;
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_op = &cifs_ipc_inode_ops;
inode->i_fop = &simple_dir_operations;
inode->i_uid = cifs_sb->mnt_uid;
inode->i_gid = cifs_sb->mnt_gid;
+ spin_unlock(&inode->i_lock);
} else if (rc) {
iget_failed(inode);
inode = ERR_PTR(rc);
}
out:
- kfree(full_path);
- /* can not call macro FreeXid here since in a void func
+ /* can not call macro free_xid here since in a void func
* TODO: This is no longer true
*/
- _FreeXid(xid);
+ _free_xid(xid);
return inode;
}
-static int
-cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
- char *full_path, __u32 dosattr)
+int
+cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid,
+ char *full_path, __u32 dosattr)
{
- int rc;
- int oplock = 0;
- __u16 netfid;
- __u32 netpid;
bool set_time = false;
- struct cifsFileInfo *open_file;
- struct cifsInodeInfo *cifsInode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct tcon_link *tlink = NULL;
- struct cifsTconInfo *pTcon;
+ struct TCP_Server_Info *server;
FILE_BASIC_INFO info_buf;
if (attrs == NULL)
return -EINVAL;
+ server = cifs_sb_master_tcon(cifs_sb)->ses->server;
+ if (!server->ops->set_file_info)
+ return -ENOSYS;
+
if (attrs->ia_valid & ATTR_ATIME) {
set_time = true;
info_buf.LastAccessTime =
@@ -970,7 +1027,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
* server times.
*/
if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
- cFYI(1, "CIFS - CTIME changed");
+ cifs_dbg(FYI, "CIFS - CTIME changed\n");
info_buf.ChangeTime =
cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
} else
@@ -979,89 +1036,27 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
info_buf.CreationTime = 0; /* don't change */
info_buf.Attributes = cpu_to_le32(dosattr);
- /*
- * If the file is already open for write, just use that fileid
- */
- open_file = find_writable_file(cifsInode, true);
- if (open_file) {
- netfid = open_file->netfid;
- netpid = open_file->pid;
- pTcon = tlink_tcon(open_file->tlink);
- goto set_via_filehandle;
- }
-
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink)) {
- rc = PTR_ERR(tlink);
- tlink = NULL;
- goto out;
- }
- pTcon = tlink_tcon(tlink);
-
- /*
- * NT4 apparently returns success on this call, but it doesn't
- * really work.
- */
- if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
- rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
- &info_buf, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- cifsInode->cifsAttrs = dosattr;
- goto out;
- } else if (rc != -EOPNOTSUPP && rc != -EINVAL)
- goto out;
- }
-
- cFYI(1, "calling SetFileInfo since SetPathInfo for "
- "times not supported by this server");
- rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
- SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
- CREATE_NOT_DIR, &netfid, &oplock,
- NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
-
- if (rc != 0) {
- if (rc == -EIO)
- rc = -EINVAL;
- goto out;
- }
-
- netpid = current->tgid;
-
-set_via_filehandle:
- rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
- if (!rc)
- cifsInode->cifsAttrs = dosattr;
-
- if (open_file == NULL)
- CIFSSMBClose(xid, pTcon, netfid);
- else
- cifsFileInfo_put(open_file);
-out:
- if (tlink != NULL)
- cifs_put_tlink(tlink);
- return rc;
+ return server->ops->set_file_info(inode, full_path, &info_buf, xid);
}
/*
- * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
+ * Open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
* and rename it to a random name that hopefully won't conflict with
* anything else.
*/
-static int
-cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
+int
+cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
+ const unsigned int xid)
{
int oplock = 0;
int rc;
- __u16 netfid;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
struct inode *inode = dentry->d_inode;
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
__u32 dosattr, origattr;
FILE_BASIC_INFO *info_buf = NULL;
@@ -1070,10 +1065,25 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
- rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
- DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
- &netfid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ /*
+ * We cannot rename the file if the server doesn't support
+ * CAP_INFOLEVEL_PASSTHRU
+ */
+ if (!(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc != 0)
goto out;
@@ -1094,28 +1104,29 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
goto out_close;
}
info_buf->Attributes = cpu_to_le32(dosattr);
- rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid,
+ rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
current->tgid);
/* although we would like to mark the file hidden
if that fails we will still try to rename it */
- if (rc != 0)
+ if (!rc)
cifsInode->cifsAttrs = dosattr;
else
dosattr = origattr; /* since not able to change them */
}
/* rename the file */
- rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
+ rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL,
+ cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc != 0) {
- rc = -ETXTBSY;
+ rc = -EBUSY;
goto undo_setattr;
}
/* try to set DELETE_ON_CLOSE */
- if (!cifsInode->delete_pending) {
- rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid,
+ if (!test_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags)) {
+ rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid,
current->tgid);
/*
* some samba versions return -ENOENT when we try to set the
@@ -1128,14 +1139,14 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
if (rc == -ENOENT)
rc = 0;
else if (rc != 0) {
- rc = -ETXTBSY;
+ rc = -EBUSY;
goto undo_rename;
}
- cifsInode->delete_pending = true;
+ set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags);
}
out_close:
- CIFSSMBClose(xid, tcon, netfid);
+ CIFSSMBClose(xid, tcon, fid.netfid);
out:
kfree(info_buf);
cifs_put_tlink(tlink);
@@ -1147,13 +1158,13 @@ out:
* them anyway.
*/
undo_rename:
- CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name,
+ CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
undo_setattr:
if (dosattr != origattr) {
info_buf->Attributes = cpu_to_le32(origattr);
- if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid,
+ if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
current->tgid))
cifsInode->cifsAttrs = origattr;
}
@@ -1161,6 +1172,15 @@ undo_setattr:
goto out_close;
}
+/* copied from fs/nfs/dir.c with small changes */
+static void
+cifs_drop_nlink(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&inode->i_lock);
+}
/*
* If dentry->d_inode is null (usually meaning the cached dentry
@@ -1172,25 +1192,27 @@ undo_setattr:
int cifs_unlink(struct inode *dir, struct dentry *dentry)
{
int rc = 0;
- int xid;
+ unsigned int xid;
char *full_path = NULL;
struct inode *inode = dentry->d_inode;
struct cifsInodeInfo *cifs_inode;
struct super_block *sb = dir->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct tcon_link *tlink;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
struct iattr *attrs = NULL;
__u32 dosattr = 0, origattr = 0;
- cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
+ cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
- xid = GetXid();
+ xid = get_xid();
/* Unlink can be called from rename so we can not take the
* sb->s_vfs_rename_mutex here */
@@ -1200,31 +1222,37 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
goto unlink_out;
}
- if ((tcon->ses->capabilities & CAP_UNIX) &&
- (CIFS_UNIX_POSIX_PATH_OPS_CAP &
- le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+ if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+ le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = CIFSPOSIXDelFile(xid, tcon, full_path,
SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- cFYI(1, "posix del rc %d", rc);
+ cifs_dbg(FYI, "posix del rc %d\n", rc);
if ((rc == 0) || (rc == -ENOENT))
goto psx_del_no_retry;
}
retry_std_delete:
- rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (!server->ops->unlink) {
+ rc = -ENOSYS;
+ goto psx_del_no_retry;
+ }
+
+ rc = server->ops->unlink(xid, tcon, full_path, cifs_sb);
psx_del_no_retry:
if (!rc) {
if (inode)
- drop_nlink(inode);
+ cifs_drop_nlink(inode);
} else if (rc == -ENOENT) {
d_drop(dentry);
- } else if (rc == -ETXTBSY) {
- rc = cifs_rename_pending_delete(full_path, dentry, xid);
- if (rc == 0)
- drop_nlink(inode);
+ } else if (rc == -EBUSY) {
+ if (server->ops->rename_pending_delete) {
+ rc = server->ops->rename_pending_delete(full_path,
+ dentry, xid);
+ if (rc == 0)
+ cifs_drop_nlink(inode);
+ }
} else if ((rc == -EACCES) && (dosattr == 0) && inode) {
attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
if (attrs == NULL) {
@@ -1266,31 +1294,168 @@ out_reval:
unlink_out:
kfree(full_path);
kfree(attrs);
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
return rc;
}
-int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
+static int
+cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
+ const char *full_path, struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon, const unsigned int xid)
{
- int rc = 0, tmprc;
- int xid;
- struct cifs_sb_info *cifs_sb;
- struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
- char *full_path = NULL;
+ int rc = 0;
+ struct inode *inode = NULL;
+
+ if (tcon->unix_ext)
+ rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb,
+ xid);
+ else
+ rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb,
+ xid, NULL);
+
+ if (rc)
+ return rc;
+
+ /*
+ * setting nlink not necessary except in cases where we failed to get it
+ * from the server or was set bogus. Also, since this is a brand new
+ * inode, no need to grab the i_lock before setting the i_nlink.
+ */
+ if (inode->i_nlink < 2)
+ set_nlink(inode, 2);
+ mode &= ~current_umask();
+ /* must turn on setgid bit if parent dir has it */
+ if (parent->i_mode & S_ISGID)
+ mode |= S_ISGID;
+
+ if (tcon->unix_ext) {
+ struct cifs_unix_set_info_args args = {
+ .mode = mode,
+ .ctime = NO_CHANGE_64,
+ .atime = NO_CHANGE_64,
+ .mtime = NO_CHANGE_64,
+ .device = 0,
+ };
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+ args.uid = current_fsuid();
+ if (parent->i_mode & S_ISGID)
+ args.gid = parent->i_gid;
+ else
+ args.gid = current_fsgid();
+ } else {
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
+ }
+ CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ } else {
+ struct TCP_Server_Info *server = tcon->ses->server;
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
+ (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo)
+ server->ops->mkdir_setinfo(inode, full_path, cifs_sb,
+ tcon, xid);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+ inode->i_mode = (mode | S_IFDIR);
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+ inode->i_uid = current_fsuid();
+ if (inode->i_mode & S_ISGID)
+ inode->i_gid = parent->i_gid;
+ else
+ inode->i_gid = current_fsgid();
+ }
+ }
+ d_instantiate(dentry, inode);
+ return rc;
+}
+
+static int
+cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
+ const char *full_path, struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon, const unsigned int xid)
+{
+ int rc = 0;
+ u32 oplock = 0;
+ FILE_UNIX_BASIC_INFO *info = NULL;
struct inode *newinode = NULL;
struct cifs_fattr fattr;
- cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
+ info = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+ if (info == NULL) {
+ rc = -ENOMEM;
+ goto posix_mkdir_out;
+ }
+
+ mode &= ~current_umask();
+ rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode,
+ NULL /* netfid */, info, &oplock, full_path,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (rc == -EOPNOTSUPP)
+ goto posix_mkdir_out;
+ else if (rc) {
+ cifs_dbg(FYI, "posix mkdir returned 0x%x\n", rc);
+ d_drop(dentry);
+ goto posix_mkdir_out;
+ }
+
+ if (info->Type == cpu_to_le32(-1))
+ /* no return info, go query for it */
+ goto posix_mkdir_get_info;
+ /*
+ * BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if
+ * need to set uid/gid.
+ */
+
+ cifs_unix_basic_to_fattr(&fattr, info, cifs_sb);
+ cifs_fill_uniqueid(inode->i_sb, &fattr);
+ newinode = cifs_iget(inode->i_sb, &fattr);
+ if (!newinode)
+ goto posix_mkdir_get_info;
+
+ d_instantiate(dentry, newinode);
+
+#ifdef CONFIG_CIFS_DEBUG2
+ cifs_dbg(FYI, "instantiated dentry %p %s to inode %p\n",
+ dentry, dentry->d_name.name, newinode);
+
+ if (newinode->i_nlink != 2)
+ cifs_dbg(FYI, "unexpected number of links %d\n",
+ newinode->i_nlink);
+#endif
+
+posix_mkdir_out:
+ kfree(info);
+ return rc;
+posix_mkdir_get_info:
+ rc = cifs_mkdir_qinfo(inode, dentry, mode, full_path, cifs_sb, tcon,
+ xid);
+ goto posix_mkdir_out;
+}
+
+int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
+{
+ int rc = 0;
+ unsigned int xid;
+ struct cifs_sb_info *cifs_sb;
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ char *full_path;
+
+ cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n",
+ mode, inode);
cifs_sb = CIFS_SB(inode->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
+ tcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
@@ -1298,153 +1463,39 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
goto mkdir_out;
}
- if ((pTcon->ses->capabilities & CAP_UNIX) &&
- (CIFS_UNIX_POSIX_PATH_OPS_CAP &
- le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
- u32 oplock = 0;
- FILE_UNIX_BASIC_INFO *pInfo =
- kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
- if (pInfo == NULL) {
- rc = -ENOMEM;
+ if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+ le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+ rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb,
+ tcon, xid);
+ if (rc != -EOPNOTSUPP)
goto mkdir_out;
- }
-
- mode &= ~current_umask();
- rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
- mode, NULL /* netfid */, pInfo, &oplock,
- full_path, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == -EOPNOTSUPP) {
- kfree(pInfo);
- goto mkdir_retry_old;
- } else if (rc) {
- cFYI(1, "posix mkdir returned 0x%x", rc);
- d_drop(direntry);
- } else {
- if (pInfo->Type == cpu_to_le32(-1)) {
- /* no return info, go query for it */
- kfree(pInfo);
- goto mkdir_get_info;
- }
-/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
- to set uid/gid */
- inc_nlink(inode);
-
- cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
- cifs_fill_uniqueid(inode->i_sb, &fattr);
- newinode = cifs_iget(inode->i_sb, &fattr);
- if (!newinode) {
- kfree(pInfo);
- goto mkdir_get_info;
- }
+ }
- d_instantiate(direntry, newinode);
+ server = tcon->ses->server;
-#ifdef CONFIG_CIFS_DEBUG2
- cFYI(1, "instantiated dentry %p %s to inode %p",
- direntry, direntry->d_name.name, newinode);
-
- if (newinode->i_nlink != 2)
- cFYI(1, "unexpected number of links %d",
- newinode->i_nlink);
-#endif
- }
- kfree(pInfo);
+ if (!server->ops->mkdir) {
+ rc = -ENOSYS;
goto mkdir_out;
}
-mkdir_retry_old:
+
/* BB add setting the equivalent of mode via CreateX w/ACLs */
- rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb);
if (rc) {
- cFYI(1, "cifs_mkdir returned 0x%x", rc);
+ cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc);
d_drop(direntry);
- } else {
-mkdir_get_info:
- inc_nlink(inode);
- if (pTcon->unix_ext)
- rc = cifs_get_inode_info_unix(&newinode, full_path,
- inode->i_sb, xid);
- else
- rc = cifs_get_inode_info(&newinode, full_path, NULL,
- inode->i_sb, xid, NULL);
-
- d_instantiate(direntry, newinode);
- /* setting nlink not necessary except in cases where we
- * failed to get it from the server or was set bogus */
- if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
- direntry->d_inode->i_nlink = 2;
-
- mode &= ~current_umask();
- /* must turn on setgid bit if parent dir has it */
- if (inode->i_mode & S_ISGID)
- mode |= S_ISGID;
-
- if (pTcon->unix_ext) {
- struct cifs_unix_set_info_args args = {
- .mode = mode,
- .ctime = NO_CHANGE_64,
- .atime = NO_CHANGE_64,
- .mtime = NO_CHANGE_64,
- .device = 0,
- };
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = (__u64)current_fsuid();
- if (inode->i_mode & S_ISGID)
- args.gid = (__u64)inode->i_gid;
- else
- args.gid = (__u64)current_fsgid();
- } else {
- args.uid = NO_CHANGE_64;
- args.gid = NO_CHANGE_64;
- }
- CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- } else {
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
- (mode & S_IWUGO) == 0) {
- FILE_BASIC_INFO pInfo;
- struct cifsInodeInfo *cifsInode;
- u32 dosattrs;
-
- memset(&pInfo, 0, sizeof(pInfo));
- cifsInode = CIFS_I(newinode);
- dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
- pInfo.Attributes = cpu_to_le32(dosattrs);
- tmprc = CIFSSMBSetPathInfo(xid, pTcon,
- full_path, &pInfo,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (tmprc == 0)
- cifsInode->cifsAttrs = dosattrs;
- }
- if (direntry->d_inode) {
- if (cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_DYNPERM)
- direntry->d_inode->i_mode =
- (mode | S_IFDIR);
-
- if (cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_SET_UID) {
- direntry->d_inode->i_uid =
- current_fsuid();
- if (inode->i_mode & S_ISGID)
- direntry->d_inode->i_gid =
- inode->i_gid;
- else
- direntry->d_inode->i_gid =
- current_fsgid();
- }
- }
- }
+ goto mkdir_out;
}
+
+ rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon,
+ xid);
mkdir_out:
+ /*
+ * Force revalidate to get parent dir info when needed since cached
+ * attributes are invalid now.
+ */
+ CIFS_I(inode)->time = 0;
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
return rc;
}
@@ -1452,16 +1503,17 @@ mkdir_out:
int cifs_rmdir(struct inode *inode, struct dentry *direntry)
{
int rc = 0;
- int xid;
+ unsigned int xid;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
char *full_path = NULL;
struct cifsInodeInfo *cifsInode;
- cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
+ cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode);
- xid = GetXid();
+ xid = get_xid();
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
@@ -1475,14 +1527,19 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
rc = PTR_ERR(tlink);
goto rmdir_exit;
}
- pTcon = tlink_tcon(tlink);
+ tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
+
+ if (!server->ops->rmdir) {
+ rc = -ENOSYS;
+ cifs_put_tlink(tlink);
+ goto rmdir_exit;
+ }
- rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb);
cifs_put_tlink(tlink);
if (!rc) {
- drop_nlink(inode);
spin_lock(&direntry->d_inode->i_lock);
i_size_write(direntry->d_inode, 0);
clear_nlink(direntry->d_inode);
@@ -1490,84 +1547,98 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
}
cifsInode = CIFS_I(direntry->d_inode);
- cifsInode->time = 0; /* force revalidate to go get info when
- needed */
+ /* force revalidate to go get info when needed */
+ cifsInode->time = 0;
cifsInode = CIFS_I(inode);
- cifsInode->time = 0; /* force revalidate to get parent dir info
- since cached search results now invalid */
+ /*
+ * Force revalidate to get parent dir info when needed since cached
+ * attributes are invalid now.
+ */
+ cifsInode->time = 0;
direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
current_fs_time(inode->i_sb);
rmdir_exit:
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
static int
-cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
- struct dentry *to_dentry, const char *toPath)
+cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
+ const char *from_path, struct dentry *to_dentry,
+ const char *to_path)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
- __u16 srcfid;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
int oplock, rc;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
+ tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
+
+ if (!server->ops->rename)
+ return -ENOSYS;
/* try path-based rename first */
- rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ rc = server->ops->rename(xid, tcon, from_path, to_path, cifs_sb);
/*
- * don't bother with rename by filehandle unless file is busy and
- * source Note that cross directory moves do not work with
+ * Don't bother with rename by filehandle unless file is busy and
+ * source. Note that cross directory moves do not work with
* rename by filehandle to various Windows servers.
*/
- if (rc == 0 || rc != -ETXTBSY)
+ if (rc == 0 || rc != -EBUSY)
goto do_rename_exit;
/* open-file renames don't work across directories */
if (to_dentry->d_parent != from_dentry->d_parent)
goto do_rename_exit;
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
/* open the file to be renamed -- we need DELETE perms */
- rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
- CREATE_NOT_DIR, &srcfid, &oplock, NULL,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
-
+ oparms.desired_access = DELETE;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = from_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc == 0) {
- rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid,
+ rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid,
(const char *) to_dentry->d_name.name,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
-
- CIFSSMBClose(xid, pTcon, srcfid);
+ CIFSSMBClose(xid, tcon, fid.netfid);
}
do_rename_exit:
cifs_put_tlink(tlink);
return rc;
}
-int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
- struct inode *target_dir, struct dentry *target_dentry)
+int
+cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
+ struct inode *target_dir, struct dentry *target_dentry)
{
- char *fromName = NULL;
- char *toName = NULL;
+ char *from_name = NULL;
+ char *to_name = NULL;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
FILE_UNIX_BASIC_INFO *info_buf_target;
- int xid, rc, tmprc;
+ unsigned int xid;
+ int rc, tmprc;
cifs_sb = CIFS_SB(source_dir->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
@@ -1575,31 +1646,31 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
/*
* we already have the rename sem so we do not need to
* grab it again here to protect the path integrity
*/
- fromName = build_path_from_dentry(source_dentry);
- if (fromName == NULL) {
+ from_name = build_path_from_dentry(source_dentry);
+ if (from_name == NULL) {
rc = -ENOMEM;
goto cifs_rename_exit;
}
- toName = build_path_from_dentry(target_dentry);
- if (toName == NULL) {
+ to_name = build_path_from_dentry(target_dentry);
+ if (to_name == NULL) {
rc = -ENOMEM;
goto cifs_rename_exit;
}
- rc = cifs_do_rename(xid, source_dentry, fromName,
- target_dentry, toName);
+ rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
+ to_name);
if (rc == -EEXIST && tcon->unix_ext) {
/*
- * Are src and dst hardlinks of same inode? We can
- * only tell with unix extensions enabled
+ * Are src and dst hardlinks of same inode? We can only tell
+ * with unix extensions enabled.
*/
info_buf_source =
kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
@@ -1610,19 +1681,19 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
}
info_buf_target = info_buf_source + 1;
- tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
- info_buf_source,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ tmprc = CIFSSMBUnixQPathInfo(xid, tcon, from_name,
+ info_buf_source,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
if (tmprc != 0)
goto unlink_target;
- tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName,
- info_buf_target,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ tmprc = CIFSSMBUnixQPathInfo(xid, tcon, to_name,
+ info_buf_target,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
if (tmprc == 0 && (info_buf_source->UniqueId ==
info_buf_target->UniqueId)) {
@@ -1630,8 +1701,11 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
rc = 0;
goto cifs_rename_exit;
}
- } /* else ... BB we could add the same check for Windows by
- checking the UniqueId via FILE_INTERNAL_INFO */
+ }
+ /*
+ * else ... BB we could add the same check for Windows by
+ * checking the UniqueId via FILE_INTERNAL_INFO
+ */
unlink_target:
/* Try unlinking the target dentry if it's not negative */
@@ -1639,16 +1713,15 @@ unlink_target:
tmprc = cifs_unlink(target_dir, target_dentry);
if (tmprc)
goto cifs_rename_exit;
-
- rc = cifs_do_rename(xid, source_dentry, fromName,
- target_dentry, toName);
+ rc = cifs_do_rename(xid, source_dentry, from_name,
+ target_dentry, to_name);
}
cifs_rename_exit:
kfree(info_buf_source);
- kfree(fromName);
- kfree(toName);
- FreeXid(xid);
+ kfree(from_name);
+ kfree(to_name);
+ free_xid(xid);
cifs_put_tlink(tlink);
return rc;
}
@@ -1659,7 +1732,7 @@ cifs_inode_needs_reval(struct inode *inode)
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- if (cifs_i->clientCanCacheRead)
+ if (CIFS_CACHE_READ(cifs_i))
return false;
if (!lookupCacheEnabled)
@@ -1668,6 +1741,9 @@ cifs_inode_needs_reval(struct inode *inode)
if (cifs_i->time == 0)
return true;
+ if (!cifs_sb->actimeo)
+ return true;
+
if (!time_in_range(jiffies, cifs_i->time,
cifs_i->time + cifs_sb->actimeo))
return true;
@@ -1683,71 +1759,109 @@ cifs_inode_needs_reval(struct inode *inode)
/*
* Zap the cache. Called when invalid_mapping flag is set.
*/
-void
+int
cifs_invalidate_mapping(struct inode *inode)
{
- int rc;
- struct cifsInodeInfo *cifs_i = CIFS_I(inode);
-
- cifs_i->invalid_mapping = false;
+ int rc = 0;
- /* write back any cached data */
if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
- rc = filemap_write_and_wait(inode->i_mapping);
- mapping_set_error(inode->i_mapping, rc);
+ rc = invalidate_inode_pages2(inode->i_mapping);
+ if (rc)
+ cifs_dbg(VFS, "%s: could not invalidate inode %p\n",
+ __func__, inode);
}
- invalidate_remote_inode(inode);
+
cifs_fscache_reset_inode_cookie(inode);
+ return rc;
}
-int cifs_revalidate_file(struct file *filp)
+/**
+ * cifs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+static int
+cifs_wait_bit_killable(void *word)
+{
+ if (fatal_signal_pending(current))
+ return -ERESTARTSYS;
+ freezable_schedule_unsafe();
+ return 0;
+}
+
+int
+cifs_revalidate_mapping(struct inode *inode)
+{
+ int rc;
+ unsigned long *flags = &CIFS_I(inode)->flags;
+
+ rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
+ TASK_KILLABLE);
+ if (rc)
+ return rc;
+
+ if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) {
+ rc = cifs_invalidate_mapping(inode);
+ if (rc)
+ set_bit(CIFS_INO_INVALID_MAPPING, flags);
+ }
+
+ clear_bit_unlock(CIFS_INO_LOCK, flags);
+ smp_mb__after_atomic();
+ wake_up_bit(flags, CIFS_INO_LOCK);
+
+ return rc;
+}
+
+int
+cifs_zap_mapping(struct inode *inode)
+{
+ set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
+ return cifs_revalidate_mapping(inode);
+}
+
+int cifs_revalidate_file_attr(struct file *filp)
{
int rc = 0;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
if (!cifs_inode_needs_reval(inode))
- goto check_inval;
+ return rc;
if (tlink_tcon(cfile->tlink)->unix_ext)
rc = cifs_get_file_info_unix(filp);
else
rc = cifs_get_file_info(filp);
-check_inval:
- if (CIFS_I(inode)->invalid_mapping)
- cifs_invalidate_mapping(inode);
-
return rc;
}
-/* revalidate a dentry's inode attributes */
-int cifs_revalidate_dentry(struct dentry *dentry)
+int cifs_revalidate_dentry_attr(struct dentry *dentry)
{
- int xid;
+ unsigned int xid;
int rc = 0;
- char *full_path = NULL;
struct inode *inode = dentry->d_inode;
struct super_block *sb = dentry->d_sb;
+ char *full_path = NULL;
if (inode == NULL)
return -ENOENT;
- xid = GetXid();
-
if (!cifs_inode_needs_reval(inode))
- goto check_inval;
+ return rc;
+
+ xid = get_xid();
/* can not safely grab the rename sem here if rename calls revalidate
since that would deadlock */
full_path = build_path_from_dentry(dentry);
if (full_path == NULL) {
rc = -ENOMEM;
- goto check_inval;
+ goto out;
}
- cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
- "jiffies %ld", full_path, inode, inode->i_count.counter,
+ cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
+ full_path, inode, inode->i_count.counter,
dentry, dentry->d_time, jiffies);
if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
@@ -1756,41 +1870,80 @@ int cifs_revalidate_dentry(struct dentry *dentry)
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
xid, NULL);
-check_inval:
- if (CIFS_I(inode)->invalid_mapping)
- cifs_invalidate_mapping(inode);
-
+out:
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
+int cifs_revalidate_file(struct file *filp)
+{
+ int rc;
+ struct inode *inode = file_inode(filp);
+
+ rc = cifs_revalidate_file_attr(filp);
+ if (rc)
+ return rc;
+
+ return cifs_revalidate_mapping(inode);
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+ int rc;
+ struct inode *inode = dentry->d_inode;
+
+ rc = cifs_revalidate_dentry_attr(dentry);
+ if (rc)
+ return rc;
+
+ return cifs_revalidate_mapping(inode);
+}
+
int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
- struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
- int err = cifs_revalidate_dentry(dentry);
-
- if (!err) {
- generic_fillattr(dentry->d_inode, stat);
- stat->blksize = CIFS_MAX_MSGSIZE;
- stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct inode *inode = dentry->d_inode;
+ int rc;
- /*
- * If on a multiuser mount without unix extensions, and the
- * admin hasn't overridden them, set the ownership to the
- * fsuid/fsgid of the current process.
- */
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
- !tcon->unix_ext) {
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
- stat->uid = current_fsuid();
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
- stat->gid = current_fsgid();
+ /*
+ * We need to be sure that all dirty pages are written and the server
+ * has actual ctime, mtime and file length.
+ */
+ if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
+ inode->i_mapping->nrpages != 0) {
+ rc = filemap_fdatawait(inode->i_mapping);
+ if (rc) {
+ mapping_set_error(inode->i_mapping, rc);
+ return rc;
}
}
- return err;
+
+ rc = cifs_revalidate_dentry_attr(dentry);
+ if (rc)
+ return rc;
+
+ generic_fillattr(inode, stat);
+ stat->blksize = CIFS_MAX_MSGSIZE;
+ stat->ino = CIFS_I(inode)->uniqueid;
+
+ /*
+ * If on a multiuser mount without unix extensions or cifsacl being
+ * enabled, and the admin hasn't overridden them, set the ownership
+ * to the fsuid/fsgid of the current process.
+ */
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+ !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
+ !tcon->unix_ext) {
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
+ stat->uid = current_fsuid();
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
+ stat->gid = current_fsgid();
+ }
+ return rc;
}
static int cifs_truncate_page(struct address_space *mapping, loff_t from)
@@ -1812,26 +1965,25 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
static void cifs_setsize(struct inode *inode, loff_t offset)
{
- loff_t oldsize;
-
spin_lock(&inode->i_lock);
- oldsize = inode->i_size;
i_size_write(inode, offset);
spin_unlock(&inode->i_lock);
- truncate_pagecache(inode, oldsize, offset);
+ truncate_pagecache(inode, offset);
}
static int
cifs_set_file_size(struct inode *inode, struct iattr *attrs,
- int xid, char *full_path)
+ unsigned int xid, char *full_path)
{
int rc;
struct cifsFileInfo *open_file;
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = NULL;
- struct cifsTconInfo *pTcon = NULL;
+ struct cifs_tcon *tcon = NULL;
+ struct TCP_Server_Info *server;
+ struct cifs_io_parms io_parms;
/*
* To avoid spurious oplock breaks from server, in the case of
@@ -1844,63 +1996,79 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
*/
open_file = find_writable_file(cifsInode, true);
if (open_file) {
- __u16 nfid = open_file->netfid;
- __u32 npid = open_file->pid;
- pTcon = tlink_tcon(open_file->tlink);
- rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
- npid, false);
+ tcon = tlink_tcon(open_file->tlink);
+ server = tcon->ses->server;
+ if (server->ops->set_file_size)
+ rc = server->ops->set_file_size(xid, tcon, open_file,
+ attrs->ia_size, false);
+ else
+ rc = -ENOSYS;
cifsFileInfo_put(open_file);
- cFYI(1, "SetFSize for attrs rc = %d", rc);
+ cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc);
if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
unsigned int bytes_written;
- rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
- &bytes_written, NULL, NULL, 1);
- cFYI(1, "Wrt seteof rc %d", rc);
+
+ io_parms.netfid = open_file->fid.netfid;
+ io_parms.pid = open_file->pid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = attrs->ia_size;
+ rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
+ NULL, NULL, 1);
+ cifs_dbg(FYI, "Wrt seteof rc %d\n", rc);
}
} else
rc = -EINVAL;
- if (rc != 0) {
- if (pTcon == NULL) {
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink))
- return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
- }
+ if (!rc)
+ goto set_size_out;
+
+ if (tcon == NULL) {
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
+ }
- /* Set file size by pathname rather than by handle
- either because no valid, writeable file handle for
- it was found or because there was an error setting
- it by handle */
- rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size,
- false, cifs_sb->local_nls,
+ /*
+ * Set file size by pathname rather than by handle either because no
+ * valid, writeable file handle for it was found or because there was
+ * an error setting it by handle.
+ */
+ if (server->ops->set_path_size)
+ rc = server->ops->set_path_size(xid, tcon, full_path,
+ attrs->ia_size, cifs_sb, false);
+ else
+ rc = -ENOSYS;
+ cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc);
+ if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+ __u16 netfid;
+ int oplock = 0;
+
+ rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN,
+ GENERIC_WRITE, CREATE_NOT_DIR, &netfid,
+ &oplock, NULL, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
- if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
- __u16 netfid;
- int oplock = 0;
-
- rc = SMBLegacyOpen(xid, pTcon, full_path,
- FILE_OPEN, GENERIC_WRITE,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- unsigned int bytes_written;
- rc = CIFSSMBWrite(xid, pTcon, netfid, 0,
- attrs->ia_size,
- &bytes_written, NULL,
- NULL, 1);
- cFYI(1, "wrt seteof rc %d", rc);
- CIFSSMBClose(xid, pTcon, netfid);
- }
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (rc == 0) {
+ unsigned int bytes_written;
+
+ io_parms.netfid = netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = attrs->ia_size;
+ rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL,
+ NULL, 1);
+ cifs_dbg(FYI, "wrt seteof rc %d\n", rc);
+ CIFSSMBClose(xid, tcon, netfid);
}
- if (tlink)
- cifs_put_tlink(tlink);
}
+ if (tlink)
+ cifs_put_tlink(tlink);
+set_size_out:
if (rc == 0) {
cifsInode->server_eof = attrs->ia_size;
cifs_setsize(inode, attrs->ia_size);
@@ -1914,20 +2082,20 @@ static int
cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
{
int rc;
- int xid;
+ unsigned int xid;
char *full_path = NULL;
struct inode *inode = direntry->d_inode;
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *pTcon;
struct cifs_unix_set_info_args *args = NULL;
struct cifsFileInfo *open_file;
- cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
+ cifs_dbg(FYI, "setattr_unix on file %s attrs->ia_valid=0x%x\n",
direntry->d_name.name, attrs->ia_valid);
- xid = GetXid();
+ xid = get_xid();
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
attrs->ia_valid |= ATTR_FORCE;
@@ -1982,12 +2150,12 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
if (attrs->ia_valid & ATTR_UID)
args->uid = attrs->ia_uid;
else
- args->uid = NO_CHANGE_64;
+ args->uid = INVALID_UID; /* no change */
if (attrs->ia_valid & ATTR_GID)
args->gid = attrs->ia_gid;
else
- args->gid = NO_CHANGE_64;
+ args->gid = INVALID_GID; /* no change */
if (attrs->ia_valid & ATTR_ATIME)
args->atime = cifs_UnixTimeToNT(attrs->ia_atime);
@@ -2007,7 +2175,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
args->device = 0;
open_file = find_writable_file(cifsInode, true);
if (open_file) {
- u16 nfid = open_file->netfid;
+ u16 nfid = open_file->fid.netfid;
u32 npid = open_file->pid;
pTcon = tlink_tcon(open_file->tlink);
rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
@@ -2047,14 +2215,16 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
out:
kfree(args);
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
static int
cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
{
- int xid;
+ unsigned int xid;
+ kuid_t uid = INVALID_UID;
+ kgid_t gid = INVALID_GID;
struct inode *inode = direntry->d_inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
@@ -2063,9 +2233,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
__u32 dosattr = 0;
__u64 mode = NO_CHANGE_64;
- xid = GetXid();
+ xid = get_xid();
- cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
+ cifs_dbg(FYI, "setattr on file %s attrs->iavalid 0x%x\n",
direntry->d_name.name, attrs->ia_valid);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
@@ -2073,14 +2243,14 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
rc = inode_change_ok(inode, attrs);
if (rc < 0) {
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
rc = -ENOMEM;
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
@@ -2105,13 +2275,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
goto cifs_setattr_exit;
}
- /*
- * Without unix extensions we can't send ownership changes to the
- * server, so silently ignore them. This is consistent with how
- * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With
- * CIFSACL support + proper Windows to Unix idmapping, we may be
- * able to support this in the future.
- */
+ if (attrs->ia_valid & ATTR_UID)
+ uid = attrs->ia_uid;
+
+ if (attrs->ia_valid & ATTR_GID)
+ gid = attrs->ia_gid;
+
+#ifdef CONFIG_CIFS_ACL
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+ if (uid_valid(uid) || gid_valid(gid)) {
+ rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
+ uid, gid);
+ if (rc) {
+ cifs_dbg(FYI, "%s: Setting id failed with error: %d\n",
+ __func__, rc);
+ goto cifs_setattr_exit;
+ }
+ }
+ } else
+#endif /* CONFIG_CIFS_ACL */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
@@ -2120,18 +2302,15 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
attrs->ia_valid &= ~ATTR_MODE;
if (attrs->ia_valid & ATTR_MODE) {
- cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
mode = attrs->ia_mode;
- }
-
- if (attrs->ia_valid & ATTR_MODE) {
rc = 0;
#ifdef CONFIG_CIFS_ACL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- rc = mode_to_cifs_acl(inode, full_path, mode);
+ rc = id_mode_to_cifs_acl(inode, full_path, mode,
+ INVALID_UID, INVALID_GID);
if (rc) {
- cFYI(1, "%s: Setting ACL failed with error: %d",
- __func__, rc);
+ cifs_dbg(FYI, "%s: Setting ACL failed with error: %d\n",
+ __func__, rc);
goto cifs_setattr_exit;
}
} else
@@ -2197,7 +2376,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
cifs_setattr_exit:
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
@@ -2206,7 +2385,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
{
struct inode *inode = direntry->d_inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
+ struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
if (pTcon->unix_ext)
return cifs_setattr_unix(direntry, attrs);
@@ -2219,7 +2398,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
#if 0
void cifs_delete_inode(struct inode *inode)
{
- cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
+ cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode);
/* may have to add back in if and when safe distributed caching of
directories added e.g. via FindNotify */
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 0c98672d012..45cb59bcc79 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -3,7 +3,7 @@
*
* vfs operations that deal with io control
*
- * Copyright (C) International Business Machines Corp., 2005,2007
+ * Copyright (C) International Business Machines Corp., 2005,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -22,81 +22,207 @@
*/
#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
-#define CIFS_IOC_CHECKUMOUNT _IO(0xCF, 2)
+#define CIFS_IOCTL_MAGIC 0xCF
+#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
+
+static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
+ unsigned long srcfd, u64 off, u64 len, u64 destoff)
+{
+ int rc;
+ struct cifsFileInfo *smb_file_target = dst_file->private_data;
+ struct inode *target_inode = file_inode(dst_file);
+ struct cifs_tcon *target_tcon;
+ struct fd src_file;
+ struct cifsFileInfo *smb_file_src;
+ struct inode *src_inode;
+ struct cifs_tcon *src_tcon;
+
+ cifs_dbg(FYI, "ioctl clone range\n");
+ /* the destination must be opened for writing */
+ if (!(dst_file->f_mode & FMODE_WRITE)) {
+ cifs_dbg(FYI, "file target not open for write\n");
+ return -EINVAL;
+ }
+
+ /* check if target volume is readonly and take reference */
+ rc = mnt_want_write_file(dst_file);
+ if (rc) {
+ cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
+ return rc;
+ }
+
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
+ rc = -EBADF;
+ goto out_drop_write;
+ }
+
+ if ((!src_file.file->private_data) || (!dst_file->private_data)) {
+ rc = -EBADF;
+ cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
+ goto out_fput;
+ }
+
+ rc = -EXDEV;
+ smb_file_target = dst_file->private_data;
+ smb_file_src = src_file.file->private_data;
+ src_tcon = tlink_tcon(smb_file_src->tlink);
+ target_tcon = tlink_tcon(smb_file_target->tlink);
+
+ /* check if source and target are on same tree connection */
+ if (src_tcon != target_tcon) {
+ cifs_dbg(VFS, "file copy src and target on different volume\n");
+ goto out_fput;
+ }
+
+ src_inode = file_inode(src_file.file);
+
+ /*
+ * Note: the cifs case is easier than btrfs since the server is
+ * responsible for checking for proper open modes and file type, and
+ * if it wants to, the server could even support a copy range where
+ * source == target.
+ */
+
+ /* lock the two inodes in address order so racing ioctls cannot deadlock */
+ if (target_inode < src_inode) {
+ mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD);
+ }
+
+ /* determine range to clone */
+ rc = -EINVAL;
+ if (off + len > src_inode->i_size || off + len < off)
+ goto out_unlock;
+ if (len == 0)
+ len = src_inode->i_size - off;
+
+ cifs_dbg(FYI, "about to flush pages\n");
+ /* should we flush the first and last pages first? */
+ truncate_inode_pages_range(&target_inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len)-1);
+
+ if (target_tcon->ses->server->ops->clone_range)
+ rc = target_tcon->ses->server->ops->clone_range(xid,
+ smb_file_src, smb_file_target, off, len, destoff);
+
+ /*
+ * force revalidate of size and timestamps of target file now
+ * that target is updated on the server
+ */
+ CIFS_I(target_inode)->time = 0;
+out_unlock:
+ /*
+ * Although unlocking in the reverse order from locking is not
+ * strictly necessary here, it is a little cleaner to be consistent.
+ */
+ if (target_inode < src_inode) {
+ mutex_unlock(&src_inode->i_mutex);
+ mutex_unlock(&target_inode->i_mutex);
+ } else {
+ mutex_unlock(&target_inode->i_mutex);
+ mutex_unlock(&src_inode->i_mutex);
+ }
+out_fput:
+ fdput(src_file);
+out_drop_write:
+ mnt_drop_write_file(dst_file);
+ return rc;
+}
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
{
- struct inode *inode = filep->f_dentry->d_inode;
+ struct inode *inode = file_inode(filep);
int rc = -ENOTTY; /* strange error - but the precedent */
- int xid;
+ unsigned int xid;
struct cifs_sb_info *cifs_sb;
-#ifdef CONFIG_CIFS_POSIX
struct cifsFileInfo *pSMBFile = filep->private_data;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
__u64 ExtAttrBits = 0;
- __u64 ExtAttrMask = 0;
__u64 caps;
-#endif /* CONFIG_CIFS_POSIX */
- xid = GetXid();
+ xid = get_xid();
- cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg);
+ cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg);
cifs_sb = CIFS_SB(inode->i_sb);
switch (command) {
- case CIFS_IOC_CHECKUMOUNT:
- cFYI(1, "User unmount attempted");
- if (cifs_sb->mnt_uid == current_uid())
- rc = 0;
- else {
- rc = -EACCES;
- cFYI(1, "uids do not match");
- }
- break;
-#ifdef CONFIG_CIFS_POSIX
case FS_IOC_GETFLAGS:
if (pSMBFile == NULL)
break;
tcon = tlink_tcon(pSMBFile->tlink);
caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+#ifdef CONFIG_CIFS_POSIX
if (CIFS_UNIX_EXTATTR_CAP & caps) {
- rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
- &ExtAttrBits, &ExtAttrMask);
+ __u64 ExtAttrMask = 0;
+ rc = CIFSGetExtAttr(xid, tcon,
+ pSMBFile->fid.netfid,
+ &ExtAttrBits, &ExtAttrMask);
if (rc == 0)
rc = put_user(ExtAttrBits &
FS_FL_USER_VISIBLE,
(int __user *)arg);
+ if (rc != EOPNOTSUPP)
+ break;
+ }
+#endif /* CONFIG_CIFS_POSIX */
+ rc = 0;
+ if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) {
+ /* add in the compressed bit */
+ ExtAttrBits = FS_COMPR_FL;
+ rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE,
+ (int __user *)arg);
}
break;
-
case FS_IOC_SETFLAGS:
if (pSMBFile == NULL)
break;
tcon = tlink_tcon(pSMBFile->tlink);
caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
- if (CIFS_UNIX_EXTATTR_CAP & caps) {
- if (get_user(ExtAttrBits, (int __user *)arg)) {
- rc = -EFAULT;
- break;
- }
- /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
- extAttrBits, &ExtAttrMask);*/
+
+ if (get_user(ExtAttrBits, (int __user *)arg)) {
+ rc = -EFAULT;
+ break;
+ }
+
+ /*
+ * if (CIFS_UNIX_EXTATTR_CAP & caps)
+ * rc = CIFSSetExtAttr(xid, tcon,
+ * pSMBFile->fid.netfid,
+ * extAttrBits,
+ * &ExtAttrMask);
+ * if (rc != EOPNOTSUPP)
+ * break;
+ */
+
+ /* Currently the only flag we can set is the compressed flag */
+ if ((ExtAttrBits & FS_COMPR_FL) == 0)
+ break;
+
+ /* Try to set compress flag */
+ if (tcon->ses->server->ops->set_compression) {
+ rc = tcon->ses->server->ops->set_compression(
+ xid, tcon, pSMBFile);
+ cifs_dbg(FYI, "set compress flag rc %d\n", rc);
}
- cFYI(1, "set flags not implemented yet");
break;
-#endif /* CONFIG_CIFS_POSIX */
+ case CIFS_IOC_COPYCHUNK_FILE:
+ rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0);
+ break;
default:
- cFYI(1, "unsupported ioctl");
+ cifs_dbg(FYI, "unsupported ioctl\n");
break;
}
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
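For reference, a hypothetical userspace caller of the new CIFS_IOC_COPYCHUNK_FILE ioctl might look like the sketch below. It is not part of the patch: the ioctl number is re-declared locally since no exported UAPI header appears in this diff, and the third ioctl() argument is simply the source file descriptor (cifs_ioctl() above fixes offset, length and destination offset to zero, so the whole source file is cloned).

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

/* local re-declaration of the values defined in fs/cifs/ioctl.c above */
#define CIFS_IOCTL_MAGIC	0xCF
#define CIFS_IOC_COPYCHUNK_FILE	_IOW(CIFS_IOCTL_MAGIC, 3, int)

int main(int argc, char **argv)
{
	int src, dst, rc;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}

	src = open(argv[1], O_RDONLY);
	dst = open(argv[2], O_WRONLY | O_CREAT, 0644);
	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}

	/* arg is the source fd; the handler clones the entire source file */
	rc = ioctl(dst, CIFS_IOC_COPYCHUNK_FILE, src);
	if (rc)
		perror("CIFS_IOC_COPYCHUNK_FILE");

	close(src);
	close(dst);
	return rc ? 1 : 0;
}

Note that the destination must be open for writing and that both files must be open on the same cifs tree connection; otherwise the handler above fails with -EXDEV.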
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index e8804d37340..68559fd557f 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -29,6 +29,10 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
+/*
+ * M-F Symlink Functions - Begin
+ */
+
#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
@@ -56,14 +60,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
md5 = crypto_alloc_shash("md5", 0, 0);
if (IS_ERR(md5)) {
rc = PTR_ERR(md5);
- cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
+ cifs_dbg(VFS, "%s: Crypto md5 allocation error %d\n",
+ __func__, rc);
return rc;
}
size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
sdescmd5 = kmalloc(size, GFP_KERNEL);
if (!sdescmd5) {
rc = -ENOMEM;
- cERROR(1, "%s: Memory allocation failure\n", __func__);
goto symlink_hash_err;
}
sdescmd5->shash.tfm = md5;
@@ -71,11 +75,17 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
rc = crypto_shash_init(&sdescmd5->shash);
if (rc) {
- cERROR(1, "%s: Could not init md5 shash\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__);
+ goto symlink_hash_err;
+ }
+ rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
goto symlink_hash_err;
}
- crypto_shash_update(&sdescmd5->shash, link_str, link_len);
rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
symlink_hash_err:
crypto_free_shash(md5);
@@ -85,10 +95,8 @@ symlink_hash_err:
}
static int
-CIFSParseMFSymlink(const u8 *buf,
- unsigned int buf_len,
- unsigned int *_link_len,
- char **_link_str)
+parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
+ char **_link_str)
{
int rc;
unsigned int link_len;
@@ -109,7 +117,7 @@ CIFSParseMFSymlink(const u8 *buf,
rc = symlink_hash(link_len, link_str, md5_hash);
if (rc) {
- cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
+ cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
return rc;
}
@@ -131,7 +139,7 @@ CIFSParseMFSymlink(const u8 *buf,
}
static int
-CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
+format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
{
int rc;
unsigned int link_len;
@@ -148,7 +156,7 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
rc = symlink_hash(link_len, link_str, md5_hash);
if (rc) {
- cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
+ cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
return rc;
}
@@ -174,14 +182,26 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
return 0;
}
+bool
+couldbe_mf_symlink(const struct cifs_fattr *fattr)
+{
+ if (!S_ISREG(fattr->cf_mode))
+ /* it's not a symlink */
+ return false;
+
+ if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
+ /* it's not a symlink */
+ return false;
+
+ return true;
+}
+
static int
-CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
- const char *fromName, const char *toName,
- const struct nls_table *nls_codepage, int remap)
+create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *fromName,
+ const char *toName)
{
int rc;
- int oplock = 0;
- __u16 netfid = 0;
u8 *buf;
unsigned int bytes_written = 0;
@@ -189,158 +209,87 @@ CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
if (!buf)
return -ENOMEM;
- rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
- if (rc != 0) {
- kfree(buf);
- return rc;
- }
-
- rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
- nls_codepage, remap);
- if (rc != 0) {
- kfree(buf);
- return rc;
- }
+ rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
+ if (rc)
+ goto out;
- rc = CIFSSMBWrite(xid, tcon, netfid,
- CIFS_MF_SYMLINK_FILE_SIZE /* length */,
- 0 /* offset */,
- &bytes_written, buf, NULL, 0);
- CIFSSMBClose(xid, tcon, netfid);
- kfree(buf);
- if (rc != 0)
- return rc;
+ rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb,
+ fromName, buf, &bytes_written);
+ if (rc)
+ goto out;
if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
- return -EIO;
-
- return 0;
+ rc = -EIO;
+out:
+ kfree(buf);
+ return rc;
}
static int
-CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
- const unsigned char *searchName, char **symlinkinfo,
- const struct nls_table *nls_codepage, int remap)
+query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const unsigned char *path,
+ char **symlinkinfo)
{
int rc;
- int oplock = 0;
- __u16 netfid = 0;
- u8 *buf;
- char *pbuf;
- unsigned int bytes_read = 0;
- int buf_type = CIFS_NO_BUFFER;
+ u8 *buf = NULL;
unsigned int link_len = 0;
- FILE_ALL_INFO file_info;
-
- rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
- CREATE_NOT_DIR, &netfid, &oplock, &file_info,
- nls_codepage, remap);
- if (rc != 0)
- return rc;
-
- if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
- CIFSSMBClose(xid, tcon, netfid);
- /* it's not a symlink */
- return -EINVAL;
- }
+ unsigned int bytes_read = 0;
buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- pbuf = buf;
- rc = CIFSSMBRead(xid, tcon, netfid,
- CIFS_MF_SYMLINK_FILE_SIZE /* length */,
- 0 /* offset */,
- &bytes_read, &pbuf, &buf_type);
- CIFSSMBClose(xid, tcon, netfid);
- if (rc != 0) {
- kfree(buf);
- return rc;
- }
+ if (tcon->ses->server->ops->query_mf_symlink)
+ rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
+ cifs_sb, path, buf, &bytes_read);
+ else
+ rc = -ENOSYS;
- rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
- kfree(buf);
- if (rc != 0)
- return rc;
-
- return 0;
-}
-
-bool
-CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
-{
- if (!(fattr->cf_mode & S_IFREG))
- /* it's not a symlink */
- return false;
+ if (rc)
+ goto out;
- if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
- /* it's not a symlink */
- return false;
+ if (bytes_read == 0) { /* not a symlink */
+ rc = -EINVAL;
+ goto out;
+ }
- return true;
+ rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo);
+out:
+ kfree(buf);
+ return rc;
}
int
-CIFSCheckMFSymlink(struct cifs_fattr *fattr,
- const unsigned char *path,
- struct cifs_sb_info *cifs_sb, int xid)
+check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
+ const unsigned char *path)
{
int rc;
- int oplock = 0;
- __u16 netfid = 0;
- struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
- u8 *buf;
- char *pbuf;
- unsigned int bytes_read = 0;
- int buf_type = CIFS_NO_BUFFER;
+ u8 *buf = NULL;
unsigned int link_len = 0;
- FILE_ALL_INFO file_info;
+ unsigned int bytes_read = 0;
- if (!CIFSCouldBeMFSymlink(fattr))
+ if (!couldbe_mf_symlink(fattr))
/* it's not a symlink */
return 0;
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink))
- return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
-
- rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
- CREATE_NOT_DIR, &netfid, &oplock, &file_info,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc != 0)
- goto out;
+ buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
- CIFSSMBClose(xid, pTcon, netfid);
- /* it's not a symlink */
- goto out;
- }
+ if (tcon->ses->server->ops->query_mf_symlink)
+ rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
+ cifs_sb, path, buf, &bytes_read);
+ else
+ rc = -ENOSYS;
- buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
- if (!buf) {
- rc = -ENOMEM;
+ if (rc)
goto out;
- }
- pbuf = buf;
- rc = CIFSSMBRead(xid, pTcon, netfid,
- CIFS_MF_SYMLINK_FILE_SIZE /* length */,
- 0 /* offset */,
- &bytes_read, &pbuf, &buf_type);
- CIFSSMBClose(xid, pTcon, netfid);
- if (rc != 0) {
- kfree(buf);
+ if (bytes_read == 0) /* not a symlink */
goto out;
- }
- rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
- kfree(buf);
+ rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL);
if (rc == -EINVAL) {
/* it's not a symlink */
rc = 0;
@@ -356,81 +305,188 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
fattr->cf_dtype = DT_LNK;
out:
- cifs_put_tlink(tlink);
+ kfree(buf);
return rc;
}
+/*
+ * SMB 1.0 Protocol specific functions
+ */
+
+int
+cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const unsigned char *path,
+ char *pbuf, unsigned int *pbytes_read)
+{
+ int rc;
+ int oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ struct cifs_io_parms io_parms;
+ int buf_type = CIFS_NO_BUFFER;
+ FILE_ALL_INFO file_info;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_READ;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, &file_info);
+ if (rc)
+ return rc;
+
+ if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE))
+ /* it's not a symlink */
+ goto out;
+
+ io_parms.netfid = fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+
+ rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
+out:
+ CIFSSMBClose(xid, tcon, fid.netfid);
+ return rc;
+}
+
+int
+cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const unsigned char *path,
+ char *pbuf, unsigned int *pbytes_written)
+{
+ int rc;
+ int oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ struct cifs_io_parms io_parms;
+ int create_options = CREATE_NOT_DIR;
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (rc)
+ return rc;
+
+ io_parms.netfid = fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+
+ rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0);
+ CIFSSMBClose(xid, tcon, fid.netfid);
+ return rc;
+}
+
+/*
+ * M-F Symlink Functions - End
+ */
+
int
cifs_hardlink(struct dentry *old_file, struct inode *inode,
struct dentry *direntry)
{
int rc = -EACCES;
- int xid;
- char *fromName = NULL;
- char *toName = NULL;
+ unsigned int xid;
+ char *from_name = NULL;
+ char *to_name = NULL;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
struct cifsInodeInfo *cifsInode;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
+ tcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
- fromName = build_path_from_dentry(old_file);
- toName = build_path_from_dentry(direntry);
- if ((fromName == NULL) || (toName == NULL)) {
+ from_name = build_path_from_dentry(old_file);
+ to_name = build_path_from_dentry(direntry);
+ if ((from_name == NULL) || (to_name == NULL)) {
rc = -ENOMEM;
goto cifs_hl_exit;
}
- if (pTcon->unix_ext)
- rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
+ if (tcon->unix_ext)
+ rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
else {
- rc = CIFSCreateHardLink(xid, pTcon, fromName, toName,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ server = tcon->ses->server;
+ if (!server->ops->create_hardlink) {
+ rc = -ENOSYS;
+ goto cifs_hl_exit;
+ }
+ rc = server->ops->create_hardlink(xid, tcon, from_name, to_name,
+ cifs_sb);
if ((rc == -EIO) || (rc == -EINVAL))
rc = -EOPNOTSUPP;
}
d_drop(direntry); /* force new lookup from server of target */
- /* if source file is cached (oplocked) revalidate will not go to server
- until the file is closed or oplock broken so update nlinks locally */
+ /*
+ * if source file is cached (oplocked) revalidate will not go to server
+ * until the file is closed or oplock broken so update nlinks locally
+ */
if (old_file->d_inode) {
cifsInode = CIFS_I(old_file->d_inode);
if (rc == 0) {
- old_file->d_inode->i_nlink++;
-/* BB should we make this contingent on superblock flag NOATIME? */
-/* old_file->d_inode->i_ctime = CURRENT_TIME;*/
- /* parent dir timestamps will update from srv
- within a second, would it really be worth it
- to set the parent dir cifs inode time to zero
- to force revalidate (faster) for it too? */
+ spin_lock(&old_file->d_inode->i_lock);
+ inc_nlink(old_file->d_inode);
+ spin_unlock(&old_file->d_inode->i_lock);
+ /*
+ * BB should we make this contingent on superblock flag
+ * NOATIME?
+ */
+ /* old_file->d_inode->i_ctime = CURRENT_TIME; */
+ /*
+ * parent dir timestamps will update from srv within a
+ * second, would it really be worth it to set the parent
+ * dir cifs inode time to zero to force revalidate
+ * (faster) for it too?
+ */
}
- /* if not oplocked will force revalidate to get info
- on source file from srv */
+ /*
+ * if not oplocked will force revalidate to get info on source
+ * file from srv
+ */
cifsInode->time = 0;
- /* Will update parent dir timestamps from srv within a second.
- Would it really be worth it to set the parent dir (cifs
- inode) time field to zero to force revalidate on parent
- directory faster ie
- CIFS_I(inode)->time = 0; */
+ /*
+ * Will update parent dir timestamps from srv within a second.
+ * Would it really be worth it to set the parent dir (cifs
+ * inode) time field to zero to force revalidate on parent
+ * directory faster ie
+ *
+ * CIFS_I(inode)->time = 0;
+ */
}
cifs_hl_exit:
- kfree(fromName);
- kfree(toName);
- FreeXid(xid);
+ kfree(from_name);
+ kfree(to_name);
+ free_xid(xid);
cifs_put_tlink(tlink);
return rc;
}
@@ -440,14 +496,15 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
{
struct inode *inode = direntry->d_inode;
int rc = -ENOMEM;
- int xid;
+ unsigned int xid;
char *full_path = NULL;
char *target_path = NULL;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = NULL;
- struct cifsTconInfo *tcon;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
- xid = GetXid();
+ xid = get_xid();
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
@@ -456,31 +513,13 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
goto out;
}
tcon = tlink_tcon(tlink);
-
- /*
- * For now, we just handle symlinks with unix extensions enabled.
- * Eventually we should handle NTFS reparse points, and MacOS
- * symlink support. For instance...
- *
- * rc = CIFSSMBQueryReparseLinkInfo(...)
- *
- * For now, just return -EACCES when the server doesn't support posix
- * extensions. Note that we still allow querying symlinks when posix
- * extensions are manually disabled. We could disable these as well
- * but there doesn't seem to be any harm in allowing the client to
- * read them.
- */
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
- && !(tcon->ses->capabilities & CAP_UNIX)) {
- rc = -EACCES;
- goto out;
- }
+ server = tcon->ses->server;
full_path = build_path_from_dentry(direntry);
if (!full_path)
goto out;
- cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
+ cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
rc = -EACCES;
/*
@@ -488,14 +527,12 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
* and fallback to UNIX Extensions Symlinks.
*/
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
- rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ rc = query_mf_symlink(xid, tcon, cifs_sb, full_path,
+ &target_path);
- if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX))
- rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
- cifs_sb->local_nls);
+ if (rc != 0 && server->ops->query_symlink)
+ rc = server->ops->query_symlink(xid, tcon, full_path,
+ &target_path, cifs_sb);
kfree(full_path);
out:
@@ -504,7 +541,7 @@ out:
target_path = ERR_PTR(rc);
}
- FreeXid(xid);
+ free_xid(xid);
if (tlink)
cifs_put_tlink(tlink);
nd_set_link(nd, target_path);
@@ -515,14 +552,14 @@ int
cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
{
int rc = -EOPNOTSUPP;
- int xid;
+ unsigned int xid;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *pTcon;
char *full_path = NULL;
struct inode *newinode = NULL;
- xid = GetXid();
+ xid = get_xid();
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
@@ -537,15 +574,12 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
goto symlink_exit;
}
- cFYI(1, "Full path: %s", full_path);
- cFYI(1, "symname is %s", symname);
+ cifs_dbg(FYI, "Full path: %s\n", full_path);
+ cifs_dbg(FYI, "symname is %s\n", symname);
/* BB what if DFS and this volume is on different share? BB */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
- rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
else if (pTcon->unix_ext)
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
cifs_sb->local_nls);
@@ -562,8 +596,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
inode->i_sb, xid, NULL);
if (rc != 0) {
- cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
- rc);
+ cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n",
+ rc);
} else {
d_instantiate(direntry, newinode);
}
@@ -571,13 +605,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
symlink_exit:
kfree(full_path);
cifs_put_tlink(tlink);
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
-
-void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie)
-{
- char *p = nd_get_link(nd);
- if (!IS_ERR(p))
- kfree(p);
-}
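As background for the offsets defined at the start of this file's diff, a Minshall-French symlink is a small fixed-size regular file: a magic line, a 4-digit link length, a 32-character MD5 hex digest of the target, then the target itself, each terminated by a newline. The sketch below illustrates that layout; the "XSym" magic and the 1067-byte CIFS_MF_SYMLINK_FILE_SIZE are assumptions, since neither value is visible in this diff.

#include <stdio.h>
#include <string.h>

#define MF_SYMLINK_FILE_SIZE	1067	/* assumed value of CIFS_MF_SYMLINK_FILE_SIZE */

/* buf must hold MF_SYMLINK_FILE_SIZE bytes; md5_hex is a 32-char hex digest */
static int sketch_format_mf_symlink(char *buf, const char *md5_hex,
				    const char *target)
{
	size_t link_len = strlen(target);
	int used;

	/* "XSym\n", 4-digit length at offset 5, md5 at offset 10, target at 43 */
	used = snprintf(buf, MF_SYMLINK_FILE_SIZE, "XSym\n%04zu\n%.32s\n%s\n",
			link_len, md5_hex, target);
	if (used < 0 || used >= MF_SYMLINK_FILE_SIZE)
		return -1;	/* target too long for the fixed-size file */

	/* pad the remainder so the file is always exactly full-sized */
	memset(buf + used, ' ', MF_SYMLINK_FILE_SIZE - used);
	return 0;
}

The field positions match CIFS_MF_SYMLINK_LEN_OFFSET (4+1), CIFS_MF_SYMLINK_MD5_OFFSET and CIFS_MF_SYMLINK_LINK_OFFSET as computed above, which is also why couldbe_mf_symlink() can cheaply reject candidates by file size alone.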
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 2a930a752a7..3b0c62e622d 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -29,6 +29,9 @@
#include "smberr.h"
#include "nterr.h"
#include "cifs_unicode.h"
+#ifdef CONFIG_CIFS_SMB2
+#include "smb2pdu.h"
+#endif
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
@@ -40,7 +43,7 @@ extern mempool_t *cifs_req_poolp;
since the cifs fs was mounted */
unsigned int
-_GetXid(void)
+_get_xid(void)
{
unsigned int xid;
@@ -51,14 +54,14 @@ _GetXid(void)
if (GlobalTotalActiveXid > GlobalMaxActiveXid)
GlobalMaxActiveXid = GlobalTotalActiveXid;
if (GlobalTotalActiveXid > 65000)
- cFYI(1, "warning: more than 65000 requests active");
+ cifs_dbg(FYI, "warning: more than 65000 requests active\n");
xid = GlobalCurrentXid++;
spin_unlock(&GlobalMid_Lock);
return xid;
}
void
-_FreeXid(unsigned int xid)
+_free_xid(unsigned int xid)
{
spin_lock(&GlobalMid_Lock);
/* if (GlobalTotalActiveXid == 0)
@@ -67,12 +70,12 @@ _FreeXid(unsigned int xid)
spin_unlock(&GlobalMid_Lock);
}
-struct cifsSesInfo *
+struct cifs_ses *
sesInfoAlloc(void)
{
- struct cifsSesInfo *ret_buf;
+ struct cifs_ses *ret_buf;
- ret_buf = kzalloc(sizeof(struct cifsSesInfo), GFP_KERNEL);
+ ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL);
if (ret_buf) {
atomic_inc(&sesInfoAllocCount);
ret_buf->status = CifsNew;
@@ -85,10 +88,10 @@ sesInfoAlloc(void)
}
void
-sesInfoFree(struct cifsSesInfo *buf_to_free)
+sesInfoFree(struct cifs_ses *buf_to_free)
{
if (buf_to_free == NULL) {
- cFYI(1, "Null buffer passed to sesInfoFree");
+ cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");
return;
}
@@ -100,15 +103,17 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
memset(buf_to_free->password, 0, strlen(buf_to_free->password));
kfree(buf_to_free->password);
}
+ kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
+ kfree(buf_to_free->auth_key.response);
kfree(buf_to_free);
}
-struct cifsTconInfo *
+struct cifs_tcon *
tconInfoAlloc(void)
{
- struct cifsTconInfo *ret_buf;
- ret_buf = kzalloc(sizeof(struct cifsTconInfo), GFP_KERNEL);
+ struct cifs_tcon *ret_buf;
+ ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL);
if (ret_buf) {
atomic_inc(&tconInfoAllocCount);
ret_buf->tidStatus = CifsNew;
@@ -123,10 +128,10 @@ tconInfoAlloc(void)
}
void
-tconInfoFree(struct cifsTconInfo *buf_to_free)
+tconInfoFree(struct cifs_tcon *buf_to_free)
{
if (buf_to_free == NULL) {
- cFYI(1, "Null buffer passed to tconInfoFree");
+ cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
return;
}
atomic_dec(&tconInfoAllocCount);
@@ -142,17 +147,27 @@ struct smb_hdr *
cifs_buf_get(void)
{
struct smb_hdr *ret_buf = NULL;
-
-/* We could use negotiated size instead of max_msgsize -
- but it may be more efficient to always alloc same size
- albeit slightly larger than necessary and maxbuffersize
- defaults to this and can not be bigger */
+ size_t buf_size = sizeof(struct smb_hdr);
+
+#ifdef CONFIG_CIFS_SMB2
+ /*
+ * SMB2 header is bigger than CIFS one - no problems to clean some
+ * more bytes for CIFS.
+ */
+ buf_size = sizeof(struct smb2_hdr);
+#endif
+ /*
+ * We could use negotiated size instead of max_msgsize -
+ * but it may be more efficient to always alloc same size
+ * albeit slightly larger than necessary and maxbuffersize
+ * defaults to this and can not be bigger.
+ */
ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
/* clear the first few header bytes */
/* for most paths, more is cleared in header_assemble */
if (ret_buf) {
- memset(ret_buf, 0, sizeof(struct smb_hdr) + 3);
+ memset(ret_buf, 0, buf_size + 3);
atomic_inc(&bufAllocCount);
#ifdef CONFIG_CIFS_STATS2
atomic_inc(&totBufAllocCount);
@@ -166,7 +181,7 @@ void
cifs_buf_release(void *buf_to_free)
{
if (buf_to_free == NULL) {
- /* cFYI(1, "Null buffer passed to cifs_buf_release");*/
+ /* cifs_dbg(FYI, "Null buffer passed to cifs_buf_release\n");*/
return;
}
mempool_free(buf_to_free, cifs_req_poolp);
@@ -202,7 +217,7 @@ cifs_small_buf_release(void *buf_to_free)
{
if (buf_to_free == NULL) {
- cFYI(1, "Null buffer passed to cifs_small_buf_release");
+ cifs_dbg(FYI, "Null buffer passed to cifs_small_buf_release\n");
return;
}
mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -211,104 +226,21 @@ cifs_small_buf_release(void *buf_to_free)
return;
}
-/*
- Find a free multiplex id (SMB mid). Otherwise there could be
- mid collisions which might cause problems, demultiplexing the
- wrong response to this request. Multiplex ids could collide if
- one of a series requests takes much longer than the others, or
- if a very large number of long lived requests (byte range
- locks or FindNotify requests) are pending. No more than
- 64K-1 requests can be outstanding at one time. If no
- mids are available, return zero. A future optimization
- could make the combination of mids and uid the key we use
- to demultiplex on (rather than mid alone).
- In addition to the above check, the cifs demultiplex
- code already used the command code as a secondary
- check of the frame and if signing is negotiated the
- response would be discarded if the mid were the same
- but the signature was wrong. Since the mid is not put in the
- pending queue until later (when it is about to be dispatched)
- we do have to limit the number of outstanding requests
- to somewhat less than 64K-1 although it is hard to imagine
- so many threads being in the vfs at one time.
-*/
-__u16 GetNextMid(struct TCP_Server_Info *server)
-{
- __u16 mid = 0;
- __u16 last_mid;
- bool collision;
-
- spin_lock(&GlobalMid_Lock);
- last_mid = server->CurrentMid; /* we do not want to loop forever */
- server->CurrentMid++;
- /* This nested loop looks more expensive than it is.
- In practice the list of pending requests is short,
- fewer than 50, and the mids are likely to be unique
- on the first pass through the loop unless some request
- takes longer than the 64 thousand requests before it
- (and it would also have to have been a request that
- did not time out) */
- while (server->CurrentMid != last_mid) {
- struct mid_q_entry *mid_entry;
- unsigned int num_mids;
-
- collision = false;
- if (server->CurrentMid == 0)
- server->CurrentMid++;
-
- num_mids = 0;
- list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
- ++num_mids;
- if (mid_entry->mid == server->CurrentMid &&
- mid_entry->midState == MID_REQUEST_SUBMITTED) {
- /* This mid is in use, try a different one */
- collision = true;
- break;
- }
- }
-
- /*
- * if we have more than 32k mids in the list, then something
- * is very wrong. Possibly a local user is trying to DoS the
- * box by issuing long-running calls and SIGKILL'ing them. If
- * we get to 2^16 mids then we're in big trouble as this
- * function could loop forever.
- *
- * Go ahead and assign out the mid in this situation, but force
- * an eventual reconnect to clean out the pending_mid_q.
- */
- if (num_mids > 32768)
- server->tcpStatus = CifsNeedReconnect;
-
- if (!collision) {
- mid = server->CurrentMid;
- break;
- }
- server->CurrentMid++;
- }
- spin_unlock(&GlobalMid_Lock);
- return mid;
-}
-
/* NB: MID can not be set if treeCon is not passed in; in that
case it is the responsibility of the caller to set the mid */
void
header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
- const struct cifsTconInfo *treeCon, int word_count
+ const struct cifs_tcon *treeCon, int word_count
/* length of fixed section (word count) in two byte units */)
{
- struct list_head *temp_item;
- struct cifsSesInfo *ses;
char *temp = (char *) buffer;
memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
- buffer->smb_buf_length =
+ buffer->smb_buf_length = cpu_to_be32(
(2 * word_count) + sizeof(struct smb_hdr) -
4 /* RFC 1001 length field does not count */ +
- 2 /* for bcc field itself */ ;
- /* Note that this is the only network field that has to be converted
- to big endian and it is done just before we send it */
+ 2 /* for bcc field itself */) ;
buffer->Protocol[0] = 0xFF;
buffer->Protocol[1] = 'S';
@@ -329,60 +261,14 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
/* Uid is not converted */
buffer->Uid = treeCon->ses->Suid;
- buffer->Mid = GetNextMid(treeCon->ses->server);
- if (multiuser_mount != 0) {
- /* For the multiuser case, there are few obvious technically */
- /* possible mechanisms to match the local linux user (uid) */
- /* to a valid remote smb user (smb_uid): */
- /* 1) Query Winbind (or other local pam/nss daemon */
- /* for userid/password/logon_domain or credential */
- /* 2) Query Winbind for uid to sid to username mapping */
- /* and see if we have a matching password for existing*/
- /* session for that user perhas getting password by */
- /* adding a new pam_cifs module that stores passwords */
- /* so that the cifs vfs can get at that for all logged*/
- /* on users */
- /* 3) (Which is the mechanism we have chosen) */
- /* Search through sessions to the same server for a */
- /* a match on the uid that was passed in on mount */
- /* with the current processes uid (or euid?) and use */
- /* that smb uid. If no existing smb session for */
- /* that uid found, use the default smb session ie */
- /* the smb session for the volume mounted which is */
- /* the same as would be used if the multiuser mount */
- /* flag were disabled. */
-
- /* BB Add support for establishing new tCon and SMB Session */
- /* with userid/password pairs found on the smb session */
- /* for other target tcp/ip addresses BB */
- if (current_fsuid() != treeCon->ses->linux_uid) {
- cFYI(1, "Multiuser mode and UID "
- "did not match tcon uid");
- spin_lock(&cifs_tcp_ses_lock);
- list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
- ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
- if (ses->linux_uid == current_fsuid()) {
- if (ses->server == treeCon->ses->server) {
- cFYI(1, "found matching uid substitute right smb_uid");
- buffer->Uid = ses->Suid;
- break;
- } else {
- /* BB eventually call cifs_setup_session here */
- cFYI(1, "local UID found but no smb sess with this server exists");
- }
- }
- }
- spin_unlock(&cifs_tcp_ses_lock);
- }
- }
+ buffer->Mid = get_next_mid(treeCon->ses->server);
}
if (treeCon->Flags & SMB_SHARE_IS_IN_DFS)
buffer->Flags2 |= SMBFLG2_DFS;
if (treeCon->nocase)
buffer->Flags |= SMBFLG_CASELESS;
if ((treeCon->ses) && (treeCon->ses->server))
- if (treeCon->ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (treeCon->ses->server->sign)
buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
}
@@ -392,19 +278,12 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
}
static int
-check_smb_hdr(struct smb_hdr *smb, __u16 mid)
+check_smb_hdr(struct smb_hdr *smb)
{
/* does it have the right SMB "signature" ? */
if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
- cERROR(1, "Bad protocol string signature header 0x%x",
- *(unsigned int *)smb->Protocol);
- return 1;
- }
-
- /* Make sure that message ids match */
- if (mid != smb->Mid) {
- cERROR(1, "Mids do not match. received=%u expected=%u",
- smb->Mid, mid);
+ cifs_dbg(VFS, "Bad protocol string signature header 0x%x\n",
+ *(unsigned int *)smb->Protocol);
return 1;
}
@@ -416,24 +295,29 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
if (smb->Command == SMB_COM_LOCKING_ANDX)
return 0;
- cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
+ cifs_dbg(VFS, "Server sent request, not response. mid=%u\n",
+ get_mid(smb));
return 1;
}
int
-checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
+checkSMB(char *buf, unsigned int total_read)
{
- __u32 len = smb->smb_buf_length;
+ struct smb_hdr *smb = (struct smb_hdr *)buf;
+ __u32 rfclen = be32_to_cpu(smb->smb_buf_length);
__u32 clc_len; /* calculated length */
- cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
+ cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n",
+ total_read, rfclen);
- if (length < 2 + sizeof(struct smb_hdr)) {
- if ((length >= sizeof(struct smb_hdr) - 1)
+ /* is this frame too small to even get to a BCC? */
+ if (total_read < 2 + sizeof(struct smb_hdr)) {
+ if ((total_read >= sizeof(struct smb_hdr) - 1)
&& (smb->Status.CifsError != 0)) {
+ /* it's an error return */
smb->WordCount = 0;
/* some error cases do not return wct and bcc */
return 0;
- } else if ((length == sizeof(struct smb_hdr) + 1) &&
+ } else if ((total_read == sizeof(struct smb_hdr) + 1) &&
(smb->WordCount == 0)) {
char *tmp = (char *)smb;
/* Need to work around a bug in two servers here */
@@ -449,43 +333,40 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
tmp[sizeof(struct smb_hdr)+1] = 0;
return 0;
}
- cERROR(1, "rcvd invalid byte count (bcc)");
+ cifs_dbg(VFS, "rcvd invalid byte count (bcc)\n");
} else {
- cERROR(1, "Length less than smb header size");
+ cifs_dbg(VFS, "Length less than smb header size\n");
}
- return 1;
- }
- if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
- cERROR(1, "smb length greater than MaxBufSize, mid=%d",
- smb->Mid);
- return 1;
+ return -EIO;
}
- if (check_smb_hdr(smb, mid))
- return 1;
- clc_len = smbCalcSize_LE(smb);
+ /* otherwise, there is enough to get to the BCC */
+ if (check_smb_hdr(smb))
+ return -EIO;
+ clc_len = smbCalcSize(smb);
- if (4 + len != length) {
- cERROR(1, "Length read does not match RFC1001 length %d",
- len);
- return 1;
+ if (4 + rfclen != total_read) {
+ cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
+ rfclen);
+ return -EIO;
}
- if (4 + len != clc_len) {
+ if (4 + rfclen != clc_len) {
+ __u16 mid = get_mid(smb);
/* check if bcc wrapped around for large read responses */
- if ((len > 64 * 1024) && (len > clc_len)) {
+ if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
/* check if lengths match mod 64K */
- if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
+ if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
return 0; /* bcc wrapped */
}
- cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
- clc_len, 4 + len, smb->Mid);
-
- if (4 + len < clc_len) {
- cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
- len, smb->Mid);
- return 1;
- } else if (len > clc_len + 512) {
+ cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n",
+ clc_len, 4 + rfclen, mid);
+
+ if (4 + rfclen < clc_len) {
+ cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n",
+ rfclen, mid);
+ return -EIO;
+ } else if (rfclen > clc_len + 512) {
/*
* Some servers (Windows XP in particular) send more
* data than the lengths in the SMB packet would
@@ -495,45 +376,46 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
* trailing data, we choose limit the amount of extra
* data to 512 bytes.
*/
- cERROR(1, "RFC1001 size %u more than 512 bytes larger "
- "than SMB for mid=%u", len, smb->Mid);
- return 1;
+ cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n",
+ rfclen, mid);
+ return -EIO;
}
}
return 0;
}
bool
-is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
+is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
{
+ struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
struct list_head *tmp, *tmp1, *tmp2;
- struct cifsSesInfo *ses;
- struct cifsTconInfo *tcon;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
struct cifsInodeInfo *pCifsInode;
struct cifsFileInfo *netfile;
- cFYI(1, "Checking for oplock break or dnotify response");
+ cifs_dbg(FYI, "Checking for oplock break or dnotify response\n");
if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
(pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
struct smb_com_transaction_change_notify_rsp *pSMBr =
(struct smb_com_transaction_change_notify_rsp *)buf;
struct file_notify_information *pnotify;
__u32 data_offset = 0;
- if (pSMBr->ByteCount > sizeof(struct file_notify_information)) {
+ if (get_bcc(buf) > sizeof(struct file_notify_information)) {
data_offset = le32_to_cpu(pSMBr->DataOffset);
pnotify = (struct file_notify_information *)
((char *)&pSMBr->hdr.Protocol + data_offset);
- cFYI(1, "dnotify on %s Action: 0x%x",
+ cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n",
pnotify->FileName, pnotify->Action);
/* cifs_dump_mem("Rcvd notify Data: ",buf,
sizeof(struct smb_hdr)+60); */
return true;
}
if (pSMBr->hdr.Status.CifsError) {
- cFYI(1, "notify err 0x%d",
- pSMBr->hdr.Status.CifsError);
+ cifs_dbg(FYI, "notify err 0x%d\n",
+ pSMBr->hdr.Status.CifsError);
return true;
}
return false;
@@ -547,7 +429,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
large dirty files cached on the client */
if ((NT_STATUS_INVALID_HANDLE) ==
le32_to_cpu(pSMB->hdr.Status.CifsError)) {
- cFYI(1, "invalid handle on oplock break");
+ cifs_dbg(FYI, "invalid handle on oplock break\n");
return true;
} else if (ERRbadfid ==
le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -559,7 +441,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
if (pSMB->hdr.WordCount != 8)
return false;
- cFYI(1, "oplock type 0x%d level 0x%d",
+ cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n",
pSMB->LockType, pSMB->OplockLevel);
if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
return false;
@@ -567,34 +449,41 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &srv->smb_ses_list) {
- ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+ ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
list_for_each(tmp1, &ses->tcon_list) {
- tcon = list_entry(tmp1, struct cifsTconInfo, tcon_list);
+ tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
if (tcon->tid != buf->Tid)
continue;
- cifs_stats_inc(&tcon->num_oplock_brks);
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
spin_lock(&cifs_file_list_lock);
list_for_each(tmp2, &tcon->openFileList) {
netfile = list_entry(tmp2, struct cifsFileInfo,
tlist);
- if (pSMB->Fid != netfile->netfid)
+ if (pSMB->Fid != netfile->fid.netfid)
continue;
- cFYI(1, "file id match, oplock break");
+ cifs_dbg(FYI, "file id match, oplock break\n");
pCifsInode = CIFS_I(netfile->dentry->d_inode);
- cifs_set_oplock_level(pCifsInode,
- pSMB->OplockLevel ? OPLOCK_READ : 0);
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+ &pCifsInode->flags);
+
/*
- * cifs_oplock_break_put() can't be called
- * from here. Get reference after queueing
- * succeeded. cifs_oplock_break() will
- * synchronize using cifs_file_list_lock.
+ * Set flag if the server downgrades the oplock
+ * to L2 else clear.
*/
- if (queue_work(system_nrt_wq,
- &netfile->oplock_break))
- cifs_oplock_break_get(netfile);
+ if (pSMB->OplockLevel)
+ set_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &pCifsInode->flags);
+ else
+ clear_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &pCifsInode->flags);
+
+ queue_work(cifsiod_wq,
+ &netfile->oplock_break);
netfile->oplock_break_cancelled = false;
spin_unlock(&cifs_file_list_lock);
@@ -603,26 +492,25 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
}
spin_unlock(&cifs_file_list_lock);
spin_unlock(&cifs_tcp_ses_lock);
- cFYI(1, "No matching file for oplock break");
+ cifs_dbg(FYI, "No matching file for oplock break\n");
return true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
- cFYI(1, "Can not process oplock break for non-existent connection");
+ cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
return true;
}
void
-dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
+dump_smb(void *buf, int smb_buf_length)
{
int i, j;
char debug_line[17];
- unsigned char *buffer;
+ unsigned char *buffer = buf;
if (traceSMB == 0)
return;
- buffer = (unsigned char *) smb_buf;
for (i = 0, j = 0; i < smb_buf_length; i++, j++) {
if (i % 8 == 0) {
/* have reached the beginning of line */
@@ -656,12 +544,8 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
{
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
- cERROR(1, "Autodisabling the use of server inode numbers on "
- "%s. This server doesn't seem to support them "
- "properly. Hardlinks will not be recognized on this "
- "mount. Consider mounting with the \"noserverino\" "
- "option to silence this message.",
- cifs_sb_master_tcon(cifs_sb)->treeName);
+ cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n",
+ cifs_sb_master_tcon(cifs_sb)->treeName);
}
}
@@ -670,17 +554,114 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
oplock &= 0xF;
if (oplock == OPLOCK_EXCLUSIVE) {
- cinode->clientCanCacheAll = true;
- cinode->clientCanCacheRead = true;
- cFYI(1, "Exclusive Oplock granted on inode %p",
- &cinode->vfs_inode);
+ cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG;
+ cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
+ &cinode->vfs_inode);
} else if (oplock == OPLOCK_READ) {
- cinode->clientCanCacheAll = false;
- cinode->clientCanCacheRead = true;
- cFYI(1, "Level II Oplock granted on inode %p",
- &cinode->vfs_inode);
- } else {
- cinode->clientCanCacheAll = false;
- cinode->clientCanCacheRead = false;
+ cinode->oplock = CIFS_CACHE_READ_FLG;
+ cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
+ &cinode->vfs_inode);
+ } else
+ cinode->oplock = 0;
+}
+
+static int
+cifs_oplock_break_wait(void *unused)
+{
+ schedule();
+ return signal_pending(current) ? -ERESTARTSYS : 0;
+}
+
+/*
+ * We wait for oplock breaks to be processed before we attempt to perform
+ * writes.
+ */
+int cifs_get_writer(struct cifsInodeInfo *cinode)
+{
+ int rc;
+
+start:
+ rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
+ cifs_oplock_break_wait, TASK_KILLABLE);
+ if (rc)
+ return rc;
+
+ spin_lock(&cinode->writers_lock);
+ if (!cinode->writers)
+ set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ cinode->writers++;
+ /* Check to see if we have started servicing an oplock break */
+ if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) {
+ cinode->writers--;
+ if (cinode->writers == 0) {
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+ }
+ spin_unlock(&cinode->writers_lock);
+ goto start;
+ }
+ spin_unlock(&cinode->writers_lock);
+ return 0;
+}
+
+void cifs_put_writer(struct cifsInodeInfo *cinode)
+{
+ spin_lock(&cinode->writers_lock);
+ cinode->writers--;
+ if (cinode->writers == 0) {
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+ }
+ spin_unlock(&cinode->writers_lock);
+}
+
+void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
+{
+ clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK);
+}
+
+bool
+backup_cred(struct cifs_sb_info *cifs_sb)
+{
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
+ if (uid_eq(cifs_sb->mnt_backupuid, current_fsuid()))
+ return true;
+ }
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
+ if (in_group_p(cifs_sb->mnt_backupgid))
+ return true;
}
+
+ return false;
+}
+
+void
+cifs_del_pending_open(struct cifs_pending_open *open)
+{
+ spin_lock(&cifs_file_list_lock);
+ list_del(&open->olist);
+ spin_unlock(&cifs_file_list_lock);
+}
+
+void
+cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink,
+ struct cifs_pending_open *open)
+{
+#ifdef CONFIG_CIFS_SMB2
+ memcpy(open->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE);
+#endif
+ open->oplock = CIFS_OPLOCK_NO_CHANGE;
+ open->tlink = tlink;
+ fid->pending_open = open;
+ list_add_tail(&open->olist, &tlink_tcon(tlink)->pending_opens);
+}
+
+void
+cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink,
+ struct cifs_pending_open *open)
+{
+ spin_lock(&cifs_file_list_lock);
+ cifs_add_pending_open_locked(fid, tlink, open);
+ spin_unlock(&cifs_file_list_lock);
}
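
For context, the cifs_get_writer()/cifs_put_writer() helpers added above are meant to bracket any path that dirties file data, so the oplock break worker can use the CIFS_INODE_PENDING_WRITERS bit to wait for in-flight writers before acknowledging the break. A minimal usage sketch, with do_send_write() as a hypothetical stand-in for the real write-out path:

static ssize_t example_write(struct cifsInodeInfo *cinode, const char *buf,
			     size_t len)
{
	ssize_t rc;

	rc = cifs_get_writer(cinode);	/* blocks while an oplock break is pending */
	if (rc)
		return rc;		/* -ERESTARTSYS if the wait was interrupted */

	rc = do_send_write(cinode, buf, len);	/* hypothetical I/O helper */

	cifs_put_writer(cinode);	/* last writer wakes the pending break worker */
	return rc;
}
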
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 79f641eeda3..6834b9c3bec 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -51,7 +51,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
{ERRnoaccess, -EACCES},
{ERRbadfid, -EBADF},
{ERRbadmcb, -EIO},
- {ERRnomem, -ENOMEM},
+ {ERRnomem, -EREMOTEIO},
{ERRbadmem, -EFAULT},
{ERRbadenv, -EFAULT},
{ERRbadformat, -EINVAL},
@@ -62,7 +62,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
{ERRdiffdevice, -EXDEV},
{ERRnofiles, -ENOENT},
{ERRwriteprot, -EROFS},
- {ERRbadshare, -ETXTBSY},
+ {ERRbadshare, -EBUSY},
{ERRlock, -EACCES},
{ERRunsup, -EINVAL},
{ERRnosuchshare, -ENXIO},
@@ -110,7 +110,7 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
{ERRnoroom, -ENOSPC},
{ERRrmuns, -EUSERS},
{ERRtimeout, -ETIME},
- {ERRnoresource, -ENOBUFS},
+ {ERRnoresource, -EREMOTEIO},
{ERRtoomanyuids, -EUSERS},
{ERRbaduid, -EACCES},
{ERRusempx, -EIO},
@@ -150,8 +150,8 @@ cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
else if (address_family == AF_INET6)
ret = in6_pton(cp, len, dst , '\\', NULL);
- cFYI(DBG2, "address conversion returned %d for %*.*s",
- ret, len, len, cp);
+ cifs_dbg(NOISY, "address conversion returned %d for %*.*s\n",
+ ret, len, len, cp);
if (ret > 0)
ret = 1;
return ret;
@@ -197,15 +197,14 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
memcpy(scope_id, pct + 1, slen);
scope_id[slen] = '\0';
- rc = strict_strtoul(scope_id, 0,
- (unsigned long *)&s6->sin6_scope_id);
+ rc = kstrtouint(scope_id, 0, &s6->sin6_scope_id);
rc = (rc == 0) ? 1 : 0;
}
return rc;
}
-int
+void
cifs_set_port(struct sockaddr *addr, const unsigned short int port)
{
switch (addr->sa_family) {
@@ -215,19 +214,7 @@ cifs_set_port(struct sockaddr *addr, const unsigned short int port)
case AF_INET6:
((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
break;
- default:
- return 0;
}
- return 1;
-}
-
-int
-cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
- const unsigned short int port)
-{
- if (!cifs_convert_address(dst, src, len))
- return 0;
- return cifs_set_port(dst, port);
}
/*****************************************************************************
@@ -413,7 +400,7 @@ static const struct {
from NT_STATUS_INSUFFICIENT_RESOURCES to
NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */
{
- ERRDOS, ERRnomem, NT_STATUS_INSUFFICIENT_RESOURCES}, {
+ ERRDOS, ERRnoresource, NT_STATUS_INSUFFICIENT_RESOURCES}, {
ERRDOS, ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, {
ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, {
ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, {
@@ -683,7 +670,7 @@ static const struct {
ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, {
ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, {
ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, {
- ERRDOS, ERRnomem, NT_STATUS_INSUFF_SERVER_RESOURCES}, {
+ ERRDOS, ERRnoresource, NT_STATUS_INSUFF_SERVER_RESOURCES}, {
ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, {
ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, {
ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, {
@@ -793,7 +780,9 @@ static const struct {
ERRDOS, ERRnoaccess, 0xc0000290}, {
ERRDOS, ERRbadfunc, 0xc000029c}, {
ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, {
- ERRDOS, ERRinvlevel, 0x007c0001}, };
+ ERRDOS, ERRinvlevel, 0x007c0001}, {
+ 0, 0, 0 }
+};
/*****************************************************************************
Print an error message from the status code
@@ -806,8 +795,8 @@ cifs_print_status(__u32 status_code)
while (nt_errs[idx].nt_errstr != NULL) {
if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) ==
(status_code & 0xFFFFFF)) {
- printk(KERN_NOTICE "Status code returned 0x%08x %s\n",
- status_code, nt_errs[idx].nt_errstr);
+ pr_notice("Status code returned 0x%08x %s\n",
+ status_code, nt_errs[idx].nt_errstr);
}
idx++;
}
@@ -836,8 +825,9 @@ ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode)
}
int
-map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
+map_smb_to_linux_error(char *buf, bool logErr)
{
+ struct smb_hdr *smb = (struct smb_hdr *)buf;
unsigned int i;
int rc = -EIO; /* if transport error smb error may not be set */
__u8 smberrclass;
@@ -899,7 +889,7 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
}
/* else ERRHRD class errors or junk - return EIO */
- cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
+ cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n",
le32_to_cpu(smb->Status.CifsError), rc);
/* generic corrective action e.g. reconnect SMB session on
@@ -913,19 +903,13 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
* portion, the number of word parameters and the data portion of the message
*/
unsigned int
-smbCalcSize(struct smb_hdr *ptr)
+smbCalcSize(void *buf)
{
+ struct smb_hdr *ptr = (struct smb_hdr *)buf;
return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
2 /* size of the bcc field */ + get_bcc(ptr));
}
-unsigned int
-smbCalcSize_LE(struct smb_hdr *ptr)
-{
- return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
- 2 /* size of the bcc field */ + get_bcc_le(ptr));
-}
-
/* The following are taken from fs/ntfs/util.c */
#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
@@ -957,8 +941,9 @@ cifs_UnixTimeToNT(struct timespec t)
return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET;
}
-static int total_days_of_prev_months[] =
-{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
+static const int total_days_of_prev_months[] = {
+ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
+};
struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
{
@@ -969,20 +954,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
SMB_TIME *st = (SMB_TIME *)&time;
SMB_DATE *sd = (SMB_DATE *)&date;
- cFYI(1, "date %d time %d", date, time);
+ cifs_dbg(FYI, "date %d time %d\n", date, time);
sec = 2 * st->TwoSeconds;
min = st->Minutes;
if ((sec > 59) || (min > 59))
- cERROR(1, "illegal time min %d sec %d", min, sec);
+ cifs_dbg(VFS, "illegal time min %d sec %d\n", min, sec);
sec += (min * 60);
sec += 60 * 60 * st->Hours;
if (st->Hours > 24)
- cERROR(1, "illegal hours %d", st->Hours);
+ cifs_dbg(VFS, "illegal hours %d\n", st->Hours);
days = sd->Day;
month = sd->Month;
if ((days > 31) || (month > 12)) {
- cERROR(1, "illegal date, month %d day: %d", month, days);
+ cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days);
if (month > 12)
month = 12;
}
@@ -1008,7 +993,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
ts.tv_sec = sec + offset;
- /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
+ /* cifs_dbg(FYI, "sec after cnvrt dos to unix time %d\n",sec); */
ts.tv_nsec = 0;
return ts;
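
For context, the cifs_UnixTimeToNT()/NTFS_TIME_OFFSET arithmetic in the hunk above converts between Unix time (seconds since 1970) and NT/FILETIME (100-nanosecond ticks since 1601); a standalone sketch of the same conversion, runnable outside the kernel:

#include <stdio.h>
#include <stdint.h>

/* (369*365 + 89) days between 1601-01-01 and 1970-01-01, in 100ns ticks */
#define NTFS_TIME_OFFSET ((uint64_t)(369*365 + 89) * 24 * 3600 * 10000000ULL)

static uint64_t unix_to_nt(uint64_t sec, uint64_t nsec)
{
	return sec * 10000000ULL + nsec / 100 + NTFS_TIME_OFFSET;
}

static uint64_t nt_to_unix_sec(uint64_t nt)
{
	return (nt - NTFS_TIME_OFFSET) / 10000000ULL;
}

int main(void)
{
	uint64_t nt = unix_to_nt(0, 0);	/* the Unix epoch expressed as NT time */

	printf("epoch as NT time: %llu\n", (unsigned long long)nt);
	printf("round trip (should be 0): %llu\n",
	       (unsigned long long)nt_to_unix_sec(nt));
	return 0;
}
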
diff --git a/fs/cifs/nterr.c b/fs/cifs/nterr.c
index 819fd994b12..b6023c64612 100644
--- a/fs/cifs/nterr.c
+++ b/fs/cifs/nterr.c
@@ -31,7 +31,7 @@ const struct nt_err_code_struct nt_errs[] = {
{"NT_STATUS_INVALID_INFO_CLASS", NT_STATUS_INVALID_INFO_CLASS},
{"NT_STATUS_INFO_LENGTH_MISMATCH", NT_STATUS_INFO_LENGTH_MISMATCH},
{"NT_STATUS_ACCESS_VIOLATION", NT_STATUS_ACCESS_VIOLATION},
- {"STATUS_BUFFER_OVERFLOW", STATUS_BUFFER_OVERFLOW},
+ {"NT_STATUS_BUFFER_OVERFLOW", NT_STATUS_BUFFER_OVERFLOW},
{"NT_STATUS_IN_PAGE_ERROR", NT_STATUS_IN_PAGE_ERROR},
{"NT_STATUS_PAGEFILE_QUOTA", NT_STATUS_PAGEFILE_QUOTA},
{"NT_STATUS_INVALID_HANDLE", NT_STATUS_INVALID_HANDLE},
@@ -681,7 +681,7 @@ const struct nt_err_code_struct nt_errs[] = {
NT_STATUS_QUOTA_LIST_INCONSISTENT},
{"NT_STATUS_FILE_IS_OFFLINE", NT_STATUS_FILE_IS_OFFLINE},
{"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES},
- {"STATUS_MORE_ENTRIES", STATUS_MORE_ENTRIES},
- {"STATUS_SOME_UNMAPPED", STATUS_SOME_UNMAPPED},
+ {"NT_STATUS_MORE_ENTRIES", NT_STATUS_MORE_ENTRIES},
+ {"NT_STATUS_SOME_UNMAPPED", NT_STATUS_SOME_UNMAPPED},
{NULL, 0}
};
diff --git a/fs/cifs/nterr.h b/fs/cifs/nterr.h
index 257267367d4..7a0eae5ae7c 100644
--- a/fs/cifs/nterr.h
+++ b/fs/cifs/nterr.h
@@ -35,18 +35,20 @@ struct nt_err_code_struct {
extern const struct nt_err_code_struct nt_errs[];
/* Win32 Status codes. */
-#define STATUS_MORE_ENTRIES 0x0105
-#define ERROR_INVALID_PARAMETER 0x0057
-#define ERROR_INSUFFICIENT_BUFFER 0x007a
-#define STATUS_1804 0x070c
-#define STATUS_NOTIFY_ENUM_DIR 0x010c
+#define NT_STATUS_MORE_ENTRIES 0x0105
+#define NT_ERROR_INVALID_PARAMETER 0x0057
+#define NT_ERROR_INSUFFICIENT_BUFFER 0x007a
+#define NT_STATUS_1804 0x070c
+#define NT_STATUS_NOTIFY_ENUM_DIR 0x010c
-/* Win32 Error codes extracted using a loop in smbclient then printing a
- netmon sniff to a file. */
+/*
+ * Win32 Error codes extracted using a loop in smbclient then printing a netmon
+ * sniff to a file.
+ */
-#define NT_STATUS_OK 0x0000
-#define STATUS_SOME_UNMAPPED 0x0107
-#define STATUS_BUFFER_OVERFLOW 0x80000005
+#define NT_STATUS_OK 0x0000
+#define NT_STATUS_SOME_UNMAPPED 0x0107
+#define NT_STATUS_BUFFER_OVERFLOW 0x80000005
#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a
#define NT_STATUS_MEDIA_CHANGED 0x8000001c
#define NT_STATUS_END_OF_MEDIA 0x8000001e
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 5d52e4a3b1e..848249fa120 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -126,3 +126,13 @@ typedef struct _AUTHENTICATE_MESSAGE {
do not set the version is present flag */
char UserString[0];
} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE;
+
+/*
+ * Size of the session key (crypto key encrypted with the password)
+ */
+
+int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses);
+void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses);
+int build_ntlmssp_auth_blob(unsigned char *pbuffer, u16 *buflen,
+ struct cifs_ses *ses,
+ const struct nls_table *nls_cp);
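
For context, the three prototypes declared above cover the usual three-leg NTLMSSP exchange driven from sess.c. A rough ordering sketch, with SMB framing, buffer sizing and most error handling omitted; challenge/challenge_len stand for the server's type-2 response blob:

static int example_ntlmssp_exchange(struct cifs_ses *ses, unsigned char *buf,
				    char *challenge, int challenge_len,
				    const struct nls_table *nls_cp)
{
	u16 buflen = 0;
	int rc;

	build_ntlmssp_negotiate_blob(buf, ses);		/* type 1: negotiate */

	rc = decode_ntlmssp_challenge(challenge,	/* type 2: challenge */
				      challenge_len, ses);
	if (rc)
		return rc;

	return build_ntlmssp_auth_blob(buf, &buflen,	/* type 3: authenticate */
				       ses, nls_cp);
}
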
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f8e4cd2a791..b15862e0f68 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -4,6 +4,7 @@
* Directory search handling
*
* Copyright (C) International Business Machines Corp., 2004, 2008
+ * Copyright (C) Red Hat, Inc., 2011
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -47,15 +48,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
if (file) {
cf = file->private_data;
if (cf == NULL) {
- cFYI(1, "empty cifs private file data");
+ cifs_dbg(FYI, "empty cifs private file data\n");
return;
}
if (cf->invalidHandle)
- cFYI(1, "invalid handle");
+ cifs_dbg(FYI, "invalid handle\n");
if (cf->srch_inf.endOfSearch)
- cFYI(1, "end of search");
+ cifs_dbg(FYI, "end of search\n");
if (cf->srch_inf.emptyDir)
- cFYI(1, "empty dir");
+ cifs_dbg(FYI, "empty dir\n");
}
}
#else
@@ -65,52 +66,72 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
#endif /* DEBUG2 */
/*
+ * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
+ *
* Find the dentry that matches "name". If there isn't one, create one. If it's
* a negative dentry or the uniqueid changed, then drop it and recreate it.
*/
-static struct dentry *
-cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
+static void
+cifs_prime_dcache(struct dentry *parent, struct qstr *name,
struct cifs_fattr *fattr)
{
struct dentry *dentry, *alias;
struct inode *inode;
struct super_block *sb = parent->d_inode->i_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- cFYI(1, "For %s", name->name);
+ cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
- if (parent->d_op && parent->d_op->d_hash)
- parent->d_op->d_hash(parent, parent->d_inode, name);
- else
- name->hash = full_name_hash(name->name, name->len);
+ dentry = d_hash_and_lookup(parent, name);
+ if (unlikely(IS_ERR(dentry)))
+ return;
- dentry = d_lookup(parent, name);
if (dentry) {
- /* FIXME: check for inode number changes? */
- if (dentry->d_inode != NULL)
- return dentry;
- d_drop(dentry);
+ int err;
+
+ inode = dentry->d_inode;
+ if (inode) {
+ /*
+ * If we're generating inode numbers, then we don't
+ * want to clobber the existing one with the one that
+ * the readdir code created.
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
+ fattr->cf_uniqueid = CIFS_I(inode)->uniqueid;
+
+ /* update inode in place if i_ino didn't change */
+ if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
+ cifs_fattr_to_inode(inode, fattr);
+ goto out;
+ }
+ }
+ err = d_invalidate(dentry);
dput(dentry);
+ if (err)
+ return;
}
+ /*
+ * If we know that the inode will need to be revalidated immediately,
+ * then don't create a new dentry for it. We'll end up doing an on
+ * the wire call either way and this spares us an invalidation.
+ */
+ if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+ return;
+
dentry = d_alloc(parent, name);
- if (dentry == NULL)
- return NULL;
+ if (!dentry)
+ return;
inode = cifs_iget(sb, fattr);
- if (!inode) {
- dput(dentry);
- return NULL;
- }
+ if (!inode)
+ goto out;
alias = d_materialise_unique(dentry, inode);
- if (alias != NULL) {
- dput(dentry);
- if (IS_ERR(alias))
- return NULL;
- dentry = alias;
- }
-
- return dentry;
+ if (alias && !IS_ERR(alias))
+ dput(alias);
+out:
+ dput(dentry);
}
static void
@@ -127,9 +148,30 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
fattr->cf_dtype = DT_REG;
}
+ /*
+ * We need to revalidate it further to make a decision about whether it
+ * is a symbolic link, DFS referral or a reparse point with a direct
+ * access like junctions, deduplicated files, NFS symlinks.
+ */
+ if (fattr->cf_cifsattrs & ATTR_REPARSE)
+ fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
+ /* non-unix readdir doesn't provide nlink */
+ fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+
if (fattr->cf_cifsattrs & ATTR_READONLY)
fattr->cf_mode &= ~S_IWUGO;
+ /*
+ * We of course don't get ACL info in FIND_FIRST/NEXT results, so
+ * mark it for revalidation so that "ls -l" will look right. It might
+ * be super-slow, but if we don't do this then the ownership of files
+ * may look wrong since the inodes may not have timed out by the time
+ * "ls" does a stat() call on them.
+ */
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+ fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
fattr->cf_cifsattrs & ATTR_SYSTEM) {
if (fattr->cf_eof == 0) {
@@ -147,7 +189,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
}
}
-static void
+void
cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
struct cifs_sb_info *cifs_sb)
{
@@ -189,13 +231,13 @@ cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
we try to do FindFirst on (NTFS) directory symlinks */
/*
int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
- int xid)
+ unsigned int xid)
{
__u16 fid;
int len;
int oplock = 0;
int rc;
- struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
+ struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb);
char *tmpbuffer;
rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -210,20 +252,23 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
fid,
cifs_sb->local_nls);
if (CIFSSMBClose(xid, ptcon, fid)) {
- cFYI(1, "Error closing temporary reparsepoint open");
+ cifs_dbg(FYI, "Error closing temporary reparsepoint open\n");
}
}
}
*/
-static int initiate_cifs_search(const int xid, struct file *file)
+static int
+initiate_cifs_search(const unsigned int xid, struct file *file)
{
+ __u16 search_flags;
int rc = 0;
char *full_path = NULL;
struct cifsFileInfo *cifsFile;
struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
struct tcon_link *tlink = NULL;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
if (file->private_data == NULL) {
tlink = cifs_sb_tlink(cifs_sb);
@@ -237,10 +282,17 @@ static int initiate_cifs_search(const int xid, struct file *file)
}
file->private_data = cifsFile;
cifsFile->tlink = cifs_get_tlink(tlink);
- pTcon = tlink_tcon(tlink);
+ tcon = tlink_tcon(tlink);
} else {
cifsFile = file->private_data;
- pTcon = tlink_tcon(cifsFile->tlink);
+ tcon = tlink_tcon(cifsFile->tlink);
+ }
+
+ server = tcon->ses->server;
+
+ if (!server->ops->query_dir_first) {
+ rc = -ENOSYS;
+ goto error_exit;
}
cifsFile->invalidHandle = true;
@@ -252,16 +304,16 @@ static int initiate_cifs_search(const int xid, struct file *file)
goto error_exit;
}
- cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
+ cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos);
ffirst_retry:
/* test for Unix extensions */
/* but now check for them on the share/mount not on the SMB session */
-/* if (pTcon->ses->capabilities & CAP_UNIX) { */
- if (pTcon->unix_ext)
+ /* if (cap_unix(tcon->ses) { */
+ if (tcon->unix_ext)
cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX;
- else if ((pTcon->ses->capabilities &
- (CAP_NT_SMBS | CAP_NT_FIND)) == 0) {
+ else if ((tcon->ses->capabilities &
+ tcon->ses->server->vals->cap_nt_find) == 0) {
cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD;
} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
@@ -269,10 +321,14 @@ ffirst_retry:
cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO;
}
- rc = CIFSFindFirst(xid, pTcon, full_path, cifs_sb->local_nls,
- &cifsFile->netfid, &cifsFile->srch_inf,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb));
+ search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
+ if (backup_cred(cifs_sb))
+ search_flags |= CIFS_SEARCH_BACKUP_SEARCH;
+
+ rc = server->ops->query_dir_first(xid, tcon, full_path, cifs_sb,
+ &cifsFile->fid, search_flags,
+ &cifsFile->srch_inf);
+
if (rc == 0)
cifsFile->invalidHandle = false;
/* BB add following call to handle readdir on new NTFS symlink errors
@@ -290,16 +346,16 @@ error_exit:
}
/* return length of unicode string in bytes */
-static int cifs_unicode_bytelen(char *str)
+static int cifs_unicode_bytelen(const char *str)
{
int len;
- __le16 *ustr = (__le16 *)str;
+ const __le16 *ustr = (const __le16 *)str;
for (len = 0; len <= PATH_MAX; len++) {
if (ustr[len] == 0)
return len << 1;
}
- cFYI(1, "Unicode string longer than PATH_MAX found");
+ cifs_dbg(FYI, "Unicode string longer than PATH_MAX found\n");
return len << 1;
}
@@ -316,96 +372,146 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
pfData->FileNameLength;
} else
new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
- cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
+ cifs_dbg(FYI, "new entry %p old entry %p\n", new_entry, old_entry);
/* validate that new_entry is not past end of SMB */
if (new_entry >= end_of_smb) {
- cERROR(1, "search entry %p began after end of SMB %p old entry %p",
- new_entry, end_of_smb, old_entry);
+ cifs_dbg(VFS, "search entry %p began after end of SMB %p old entry %p\n",
+ new_entry, end_of_smb, old_entry);
return NULL;
} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
(new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
|| ((level != SMB_FIND_FILE_INFO_STANDARD) &&
(new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
- cERROR(1, "search entry %p extends after end of SMB %p",
- new_entry, end_of_smb);
+ cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n",
+ new_entry, end_of_smb);
return NULL;
} else
return new_entry;
}
+struct cifs_dirent {
+ const char *name;
+ size_t namelen;
+ u32 resume_key;
+ u64 ino;
+};
+
+static void cifs_fill_dirent_unix(struct cifs_dirent *de,
+ const FILE_UNIX_INFO *info, bool is_unicode)
+{
+ de->name = &info->FileName[0];
+ if (is_unicode)
+ de->namelen = cifs_unicode_bytelen(de->name);
+ else
+ de->namelen = strnlen(de->name, PATH_MAX);
+ de->resume_key = info->ResumeKey;
+ de->ino = le64_to_cpu(info->basic.UniqueId);
+}
+
+static void cifs_fill_dirent_dir(struct cifs_dirent *de,
+ const FILE_DIRECTORY_INFO *info)
+{
+ de->name = &info->FileName[0];
+ de->namelen = le32_to_cpu(info->FileNameLength);
+ de->resume_key = info->FileIndex;
+}
+
+static void cifs_fill_dirent_full(struct cifs_dirent *de,
+ const FILE_FULL_DIRECTORY_INFO *info)
+{
+ de->name = &info->FileName[0];
+ de->namelen = le32_to_cpu(info->FileNameLength);
+ de->resume_key = info->FileIndex;
+}
+
+static void cifs_fill_dirent_search(struct cifs_dirent *de,
+ const SEARCH_ID_FULL_DIR_INFO *info)
+{
+ de->name = &info->FileName[0];
+ de->namelen = le32_to_cpu(info->FileNameLength);
+ de->resume_key = info->FileIndex;
+ de->ino = le64_to_cpu(info->UniqueId);
+}
+
+static void cifs_fill_dirent_both(struct cifs_dirent *de,
+ const FILE_BOTH_DIRECTORY_INFO *info)
+{
+ de->name = &info->FileName[0];
+ de->namelen = le32_to_cpu(info->FileNameLength);
+ de->resume_key = info->FileIndex;
+}
+
+static void cifs_fill_dirent_std(struct cifs_dirent *de,
+ const FIND_FILE_STANDARD_INFO *info)
+{
+ de->name = &info->FileName[0];
+ /* one byte length, no endianness conversion */
+ de->namelen = info->FileNameLength;
+ de->resume_key = info->ResumeKey;
+}
+
+static int cifs_fill_dirent(struct cifs_dirent *de, const void *info,
+ u16 level, bool is_unicode)
+{
+ memset(de, 0, sizeof(*de));
+
+ switch (level) {
+ case SMB_FIND_FILE_UNIX:
+ cifs_fill_dirent_unix(de, info, is_unicode);
+ break;
+ case SMB_FIND_FILE_DIRECTORY_INFO:
+ cifs_fill_dirent_dir(de, info);
+ break;
+ case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
+ cifs_fill_dirent_full(de, info);
+ break;
+ case SMB_FIND_FILE_ID_FULL_DIR_INFO:
+ cifs_fill_dirent_search(de, info);
+ break;
+ case SMB_FIND_FILE_BOTH_DIRECTORY_INFO:
+ cifs_fill_dirent_both(de, info);
+ break;
+ case SMB_FIND_FILE_INFO_STANDARD:
+ cifs_fill_dirent_std(de, info);
+ break;
+ default:
+ cifs_dbg(FYI, "Unknown findfirst level %d\n", level);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
#define UNICODE_DOT cpu_to_le16(0x2e)
/* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */
-static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
+static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
{
int rc = 0;
- char *filename = NULL;
- int len = 0;
-
- if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
- FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- if (cfile->srch_inf.unicode) {
- len = cifs_unicode_bytelen(filename);
- } else {
- /* BB should we make this strnlen of PATH_MAX? */
- len = strnlen(filename, 5);
- }
- } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
- FILE_DIRECTORY_INFO *pFindData =
- (FILE_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- } else if (cfile->srch_inf.info_level ==
- SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
- FILE_FULL_DIRECTORY_INFO *pFindData =
- (FILE_FULL_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- } else if (cfile->srch_inf.info_level ==
- SMB_FIND_FILE_ID_FULL_DIR_INFO) {
- SEARCH_ID_FULL_DIR_INFO *pFindData =
- (SEARCH_ID_FULL_DIR_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- } else if (cfile->srch_inf.info_level ==
- SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
- FILE_BOTH_DIRECTORY_INFO *pFindData =
- (FILE_BOTH_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
- FIND_FILE_STANDARD_INFO *pFindData =
- (FIND_FILE_STANDARD_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = pFindData->FileNameLength;
- } else {
- cFYI(1, "Unknown findfirst level %d",
- cfile->srch_inf.info_level);
- }
- if (filename) {
- if (cfile->srch_inf.unicode) {
- __le16 *ufilename = (__le16 *)filename;
- if (len == 2) {
- /* check for . */
- if (ufilename[0] == UNICODE_DOT)
- rc = 1;
- } else if (len == 4) {
- /* check for .. */
- if ((ufilename[0] == UNICODE_DOT)
- && (ufilename[1] == UNICODE_DOT))
- rc = 2;
- }
- } else /* ASCII */ {
- if (len == 1) {
- if (filename[0] == '.')
- rc = 1;
- } else if (len == 2) {
- if ((filename[0] == '.') && (filename[1] == '.'))
- rc = 2;
- }
+ if (!de->name)
+ return 0;
+
+ if (is_unicode) {
+ __le16 *ufilename = (__le16 *)de->name;
+ if (de->namelen == 2) {
+ /* check for . */
+ if (ufilename[0] == UNICODE_DOT)
+ rc = 1;
+ } else if (de->namelen == 4) {
+ /* check for .. */
+ if (ufilename[0] == UNICODE_DOT &&
+ ufilename[1] == UNICODE_DOT)
+ rc = 2;
+ }
+ } else /* ASCII */ {
+ if (de->namelen == 1) {
+ if (de->name[0] == '.')
+ rc = 1;
+ } else if (de->namelen == 2) {
+ if (de->name[0] == '.' && de->name[1] == '.')
+ rc = 2;
}
}
@@ -416,7 +522,7 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
whether we can use the cached search results from the previous search */
static int is_dir_changed(struct file *file)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
if (cifsInfo->time == 0)
@@ -427,321 +533,223 @@ static int is_dir_changed(struct file *file)
}
static int cifs_save_resume_key(const char *current_entry,
- struct cifsFileInfo *cifsFile)
+ struct cifsFileInfo *file_info)
{
- int rc = 0;
- unsigned int len = 0;
- __u16 level;
- char *filename;
-
- if ((cifsFile == NULL) || (current_entry == NULL))
- return -EINVAL;
-
- level = cifsFile->srch_inf.info_level;
-
- if (level == SMB_FIND_FILE_UNIX) {
- FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
+ struct cifs_dirent de;
+ int rc;
- filename = &pFindData->FileName[0];
- if (cifsFile->srch_inf.unicode) {
- len = cifs_unicode_bytelen(filename);
- } else {
- /* BB should we make this strnlen of PATH_MAX? */
- len = strnlen(filename, PATH_MAX);
- }
- cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
- } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
- FILE_DIRECTORY_INFO *pFindData =
- (FILE_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- cifsFile->srch_inf.resume_key = pFindData->FileIndex;
- } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
- FILE_FULL_DIRECTORY_INFO *pFindData =
- (FILE_FULL_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- cifsFile->srch_inf.resume_key = pFindData->FileIndex;
- } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
- SEARCH_ID_FULL_DIR_INFO *pFindData =
- (SEARCH_ID_FULL_DIR_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- cifsFile->srch_inf.resume_key = pFindData->FileIndex;
- } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
- FILE_BOTH_DIRECTORY_INFO *pFindData =
- (FILE_BOTH_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- cifsFile->srch_inf.resume_key = pFindData->FileIndex;
- } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
- FIND_FILE_STANDARD_INFO *pFindData =
- (FIND_FILE_STANDARD_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- /* one byte length, no name conversion */
- len = (unsigned int)pFindData->FileNameLength;
- cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
- } else {
- cFYI(1, "Unknown findfirst level %d", level);
- return -EINVAL;
+ rc = cifs_fill_dirent(&de, current_entry, file_info->srch_inf.info_level,
+ file_info->srch_inf.unicode);
+ if (!rc) {
+ file_info->srch_inf.presume_name = de.name;
+ file_info->srch_inf.resume_name_len = de.namelen;
+ file_info->srch_inf.resume_key = de.resume_key;
}
- cifsFile->srch_inf.resume_name_len = len;
- cifsFile->srch_inf.presume_name = filename;
return rc;
}
-/* find the corresponding entry in the search */
-/* Note that the SMB server returns search entries for . and .. which
- complicates logic here if we choose to parse for them and we do not
- assume that they are located in the findfirst return buffer.*/
-/* We start counting in the buffer with entry 2 and increment for every
- entry (do not increment for . or .. entry) */
-static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
- struct file *file, char **ppCurrentEntry, int *num_to_ret)
+/*
+ * Find the corresponding entry in the search. Note that the SMB server returns
+ * search entries for . and .. which complicates logic here if we choose to
+ * parse for them and we do not assume that they are located in the findfirst
+ * return buffer. We start counting in the buffer with entry 2 and increment for
+ * every entry (do not increment for . or .. entry).
+ */
+static int
+find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
+ struct file *file, char **current_entry, int *num_to_ret)
{
+ __u16 search_flags;
int rc = 0;
int pos_in_buf = 0;
loff_t first_entry_in_buffer;
- loff_t index_to_find = file->f_pos;
- struct cifsFileInfo *cifsFile = file->private_data;
+ loff_t index_to_find = pos;
+ struct cifsFileInfo *cfile = file->private_data;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ struct TCP_Server_Info *server = tcon->ses->server;
/* check if index in the buffer */
- if ((cifsFile == NULL) || (ppCurrentEntry == NULL) ||
- (num_to_ret == NULL))
+ if (!server->ops->query_dir_first || !server->ops->query_dir_next)
+ return -ENOSYS;
+
+ if ((cfile == NULL) || (current_entry == NULL) || (num_to_ret == NULL))
return -ENOENT;
- *ppCurrentEntry = NULL;
- first_entry_in_buffer =
- cifsFile->srch_inf.index_of_last_entry -
- cifsFile->srch_inf.entries_in_buffer;
+ *current_entry = NULL;
+ first_entry_in_buffer = cfile->srch_inf.index_of_last_entry -
+ cfile->srch_inf.entries_in_buffer;
- /* if first entry in buf is zero then is first buffer
- in search response data which means it is likely . and ..
- will be in this buffer, although some servers do not return
- . and .. for the root of a drive and for those we need
- to start two entries earlier */
+ /*
+ * If first entry in buf is zero then is first buffer
+ * in search response data which means it is likely . and ..
+ * will be in this buffer, although some servers do not return
+ * . and .. for the root of a drive and for those we need
+ * to start two entries earlier.
+ */
dump_cifs_file_struct(file, "In fce ");
- if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) &&
- is_dir_changed(file)) ||
- (index_to_find < first_entry_in_buffer)) {
+ if (((index_to_find < cfile->srch_inf.index_of_last_entry) &&
+ is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) {
/* close and restart search */
- cFYI(1, "search backing up - close and restart search");
+ cifs_dbg(FYI, "search backing up - close and restart search\n");
spin_lock(&cifs_file_list_lock);
- if (!cifsFile->srch_inf.endOfSearch &&
- !cifsFile->invalidHandle) {
- cifsFile->invalidHandle = true;
+ if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
+ cfile->invalidHandle = true;
spin_unlock(&cifs_file_list_lock);
- CIFSFindClose(xid, pTcon, cifsFile->netfid);
+ if (server->ops->close)
+ server->ops->close(xid, tcon, &cfile->fid);
} else
spin_unlock(&cifs_file_list_lock);
- if (cifsFile->srch_inf.ntwrk_buf_start) {
- cFYI(1, "freeing SMB ff cache buf on search rewind");
- if (cifsFile->srch_inf.smallBuf)
- cifs_small_buf_release(cifsFile->srch_inf.
+ if (cfile->srch_inf.ntwrk_buf_start) {
+ cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n");
+ if (cfile->srch_inf.smallBuf)
+ cifs_small_buf_release(cfile->srch_inf.
ntwrk_buf_start);
else
- cifs_buf_release(cifsFile->srch_inf.
+ cifs_buf_release(cfile->srch_inf.
ntwrk_buf_start);
- cifsFile->srch_inf.ntwrk_buf_start = NULL;
+ cfile->srch_inf.ntwrk_buf_start = NULL;
}
rc = initiate_cifs_search(xid, file);
if (rc) {
- cFYI(1, "error %d reinitiating a search on rewind",
+ cifs_dbg(FYI, "error %d reinitiating a search on rewind\n",
rc);
return rc;
}
- cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
+ /* FindFirst/Next set last_entry to NULL on malformed reply */
+ if (cfile->srch_inf.last_entry)
+ cifs_save_resume_key(cfile->srch_inf.last_entry, cfile);
}
- while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
- (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
- cFYI(1, "calling findnext2");
- rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
- &cifsFile->srch_inf);
- cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
+ search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
+ if (backup_cred(cifs_sb))
+ search_flags |= CIFS_SEARCH_BACKUP_SEARCH;
+
+ while ((index_to_find >= cfile->srch_inf.index_of_last_entry) &&
+ (rc == 0) && !cfile->srch_inf.endOfSearch) {
+ cifs_dbg(FYI, "calling findnext2\n");
+ rc = server->ops->query_dir_next(xid, tcon, &cfile->fid,
+ search_flags,
+ &cfile->srch_inf);
+ /* FindFirst/Next set last_entry to NULL on malformed reply */
+ if (cfile->srch_inf.last_entry)
+ cifs_save_resume_key(cfile->srch_inf.last_entry, cfile);
if (rc)
return -ENOENT;
}
- if (index_to_find < cifsFile->srch_inf.index_of_last_entry) {
+ if (index_to_find < cfile->srch_inf.index_of_last_entry) {
/* we found the buffer that contains the entry */
/* scan and find it */
int i;
- char *current_entry;
- char *end_of_smb = cifsFile->srch_inf.ntwrk_buf_start +
- smbCalcSize((struct smb_hdr *)
- cifsFile->srch_inf.ntwrk_buf_start);
-
- current_entry = cifsFile->srch_inf.srch_entries_start;
- first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
- - cifsFile->srch_inf.entries_in_buffer;
+ char *cur_ent;
+ char *end_of_smb = cfile->srch_inf.ntwrk_buf_start +
+ server->ops->calc_smb_size(
+ cfile->srch_inf.ntwrk_buf_start);
+
+ cur_ent = cfile->srch_inf.srch_entries_start;
+ first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
+ - cfile->srch_inf.entries_in_buffer;
pos_in_buf = index_to_find - first_entry_in_buffer;
- cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
+ cifs_dbg(FYI, "found entry - pos_in_buf %d\n", pos_in_buf);
- for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
+ for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) {
/* go entry by entry figuring out which is first */
- current_entry = nxt_dir_entry(current_entry, end_of_smb,
- cifsFile->srch_inf.info_level);
+ cur_ent = nxt_dir_entry(cur_ent, end_of_smb,
+ cfile->srch_inf.info_level);
}
- if ((current_entry == NULL) && (i < pos_in_buf)) {
+ if ((cur_ent == NULL) && (i < pos_in_buf)) {
/* BB fixme - check if we should flag this error */
- cERROR(1, "reached end of buf searching for pos in buf"
- " %d index to find %lld rc %d",
- pos_in_buf, index_to_find, rc);
+ cifs_dbg(VFS, "reached end of buf searching for pos in buf %d index to find %lld rc %d\n",
+ pos_in_buf, index_to_find, rc);
}
rc = 0;
- *ppCurrentEntry = current_entry;
+ *current_entry = cur_ent;
} else {
- cFYI(1, "index not in buffer - could not findnext into it");
+ cifs_dbg(FYI, "index not in buffer - could not findnext into it\n");
return 0;
}
- if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) {
- cFYI(1, "can not return entries pos_in_buf beyond last");
+ if (pos_in_buf >= cfile->srch_inf.entries_in_buffer) {
+ cifs_dbg(FYI, "can not return entries pos_in_buf beyond last\n");
*num_to_ret = 0;
} else
- *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf;
+ *num_to_ret = cfile->srch_inf.entries_in_buffer - pos_in_buf;
return rc;
}
-/* inode num, inode type and filename returned */
-static int cifs_get_name_from_search_buf(struct qstr *pqst,
- char *current_entry, __u16 level, unsigned int unicode,
- struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum)
+static int cifs_filldir(char *find_entry, struct file *file,
+ struct dir_context *ctx,
+ char *scratch_buf, unsigned int max_len)
{
+ struct cifsFileInfo *file_info = file->private_data;
+ struct super_block *sb = file->f_path.dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_dirent de = { NULL, };
+ struct cifs_fattr fattr;
+ struct qstr name;
int rc = 0;
- unsigned int len = 0;
- char *filename;
- struct nls_table *nlt = cifs_sb->local_nls;
-
- *pinum = 0;
-
- if (level == SMB_FIND_FILE_UNIX) {
- FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
+ ino_t ino;
- filename = &pFindData->FileName[0];
- if (unicode) {
- len = cifs_unicode_bytelen(filename);
- } else {
- /* BB should we make this strnlen of PATH_MAX? */
- len = strnlen(filename, PATH_MAX);
- }
-
- *pinum = le64_to_cpu(pFindData->basic.UniqueId);
- } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
- FILE_DIRECTORY_INFO *pFindData =
- (FILE_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
- FILE_FULL_DIRECTORY_INFO *pFindData =
- (FILE_FULL_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
- SEARCH_ID_FULL_DIR_INFO *pFindData =
- (SEARCH_ID_FULL_DIR_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- *pinum = le64_to_cpu(pFindData->UniqueId);
- } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
- FILE_BOTH_DIRECTORY_INFO *pFindData =
- (FILE_BOTH_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
- FIND_FILE_STANDARD_INFO *pFindData =
- (FIND_FILE_STANDARD_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- /* one byte length, no name conversion */
- len = (unsigned int)pFindData->FileNameLength;
- } else {
- cFYI(1, "Unknown findfirst level %d", level);
- return -EINVAL;
- }
+ rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level,
+ file_info->srch_inf.unicode);
+ if (rc)
+ return rc;
- if (len > max_len) {
- cERROR(1, "bad search response length %d past smb end", len);
+ if (de.namelen > max_len) {
+ cifs_dbg(VFS, "bad search response length %zd past smb end\n",
+ de.namelen);
return -EINVAL;
}
- if (unicode) {
- pqst->len = cifs_from_ucs2((char *) pqst->name,
- (__le16 *) filename,
- UNICODE_NAME_MAX,
- min(len, max_len), nlt,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- pqst->len -= nls_nullsize(nlt);
- } else {
- pqst->name = filename;
- pqst->len = len;
- }
- return rc;
-}
-
-static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
- void *direntry, char *scratch_buf, unsigned int max_len)
-{
- int rc = 0;
- struct qstr qstring;
- struct cifsFileInfo *pCifsF;
- u64 inum;
- ino_t ino;
- struct super_block *sb;
- struct cifs_sb_info *cifs_sb;
- struct dentry *tmp_dentry;
- struct cifs_fattr fattr;
-
- /* get filename and len into qstring */
- /* get dentry */
- /* decide whether to create and populate ionde */
- if ((direntry == NULL) || (file == NULL))
- return -EINVAL;
-
- pCifsF = file->private_data;
-
- if ((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL))
- return -ENOENT;
-
- rc = cifs_entry_is_dot(pfindEntry, pCifsF);
/* skip . and .. since we added them first */
- if (rc != 0)
+ if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode))
return 0;
- sb = file->f_path.dentry->d_sb;
- cifs_sb = CIFS_SB(sb);
-
- qstring.name = scratch_buf;
- rc = cifs_get_name_from_search_buf(&qstring, pfindEntry,
- pCifsF->srch_inf.info_level,
- pCifsF->srch_inf.unicode, cifs_sb,
- max_len, &inum /* returned */);
+ if (file_info->srch_inf.unicode) {
+ struct nls_table *nlt = cifs_sb->local_nls;
- if (rc)
- return rc;
+ name.name = scratch_buf;
+ name.len =
+ cifs_from_utf16((char *)name.name, (__le16 *)de.name,
+ UNICODE_NAME_MAX,
+ min_t(size_t, de.namelen,
+ (size_t)max_len), nlt,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ name.len -= nls_nullsize(nlt);
+ } else {
+ name.name = de.name;
+ name.len = de.namelen;
+ }
- if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX)
+ switch (file_info->srch_inf.info_level) {
+ case SMB_FIND_FILE_UNIX:
cifs_unix_basic_to_fattr(&fattr,
- &((FILE_UNIX_INFO *) pfindEntry)->basic,
- cifs_sb);
- else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD)
- cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *)
- pfindEntry, cifs_sb);
- else
- cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)
- pfindEntry, cifs_sb);
+ &((FILE_UNIX_INFO *)find_entry)->basic,
+ cifs_sb);
+ break;
+ case SMB_FIND_FILE_INFO_STANDARD:
+ cifs_std_info_to_fattr(&fattr,
+ (FIND_FILE_STANDARD_INFO *)find_entry,
+ cifs_sb);
+ break;
+ default:
+ cifs_dir_info_to_fattr(&fattr,
+ (FILE_DIRECTORY_INFO *)find_entry,
+ cifs_sb);
+ break;
+ }
- if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
- fattr.cf_uniqueid = inum;
+ if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
+ fattr.cf_uniqueid = de.ino;
} else {
fattr.cf_uniqueid = iunique(sb, ROOT_I);
cifs_autodisable_serverino(cifs_sb);
}
if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
- CIFSCouldBeMFSymlink(&fattr))
+ couldbe_mf_symlink(&fattr))
/*
* trying to get the type and mode can be slow,
* so just call those regular files for now, and mark
@@ -749,22 +757,19 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
*/
fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
- ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
- tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
-
- rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
- ino, fattr.cf_dtype);
+ cifs_prime_dcache(file->f_dentry, &name, &fattr);
- dput(tmp_dentry);
- return rc;
+ ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
+ return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
}
-int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
+int cifs_readdir(struct file *file, struct dir_context *ctx)
{
int rc = 0;
- int xid, i;
- struct cifsTconInfo *pTcon;
+ unsigned int xid;
+ int i;
+ struct cifs_tcon *tcon;
struct cifsFileInfo *cifsFile = NULL;
char *current_entry;
int num_to_fill = 0;
@@ -772,7 +777,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
char *end_of_smb;
unsigned int max_len;
- xid = GetXid();
+ xid = get_xid();
/*
* Ensure FindFirst doesn't fail before doing filldir() for '.' and
@@ -780,108 +785,93 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
*/
if (file->private_data == NULL) {
rc = initiate_cifs_search(xid, file);
- cFYI(1, "initiate cifs search rc %d", rc);
+ cifs_dbg(FYI, "initiate cifs search rc %d\n", rc);
if (rc)
goto rddir2_exit;
}
- switch ((int) file->f_pos) {
- case 0:
- if (filldir(direntry, ".", 1, file->f_pos,
- file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
- cERROR(1, "Filldir for current dir failed");
- rc = -ENOMEM;
- break;
+ if (!dir_emit_dots(file, ctx))
+ goto rddir2_exit;
+
+ /* 1) If search is active,
+ is in current search buffer?
+ if it before then restart search
+ if after then keep searching till find it */
+
+ if (file->private_data == NULL) {
+ rc = -EINVAL;
+ goto rddir2_exit;
+ }
+ cifsFile = file->private_data;
+ if (cifsFile->srch_inf.endOfSearch) {
+ if (cifsFile->srch_inf.emptyDir) {
+ cifs_dbg(FYI, "End of search, empty dir\n");
+ rc = 0;
+ goto rddir2_exit;
}
- file->f_pos++;
- case 1:
- if (filldir(direntry, "..", 2, file->f_pos,
- file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
- cERROR(1, "Filldir for parent dir failed");
- rc = -ENOMEM;
+ } /* else {
+ cifsFile->invalidHandle = true;
+ tcon->ses->server->close(xid, tcon, &cifsFile->fid);
+ } */
+
+ tcon = tlink_tcon(cifsFile->tlink);
+ rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
+ &num_to_fill);
+ if (rc) {
+ cifs_dbg(FYI, "fce error %d\n", rc);
+ goto rddir2_exit;
+ } else if (current_entry != NULL) {
+ cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
+ } else {
+ cifs_dbg(FYI, "could not find entry\n");
+ goto rddir2_exit;
+ }
+ cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
+ num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
+ max_len = tcon->ses->server->ops->calc_smb_size(
+ cifsFile->srch_inf.ntwrk_buf_start);
+ end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
+ tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+ if (tmp_buf == NULL) {
+ rc = -ENOMEM;
+ goto rddir2_exit;
+ }
+
+ for (i = 0; i < num_to_fill; i++) {
+ if (current_entry == NULL) {
+ /* evaluate whether this case is an error */
+ cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
+ num_to_fill, i);
break;
}
- file->f_pos++;
- default:
- /* 1) If search is active,
- is in current search buffer?
- if it before then restart search
- if after then keep searching till find it */
-
- if (file->private_data == NULL) {
- rc = -EINVAL;
- FreeXid(xid);
- return rc;
- }
- cifsFile = file->private_data;
- if (cifsFile->srch_inf.endOfSearch) {
- if (cifsFile->srch_inf.emptyDir) {
- cFYI(1, "End of search, empty dir");
- rc = 0;
- break;
- }
- } /* else {
- cifsFile->invalidHandle = true;
- CIFSFindClose(xid, pTcon, cifsFile->netfid);
- } */
-
- pTcon = tlink_tcon(cifsFile->tlink);
- rc = find_cifs_entry(xid, pTcon, file,
- &current_entry, &num_to_fill);
+ /*
+ * if buggy server returns . and .. late do we want to
+ * check for that here?
+ */
+ rc = cifs_filldir(current_entry, file, ctx,
+ tmp_buf, max_len);
if (rc) {
- cFYI(1, "fce error %d", rc);
- goto rddir2_exit;
- } else if (current_entry != NULL) {
- cFYI(1, "entry %lld found", file->f_pos);
- } else {
- cFYI(1, "could not find entry");
- goto rddir2_exit;
- }
- cFYI(1, "loop through %d times filling dir for net buf %p",
- num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
- max_len = smbCalcSize((struct smb_hdr *)
- cifsFile->srch_inf.ntwrk_buf_start);
- end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
-
- tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
- if (tmp_buf == NULL) {
- rc = -ENOMEM;
+ if (rc > 0)
+ rc = 0;
break;
}
- for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
- if (current_entry == NULL) {
- /* evaluate whether this case is an error */
- cERROR(1, "past SMB end, num to fill %d i %d",
- num_to_fill, i);
- break;
- }
- /* if buggy server returns . and .. late do
- we want to check for that here? */
- rc = cifs_filldir(current_entry, file,
- filldir, direntry, tmp_buf, max_len);
- if (rc == -EOVERFLOW) {
- rc = 0;
- break;
- }
-
- file->f_pos++;
- if (file->f_pos ==
- cifsFile->srch_inf.index_of_last_entry) {
- cFYI(1, "last entry in buf at pos %lld %s",
- file->f_pos, tmp_buf);
- cifs_save_resume_key(current_entry, cifsFile);
- break;
- } else
- current_entry =
- nxt_dir_entry(current_entry, end_of_smb,
- cifsFile->srch_inf.info_level);
- }
- kfree(tmp_buf);
- break;
- } /* end switch */
+ ctx->pos++;
+ if (ctx->pos ==
+ cifsFile->srch_inf.index_of_last_entry) {
+ cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
+ ctx->pos, tmp_buf);
+ cifs_save_resume_key(current_entry, cifsFile);
+ break;
+ } else
+ current_entry =
+ nxt_dir_entry(current_entry, end_of_smb,
+ cifsFile->srch_inf.info_level);
+ }
+ kfree(tmp_buf);
rddir2_exit:
- FreeXid(xid);
+ free_xid(xid);
return rc;
}
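
For context, the readdir.c changes above convert cifs_readdir() from the old filldir callback to the dir_context API: dots are emitted with dir_emit_dots(), each entry goes through dir_emit(), and ctx->pos tracks the resume position. A minimal sketch of that pattern, with example_nr_entries()/example_entry_name() as hypothetical helpers:

static int example_iterate(struct file *file, struct dir_context *ctx)
{
	const char *name;
	loff_t i;

	if (!dir_emit_dots(file, ctx))
		return 0;

	for (i = ctx->pos - 2; i < example_nr_entries(file); i++) {
		name = example_entry_name(file, i);
		if (!dir_emit(ctx, name, strlen(name), i + 1 /* ino */, DT_REG))
			return 0;	/* caller's buffer full; resume later at ctx->pos */
		ctx->pos++;
	}
	return 0;
}
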
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 16765703131..e87387dbf39 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,89 +32,7 @@
#include <linux/slab.h>
#include "cifs_spnego.h"
-/*
- * Checks if this is the first smb session to be reconnected after
- * the socket has been reestablished (so we know whether to use vc 0).
- * Called while holding the cifs_tcp_ses_lock, so do not block
- */
-static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
-{
- struct list_head *tmp;
- struct cifsSesInfo *tmp_ses;
-
- list_for_each(tmp, &ses->server->smb_ses_list) {
- tmp_ses = list_entry(tmp, struct cifsSesInfo,
- smb_ses_list);
- if (tmp_ses->need_reconnect == false)
- return false;
- }
- /* could not find a session that was already connected,
- this must be the first one we are reconnecting */
- return true;
-}
-
-/*
- * vc number 0 is treated specially by some servers, and should be the
- * first one we request. After that we can use vcnumbers up to maxvcs,
- * one for each smb session (some Windows versions set maxvcs incorrectly
- * so maxvc=1 can be ignored). If we have too many vcs, we can reuse
- * any vc but zero (some servers reset the connection on vcnum zero)
- *
- */
-static __le16 get_next_vcnum(struct cifsSesInfo *ses)
-{
- __u16 vcnum = 0;
- struct list_head *tmp;
- struct cifsSesInfo *tmp_ses;
- __u16 max_vcs = ses->server->max_vcs;
- __u16 i;
- int free_vc_found = 0;
-
- /* Quoting the MS-SMB specification: "Windows-based SMB servers set this
- field to one but do not enforce this limit, which allows an SMB client
- to establish more virtual circuits than allowed by this value ... but
- other server implementations can enforce this limit." */
- if (max_vcs < 2)
- max_vcs = 0xFFFF;
-
- spin_lock(&cifs_tcp_ses_lock);
- if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
- goto get_vc_num_exit; /* vcnum will be zero */
- for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
- if (i == 0) /* this is the only connection, use vc 0 */
- break;
-
- free_vc_found = 1;
-
- list_for_each(tmp, &ses->server->smb_ses_list) {
- tmp_ses = list_entry(tmp, struct cifsSesInfo,
- smb_ses_list);
- if (tmp_ses->vcnum == i) {
- free_vc_found = 0;
- break; /* found duplicate, try next vcnum */
- }
- }
- if (free_vc_found)
- break; /* we found a vcnumber that will work - use it */
- }
-
- if (i == 0)
- vcnum = 0; /* for most common case, ie if one smb session, use
- vc zero. Also for case when no free vcnum, zero
- is safest to send (some clients only send zero) */
- else if (free_vc_found == 0)
- vcnum = 1; /* we can not reuse vc=0 safely, since some servers
- reset all uids on that, but 1 is ok. */
- else
- vcnum = i;
- ses->vcnum = vcnum;
-get_vc_num_exit:
- spin_unlock(&cifs_tcp_ses_lock);
-
- return cpu_to_le16(vcnum);
-}
-
-static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
+static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
{
__u32 capabilities = 0;
@@ -124,9 +42,11 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
/* that we use in next few lines */
/* Note that header is initialized to zero in header_assemble */
pSMB->req.AndXCommand = 0xFF;
- pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
+ pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32,
+ CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
+ USHRT_MAX));
pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
- pSMB->req.VcNumber = get_next_vcnum(ses);
+ pSMB->req.VcNumber = __constant_cpu_to_le16(1);
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@ -136,8 +56,7 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
- if (ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (ses->server->sign)
pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
if (ses->capabilities & CAP_UNICODE) {
@@ -165,23 +84,23 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
int bytes_ret = 0;
/* Copy OS version */
- bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
- nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32,
+ nls_cp);
bcc_ptr += 2 * bytes_ret;
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release,
- 32, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release,
+ 32, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* trailing null */
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
- 32, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+ 32, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* trailing null */
*pbcc_area = bcc_ptr;
}
-static void unicode_domain_string(char **pbcc_area, struct cifsSesInfo *ses,
+static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
char *bcc_ptr = *pbcc_area;
@@ -195,8 +114,8 @@ static void unicode_domain_string(char **pbcc_area, struct cifsSesInfo *ses,
*(bcc_ptr+1) = 0;
bytes_ret = 0;
} else
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName,
- 256, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
+ CIFS_MAX_DOMAINNAME_LEN, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null terminator */
@@ -204,7 +123,7 @@ static void unicode_domain_string(char **pbcc_area, struct cifsSesInfo *ses,
}
-static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
+static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
char *bcc_ptr = *pbcc_area;
@@ -219,13 +138,13 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
bcc_ptr++;
} */
/* copy user */
- if (ses->userName == NULL) {
+ if (ses->user_name == NULL) {
/* null user mount */
*bcc_ptr = 0;
*(bcc_ptr+1) = 0;
} else {
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName,
- MAX_USERNAME_SIZE, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
+ CIFS_MAX_USERNAME_LEN, nls_cp);
}
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null termination */
@@ -236,7 +155,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
*pbcc_area = bcc_ptr;
}
-static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
+static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
char *bcc_ptr = *pbcc_area;
@@ -244,20 +163,18 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
/* copy user */
/* BB what about null user mounts - check that we do this BB */
/* copy user */
- if (ses->userName == NULL) {
- /* BB what about null user mounts - check that we do this BB */
- } else {
- strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE);
+ if (ses->user_name != NULL) {
+ strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN);
+ bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN);
}
- bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
+ /* else null user mount */
*bcc_ptr = 0;
bcc_ptr++; /* account for null termination */
/* copy domain */
-
if (ses->domainName != NULL) {
- strncpy(bcc_ptr, ses->domainName, 256);
- bcc_ptr += strnlen(ses->domainName, 256);
+ strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+ bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
} /* else we will send a null domain name
so the server will default to its own domain */
*bcc_ptr = 0;
@@ -277,30 +194,17 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
}
static void
-decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
+decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
int len;
char *data = *pbcc_area;
- cFYI(1, "bleft %d", bleft);
-
- /*
- * Windows servers do not always double null terminate their final
- * Unicode string. Check to see if there are an uneven number of bytes
- * left. If so, then add an extra NULL pad byte to the end of the
- * response.
- *
- * See section 2.7.2 in "Implementing CIFS" for details
- */
- if (bleft % 2) {
- data[bleft] = 0;
- ++bleft;
- }
+ cifs_dbg(FYI, "bleft %d\n", bleft);
kfree(ses->serverOS);
- ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
- cFYI(1, "serverOS=%s", ses->serverOS);
+ ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
+ cifs_dbg(FYI, "serverOS=%s\n", ses->serverOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
bleft -= len;
@@ -308,8 +212,8 @@ decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
return;
kfree(ses->serverNOS);
- ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
- cFYI(1, "serverNOS=%s", ses->serverNOS);
+ ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
+ cifs_dbg(FYI, "serverNOS=%s\n", ses->serverNOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
bleft -= len;
@@ -317,42 +221,39 @@ decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
return;
kfree(ses->serverDomain);
- ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
- cFYI(1, "serverDomain=%s", ses->serverDomain);
+ ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
+ cifs_dbg(FYI, "serverDomain=%s\n", ses->serverDomain);
return;
}
-static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
- struct cifsSesInfo *ses,
- const struct nls_table *nls_cp)
+static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
+ struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
{
- int rc = 0;
int len;
char *bcc_ptr = *pbcc_area;
- cFYI(1, "decode sessetup ascii. bleft %d", bleft);
+ cifs_dbg(FYI, "decode sessetup ascii. bleft %d\n", bleft);
len = strnlen(bcc_ptr, bleft);
if (len >= bleft)
- return rc;
+ return;
kfree(ses->serverOS);
ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
if (ses->serverOS)
strncpy(ses->serverOS, bcc_ptr, len);
- if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
- cFYI(1, "OS/2 server");
- ses->flags |= CIFS_SES_OS2;
- }
+ if (strncmp(ses->serverOS, "OS/2", 4) == 0)
+ cifs_dbg(FYI, "OS/2 server\n");
bcc_ptr += len + 1;
bleft -= len + 1;
len = strnlen(bcc_ptr, bleft);
if (len >= bleft)
- return rc;
+ return;
kfree(ses->serverNOS);
@@ -365,20 +266,18 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
len = strnlen(bcc_ptr, bleft);
if (len > bleft)
- return rc;
+ return;
/* No domain field in LANMAN case. Domain is
returned by old servers in the SMB negprot response */
/* BB For newer servers which do not support Unicode,
but thus do return domain here we could add parsing
for it later, but it is not very important */
- cFYI(1, "ascii: bytes left %d", bleft);
-
- return rc;
+ cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
}
-static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
- struct cifsSesInfo *ses)
+int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
+ struct cifs_ses *ses)
{
unsigned int tioffset; /* challenge message target info area */
unsigned int tilen; /* challenge message target info area length */
@@ -386,16 +285,18 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
- cERROR(1, "challenge blob len %d too small", blob_len);
+ cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len);
return -EINVAL;
}
if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
- cERROR(1, "blob signature incorrect %s", pblob->Signature);
+ cifs_dbg(VFS, "blob signature incorrect %s\n",
+ pblob->Signature);
return -EINVAL;
}
if (pblob->MessageType != NtLmChallenge) {
- cERROR(1, "Incorrect message type %d", pblob->MessageType);
+ cifs_dbg(VFS, "Incorrect message type %d\n",
+ pblob->MessageType);
return -EINVAL;
}
@@ -405,15 +306,20 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
/* BB spec says that if AvId field of MsvAvTimestamp is populated then
we must set the MIC field of the AUTHENTICATE_MESSAGE */
ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
- tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
- tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
+ tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
+ tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
+ if (tioffset > blob_len || tioffset + tilen > blob_len) {
+ cifs_dbg(VFS, "tioffset + tilen too high %u + %u",
+ tioffset, tilen);
+ return -EINVAL;
+ }
if (tilen) {
- ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
+ ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen,
+ GFP_KERNEL);
if (!ses->auth_key.response) {
- cERROR(1, "Challenge target info allocation failure");
+ cifs_dbg(VFS, "Challenge target info alloc failure");
return -ENOMEM;
}
- memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
ses->auth_key.len = tilen;
}
@@ -424,8 +330,8 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
/* We do not malloc the blob, it is passed in pbuffer, because
it is fixed size, and small, making this approach cleaner */
-static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
- struct cifsSesInfo *ses)
+void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
+ struct cifs_ses *ses)
{
NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
__u32 flags;
@@ -438,12 +344,11 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
- if (ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ if (ses->server->sign) {
flags |= NTLMSSP_NEGOTIATE_SIGN;
- if (!ses->server->session_estab)
- flags |= NTLMSSP_NEGOTIATE_KEY_XCH |
- NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+ if (!ses->server->session_estab ||
+ ses->ntlmssp->sesskey_per_smbsess)
+ flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
}
sec_blob->NegotiateFlags = cpu_to_le32(flags);
@@ -461,9 +366,9 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
/* We do not malloc the blob, it is passed in pbuffer, because its
maximum possible size is fixed and small, making this approach cleaner.
This function returns the length of the data in the blob */
-static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
+int build_ntlmssp_auth_blob(unsigned char *pbuffer,
u16 *buflen,
- struct cifsSesInfo *ses,
+ struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
int rc;
@@ -478,11 +383,12 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
- if (ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ if (ses->server->sign) {
flags |= NTLMSSP_NEGOTIATE_SIGN;
- if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
- flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+ if (!ses->server->session_estab ||
+ ses->ntlmssp->sesskey_per_smbsess)
+ flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
+ }
tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
sec_blob->NegotiateFlags = cpu_to_le32(flags);
@@ -495,7 +401,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
rc = setup_ntlmv2_rsp(ses, nls_cp);
if (rc) {
- cERROR(1, "Error %d during NTLMSSP authentication", rc);
+ cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc);
goto setup_ntlmv2_ret;
}
memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
@@ -514,8 +420,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
tmp += 2;
} else {
int len;
- len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
- MAX_USERNAME_SIZE, nls_cp);
+ len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
+ CIFS_MAX_USERNAME_LEN, nls_cp);
len *= 2; /* unicode is 2 bytes each */
sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->DomainName.Length = cpu_to_le16(len);
@@ -523,15 +429,15 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
tmp += len;
}
- if (ses->userName == NULL) {
+ if (ses->user_name == NULL) {
sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->UserName.Length = 0;
sec_blob->UserName.MaximumLength = 0;
tmp += 2;
} else {
int len;
- len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
- MAX_USERNAME_SIZE, nls_cp);
+ len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
+ CIFS_MAX_USERNAME_LEN, nls_cp);
len *= 2; /* unicode is 2 bytes each */
sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->UserName.Length = cpu_to_le16(len);
@@ -564,8 +470,58 @@ setup_ntlmv2_ret:
return rc;
}
+enum securityEnum
+select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
+{
+ switch (server->negflavor) {
+ case CIFS_NEGFLAVOR_EXTENDED:
+ switch (requested) {
+ case Kerberos:
+ case RawNTLMSSP:
+ return requested;
+ case Unspecified:
+ if (server->sec_ntlmssp &&
+ (global_secflags & CIFSSEC_MAY_NTLMSSP))
+ return RawNTLMSSP;
+ if ((server->sec_kerberos || server->sec_mskerberos) &&
+ (global_secflags & CIFSSEC_MAY_KRB5))
+ return Kerberos;
+ /* Fallthrough */
+ default:
+ return Unspecified;
+ }
+ case CIFS_NEGFLAVOR_UNENCAP:
+ switch (requested) {
+ case NTLM:
+ case NTLMv2:
+ return requested;
+ case Unspecified:
+ if (global_secflags & CIFSSEC_MAY_NTLMV2)
+ return NTLMv2;
+ if (global_secflags & CIFSSEC_MAY_NTLM)
+ return NTLM;
+ default:
+ /* Fallthrough to attempt LANMAN authentication next */
+ break;
+ }
+ case CIFS_NEGFLAVOR_LANMAN:
+ switch (requested) {
+ case LANMAN:
+ return requested;
+ case Unspecified:
+ if (global_secflags & CIFSSEC_MAY_LANMAN)
+ return LANMAN;
+ /* Fallthrough */
+ default:
+ return Unspecified;
+ }
+ default:
+ return Unspecified;
+ }
+}
+
int
-CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
+CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
int rc = 0;
@@ -585,11 +541,19 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
u16 blob_len;
char *ntlmsspblob = NULL;
- if (ses == NULL)
+ if (ses == NULL) {
+ WARN(1, "%s: ses == NULL!", __func__);
return -EINVAL;
+ }
+
+ type = select_sectype(ses->server, ses->sectype);
+ cifs_dbg(FYI, "sess setup type %d\n", type);
+ if (type == Unspecified) {
+ cifs_dbg(VFS,
+ "Unable to select appropriate authentication method!");
+ return -EINVAL;
+ }
- type = ses->server->secType;
- cFYI(1, "sess setup type %d", type);
if (type == RawNTLMSSP) {
/* if memory allocation is successful, caller of this function
* frees it.
@@ -597,6 +561,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
if (!ses->ntlmssp)
return -ENOMEM;
+ ses->ntlmssp->sesskey_per_smbsess = false;
+
}
ssetup_ntlmssp_authenticate:
@@ -635,7 +601,7 @@ ssetup_ntlmssp_authenticate:
and rest of bcc area. This allows us to avoid
a large buffer 17K allocation */
iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = smb_buf->smb_buf_length + 4;
+ iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
/* setting this here allows the code at the end of the function
to free the request buffer if there's an error */
@@ -649,8 +615,6 @@ ssetup_ntlmssp_authenticate:
}
bcc_ptr = str_area;
- ses->flags &= ~CIFS_SES_LANMAN;
-
iov[1].iov_base = NULL;
iov[1].iov_len = 0;
@@ -670,11 +634,10 @@ ssetup_ntlmssp_authenticate:
* to use challenge/response method (i.e. Password bit is 1).
*/
- calc_lanman_hash(ses->password, ses->server->cryptkey,
- ses->server->secMode & SECMODE_PW_ENCRYPT ?
+ rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
+ ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
true : false, lnm_session_key);
- ses->flags |= CIFS_SES_LANMAN;
memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
bcc_ptr += CIFS_AUTH_RESP_SIZE;
@@ -683,7 +646,7 @@ ssetup_ntlmssp_authenticate:
changed to do higher than lanman dialect and
we reconnected would we ever calc signing_key? */
- cFYI(1, "Negotiating LANMAN setting up strings");
+ cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
/* Unicode not allowed for LANMAN dialects */
ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
#endif
@@ -695,9 +658,10 @@ ssetup_ntlmssp_authenticate:
cpu_to_le16(CIFS_AUTH_RESP_SIZE);
/* calculate ntlm response and session key */
- rc = setup_ntlm_response(ses);
+ rc = setup_ntlm_response(ses, nls_cp);
if (rc) {
- cERROR(1, "Error %d during NTLM authentication", rc);
+ cifs_dbg(VFS, "Error %d during NTLM authentication\n",
+ rc);
goto ssetup_exit;
}
@@ -727,7 +691,8 @@ ssetup_ntlmssp_authenticate:
/* calculate nlmv2 response and session key */
rc = setup_ntlmv2_rsp(ses, nls_cp);
if (rc) {
- cERROR(1, "Error %d during NTLMv2 authentication", rc);
+ cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n",
+ rc);
goto ssetup_exit;
}
memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
@@ -763,21 +728,22 @@ ssetup_ntlmssp_authenticate:
/* check version field to make sure that cifs.upcall is
sending us a response in an expected form */
if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
- cERROR(1, "incorrect version of cifs.upcall (expected"
- " %d but got %d)",
+ cifs_dbg(VFS, "incorrect version of cifs.upcall "
+				 "(expected %d but got %d)",
CIFS_SPNEGO_UPCALL_VERSION, msg->version);
rc = -EKEYREJECTED;
goto ssetup_exit;
}
- ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
+ ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+ GFP_KERNEL);
if (!ses->auth_key.response) {
- cERROR(1, "Kerberos can't allocate (%u bytes) memory",
- msg->sesskey_len);
+ cifs_dbg(VFS,
+ "Kerberos can't allocate (%u bytes) memory",
+ msg->sesskey_len);
rc = -ENOMEM;
goto ssetup_exit;
}
- memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
ses->auth_key.len = msg->sesskey_len;
pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
@@ -799,18 +765,18 @@ ssetup_ntlmssp_authenticate:
/* BB: is this right? */
ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
#else /* ! CONFIG_CIFS_UPCALL */
- cERROR(1, "Kerberos negotiated but upcall support disabled!");
+ cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
rc = -ENOSYS;
goto ssetup_exit;
#endif /* CONFIG_CIFS_UPCALL */
} else if (type == RawNTLMSSP) {
if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
- cERROR(1, "NTLMSSP requires Unicode support");
+ cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
rc = -ENOSYS;
goto ssetup_exit;
}
- cFYI(1, "ntlmssp session setup phase %d", phase);
+ cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase);
pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
capabilities |= CAP_EXTENDED_SECURITY;
pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -833,7 +799,6 @@ ssetup_ntlmssp_authenticate:
5*sizeof(struct _AUTHENTICATE_MESSAGE),
GFP_KERNEL);
if (!ntlmsspblob) {
- cERROR(1, "Can't allocate NTLMSSP blob");
rc = -ENOMEM;
goto ssetup_exit;
}
@@ -853,7 +818,7 @@ ssetup_ntlmssp_authenticate:
smb_buf->Uid = ses->Suid;
break;
default:
- cERROR(1, "invalid phase %d", phase);
+ cifs_dbg(VFS, "invalid phase %d\n", phase);
rc = -ENOSYS;
goto ssetup_exit;
}
@@ -864,7 +829,7 @@ ssetup_ntlmssp_authenticate:
}
unicode_oslm_strings(&bcc_ptr, nls_cp);
} else {
- cERROR(1, "secType %d not supported!", type);
+ cifs_dbg(VFS, "secType %d not supported!\n", type);
rc = -ENOSYS;
goto ssetup_exit;
}
@@ -873,9 +838,10 @@ ssetup_ntlmssp_authenticate:
iov[2].iov_len = (long) bcc_ptr - (long) str_area;
count = iov[1].iov_len + iov[2].iov_len;
- smb_buf->smb_buf_length += count;
+ smb_buf->smb_buf_length =
+ cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
- put_bcc_le(count, smb_buf);
+ put_bcc(count, smb_buf);
rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
CIFS_LOG_ERROR);
@@ -884,10 +850,11 @@ ssetup_ntlmssp_authenticate:
pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
smb_buf = (struct smb_hdr *)iov[0].iov_base;
- if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
+ if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) &&
+ (smb_buf->Status.CifsError ==
cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
if (phase != NtLmNegotiate) {
- cERROR(1, "Unexpected more processing error");
+ cifs_dbg(VFS, "Unexpected more processing error\n");
goto ssetup_exit;
}
/* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -899,14 +866,14 @@ ssetup_ntlmssp_authenticate:
if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
rc = -EIO;
- cERROR(1, "bad word count %d", smb_buf->WordCount);
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
goto ssetup_exit;
}
action = le16_to_cpu(pSMB->resp.Action);
if (action & GUEST_LOGIN)
- cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
- cFYI(1, "UID = %d ", ses->Suid);
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
/* response can have either 3 or 4 word count - Samba sends 3 */
/* and lanman response is 3 */
bytes_remaining = get_bcc(smb_buf);
@@ -915,7 +882,8 @@ ssetup_ntlmssp_authenticate:
if (smb_buf->WordCount == 4) {
blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
if (blob_len > bytes_remaining) {
- cERROR(1, "bad security blob length %d", blob_len);
+ cifs_dbg(VFS, "bad security blob length %d\n",
+ blob_len);
rc = -EINVAL;
goto ssetup_exit;
}
@@ -930,7 +898,9 @@ ssetup_ntlmssp_authenticate:
}
/* BB check if Unicode and decode strings */
- if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+ if (bytes_remaining == 0) {
+ /* no string area to decode, do nothing */
+ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
/* unicode string area must be word-aligned */
if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
++bcc_ptr;
@@ -938,20 +908,19 @@ ssetup_ntlmssp_authenticate:
}
decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
} else {
- rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
- ses, nls_cp);
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
}
ssetup_exit:
if (spnego_key) {
- key_revoke(spnego_key);
+ key_invalidate(spnego_key);
key_put(spnego_key);
}
kfree(str_area);
kfree(ntlmsspblob);
ntlmsspblob = NULL;
if (resp_buf_type == CIFS_SMALL_BUFFER) {
- cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
+ cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base);
cifs_small_buf_release(iov[0].iov_base);
} else if (resp_buf_type == CIFS_LARGE_BUFFER)
cifs_buf_release(iov[0].iov_base);
@@ -960,5 +929,37 @@ ssetup_exit:
if ((phase == NtLmChallenge) && (rc == 0))
goto ssetup_ntlmssp_authenticate;
+ if (!rc) {
+ mutex_lock(&ses->server->srv_mutex);
+ if (!ses->server->session_estab) {
+ if (ses->server->sign) {
+ ses->server->session_key.response =
+ kmemdup(ses->auth_key.response,
+ ses->auth_key.len, GFP_KERNEL);
+ if (!ses->server->session_key.response) {
+ rc = -ENOMEM;
+ mutex_unlock(&ses->server->srv_mutex);
+ goto keycp_exit;
+ }
+ ses->server->session_key.len =
+ ses->auth_key.len;
+ }
+ ses->server->sequence_number = 0x2;
+ ses->server->session_estab = true;
+ }
+ mutex_unlock(&ses->server->srv_mutex);
+
+ cifs_dbg(FYI, "CIFS session established successfully\n");
+ spin_lock(&GlobalMid_Lock);
+ ses->status = CifsGood;
+ ses->need_reconnect = false;
+ spin_unlock(&GlobalMid_Lock);
+ }
+
+keycp_exit:
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ kfree(ses->ntlmssp);
+
return rc;
}
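
The select_sectype() helper introduced above replaces the old per-server secType field: the negotiated dialect flavor (extended security, unencapsulated, or LANMAN) constrains which authentication types are acceptable, and a request of Unspecified falls back to the strongest mechanism that global_secflags still permits. The stand-alone sketch below models that fallback with placeholder enums and a hypothetical secflags mask; it mirrors the decision order only and does not reproduce the kernel's own types.

#include <stdio.h>

/* Placeholder mirrors of the negotiation flavors, auth types and MAY_* flags. */
enum flavor { FLAVOR_EXTENDED, FLAVOR_UNENCAP, FLAVOR_LANMAN };
enum auth   { AUTH_UNSPECIFIED, AUTH_KRB5, AUTH_NTLMSSP, AUTH_NTLMV2,
	      AUTH_NTLM, AUTH_LANMAN };

#define MAY_KRB5	0x01
#define MAY_NTLMSSP	0x02
#define MAY_NTLMV2	0x04
#define MAY_NTLM	0x08
#define MAY_LANMAN	0x10

/* Honor an explicit request when the flavor allows it; otherwise pick the
 * strongest type the secflags mask still permits, weakest last. */
static enum auth choose_auth(enum flavor f, enum auth req, unsigned int secflags)
{
	switch (f) {
	case FLAVOR_EXTENDED:
		if (req == AUTH_KRB5 || req == AUTH_NTLMSSP)
			return req;
		if (req == AUTH_UNSPECIFIED && (secflags & MAY_NTLMSSP))
			return AUTH_NTLMSSP;
		if (req == AUTH_UNSPECIFIED && (secflags & MAY_KRB5))
			return AUTH_KRB5;
		return AUTH_UNSPECIFIED;
	case FLAVOR_UNENCAP:
		if (req == AUTH_NTLM || req == AUTH_NTLMV2)
			return req;
		if (req == AUTH_UNSPECIFIED && (secflags & MAY_NTLMV2))
			return AUTH_NTLMV2;
		if (req == AUTH_UNSPECIFIED && (secflags & MAY_NTLM))
			return AUTH_NTLM;
		/* fall through: LANMAN is the last unencapsulated resort */
	case FLAVOR_LANMAN:
		if (req == AUTH_LANMAN)
			return AUTH_LANMAN;
		if (req == AUTH_UNSPECIFIED && (secflags & MAY_LANMAN))
			return AUTH_LANMAN;
		return AUTH_UNSPECIFIED;
	}
	return AUTH_UNSPECIFIED;
}

int main(void)
{
	/* Unspecified over an extended-security negotiation picks NTLMSSP here. */
	printf("%d\n", choose_auth(FLAVOR_EXTENDED, AUTH_UNSPECIFIED,
				   MAY_NTLMSSP | MAY_NTLMV2));
	return 0;
}
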
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
new file mode 100644
index 00000000000..d1fdfa84870
--- /dev/null
+++ b/fs/cifs/smb1ops.c
@@ -0,0 +1,1107 @@
+/*
+ * SMB1 (CIFS) version specific operations
+ *
+ * Copyright (c) 2012, Jeff Layton <jlayton@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2 as published
+ * by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/pagemap.h>
+#include <linux/vfs.h>
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifspdu.h"
+
+/*
+ * An NT cancel request header looks just like the original request except:
+ *
+ * The Command is SMB_COM_NT_CANCEL
+ * The WordCount is zeroed out
+ * The ByteCount is zeroed out
+ *
+ * This function mangles an existing request buffer into a
+ * SMB_COM_NT_CANCEL request and then sends it.
+ */
+static int
+send_nt_cancel(struct TCP_Server_Info *server, void *buf,
+ struct mid_q_entry *mid)
+{
+ int rc = 0;
+ struct smb_hdr *in_buf = (struct smb_hdr *)buf;
+
+ /* -4 for RFC1001 length and +2 for BCC field */
+ in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2);
+ in_buf->Command = SMB_COM_NT_CANCEL;
+ in_buf->WordCount = 0;
+ put_bcc(0, in_buf);
+
+ mutex_lock(&server->srv_mutex);
+ rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+ if (rc) {
+ mutex_unlock(&server->srv_mutex);
+ return rc;
+ }
+
+ /*
+ * The response to this call was already factored into the sequence
+ * number when the call went out, so we must adjust it back downward
+ * after signing here.
+ */
+ --server->sequence_number;
+ rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+ if (rc < 0)
+ server->sequence_number--;
+
+ mutex_unlock(&server->srv_mutex);
+
+ cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n",
+ get_mid(in_buf), rc);
+
+ return rc;
+}
+
+static bool
+cifs_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2)
+{
+ return ob1->fid.netfid == ob2->fid.netfid;
+}
+
+static unsigned int
+cifs_read_data_offset(char *buf)
+{
+ READ_RSP *rsp = (READ_RSP *)buf;
+ return le16_to_cpu(rsp->DataOffset);
+}
+
+static unsigned int
+cifs_read_data_length(char *buf)
+{
+ READ_RSP *rsp = (READ_RSP *)buf;
+ return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
+ le16_to_cpu(rsp->DataLength);
+}
+
+static struct mid_q_entry *
+cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
+{
+ struct smb_hdr *buf = (struct smb_hdr *)buffer;
+ struct mid_q_entry *mid;
+
+ spin_lock(&GlobalMid_Lock);
+ list_for_each_entry(mid, &server->pending_mid_q, qhead) {
+ if (compare_mid(mid->mid, buf) &&
+ mid->mid_state == MID_REQUEST_SUBMITTED &&
+ le16_to_cpu(mid->command) == buf->Command) {
+ spin_unlock(&GlobalMid_Lock);
+ return mid;
+ }
+ }
+ spin_unlock(&GlobalMid_Lock);
+ return NULL;
+}
+
+static void
+cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add,
+ const int optype)
+{
+ spin_lock(&server->req_lock);
+ server->credits += add;
+ server->in_flight--;
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+}
+
+static void
+cifs_set_credits(struct TCP_Server_Info *server, const int val)
+{
+ spin_lock(&server->req_lock);
+ server->credits = val;
+ server->oplocks = val > 1 ? enable_oplocks : false;
+ spin_unlock(&server->req_lock);
+}
+
+static int *
+cifs_get_credits_field(struct TCP_Server_Info *server, const int optype)
+{
+ return &server->credits;
+}
+
+static unsigned int
+cifs_get_credits(struct mid_q_entry *mid)
+{
+ return 1;
+}
+
+/*
+ * Find a free multiplex id (SMB mid). Otherwise there could be
+ * mid collisions which might cause problems, demultiplexing the
+ * wrong response to this request. Multiplex ids could collide if
+ * one of a series requests takes much longer than the others, or
+ * if a very large number of long lived requests (byte range
+ * locks or FindNotify requests) are pending. No more than
+ * 64K-1 requests can be outstanding at one time. If no
+ * mids are available, return zero. A future optimization
+ * could make the combination of mids and uid the key we use
+ * to demultiplex on (rather than mid alone).
+ * In addition to the above check, the cifs demultiplex
+ * code already used the command code as a secondary
+ * check of the frame and if signing is negotiated the
+ * response would be discarded if the mid were the same
+ * but the signature was wrong. Since the mid is not put in the
+ * pending queue until later (when it is about to be dispatched)
+ * we do have to limit the number of outstanding requests
+ * to somewhat less than 64K-1 although it is hard to imagine
+ * so many threads being in the vfs at one time.
+ */
+static __u64
+cifs_get_next_mid(struct TCP_Server_Info *server)
+{
+ __u64 mid = 0;
+ __u16 last_mid, cur_mid;
+ bool collision;
+
+ spin_lock(&GlobalMid_Lock);
+
+ /* mid is 16 bit only for CIFS/SMB */
+ cur_mid = (__u16)((server->CurrentMid) & 0xffff);
+ /* we do not want to loop forever */
+ last_mid = cur_mid;
+ cur_mid++;
+
+ /*
+ * This nested loop looks more expensive than it is.
+ * In practice the list of pending requests is short,
+ * fewer than 50, and the mids are likely to be unique
+ * on the first pass through the loop unless some request
+ * takes longer than the 64 thousand requests before it
+ * (and it would also have to have been a request that
+ * did not time out).
+ */
+ while (cur_mid != last_mid) {
+ struct mid_q_entry *mid_entry;
+ unsigned int num_mids;
+
+ collision = false;
+ if (cur_mid == 0)
+ cur_mid++;
+
+ num_mids = 0;
+ list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
+ ++num_mids;
+ if (mid_entry->mid == cur_mid &&
+ mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
+ /* This mid is in use, try a different one */
+ collision = true;
+ break;
+ }
+ }
+
+ /*
+ * if we have more than 32k mids in the list, then something
+ * is very wrong. Possibly a local user is trying to DoS the
+ * box by issuing long-running calls and SIGKILL'ing them. If
+ * we get to 2^16 mids then we're in big trouble as this
+ * function could loop forever.
+ *
+ * Go ahead and assign out the mid in this situation, but force
+ * an eventual reconnect to clean out the pending_mid_q.
+ */
+ if (num_mids > 32768)
+ server->tcpStatus = CifsNeedReconnect;
+
+ if (!collision) {
+ mid = (__u64)cur_mid;
+ server->CurrentMid = mid;
+ break;
+ }
+ cur_mid++;
+ }
+ spin_unlock(&GlobalMid_Lock);
+ return mid;
+}
+
+/*
+ return codes:
+ 0 not a transact2, or all data present
+ >0 transact2 with that much data missing
+ -EINVAL invalid transact2
+ */
+static int
+check2ndT2(char *buf)
+{
+ struct smb_hdr *pSMB = (struct smb_hdr *)buf;
+ struct smb_t2_rsp *pSMBt;
+ int remaining;
+ __u16 total_data_size, data_in_this_rsp;
+
+ if (pSMB->Command != SMB_COM_TRANSACTION2)
+ return 0;
+
+ /* check for plausible wct, bcc and t2 data and parm sizes */
+ /* check for parm and data offset going beyond end of smb */
+ if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
+ cifs_dbg(FYI, "invalid transact2 word count\n");
+ return -EINVAL;
+ }
+
+ pSMBt = (struct smb_t2_rsp *)pSMB;
+
+ total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+ data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+
+ if (total_data_size == data_in_this_rsp)
+ return 0;
+ else if (total_data_size < data_in_this_rsp) {
+ cifs_dbg(FYI, "total data %d smaller than data in frame %d\n",
+ total_data_size, data_in_this_rsp);
+ return -EINVAL;
+ }
+
+ remaining = total_data_size - data_in_this_rsp;
+
+ cifs_dbg(FYI, "missing %d bytes from transact2, check next response\n",
+ remaining);
+ if (total_data_size > CIFSMaxBufSize) {
+ cifs_dbg(VFS, "TotalDataSize %d is over maximum buffer %d\n",
+ total_data_size, CIFSMaxBufSize);
+ return -EINVAL;
+ }
+ return remaining;
+}
+
+static int
+coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
+{
+ struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)second_buf;
+ struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)target_hdr;
+ char *data_area_of_tgt;
+ char *data_area_of_src;
+ int remaining;
+ unsigned int byte_count, total_in_tgt;
+ __u16 tgt_total_cnt, src_total_cnt, total_in_src;
+
+ src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount);
+ tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+
+ if (tgt_total_cnt != src_total_cnt)
+ cifs_dbg(FYI, "total data count of primary and secondary t2 differ source=%hu target=%hu\n",
+ src_total_cnt, tgt_total_cnt);
+
+ total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+
+ remaining = tgt_total_cnt - total_in_tgt;
+
+ if (remaining < 0) {
+ cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+ tgt_total_cnt, total_in_tgt);
+ return -EPROTO;
+ }
+
+ if (remaining == 0) {
+ /* nothing to do, ignore */
+ cifs_dbg(FYI, "no more data remains\n");
+ return 0;
+ }
+
+ total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
+ if (remaining < total_in_src)
+ cifs_dbg(FYI, "transact2 2nd response contains too much data\n");
+
+ /* find end of first SMB data area */
+ data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
+ get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
+
+ /* validate target area */
+ data_area_of_src = (char *)&pSMBs->hdr.Protocol +
+ get_unaligned_le16(&pSMBs->t2_rsp.DataOffset);
+
+ data_area_of_tgt += total_in_tgt;
+
+ total_in_tgt += total_in_src;
+ /* is the result too big for the field? */
+ if (total_in_tgt > USHRT_MAX) {
+ cifs_dbg(FYI, "coalesced DataCount too large (%u)\n",
+ total_in_tgt);
+ return -EPROTO;
+ }
+ put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
+
+ /* fix up the BCC */
+ byte_count = get_bcc(target_hdr);
+ byte_count += total_in_src;
+ /* is the result too big for the field? */
+ if (byte_count > USHRT_MAX) {
+ cifs_dbg(FYI, "coalesced BCC too large (%u)\n", byte_count);
+ return -EPROTO;
+ }
+ put_bcc(byte_count, target_hdr);
+
+ byte_count = be32_to_cpu(target_hdr->smb_buf_length);
+ byte_count += total_in_src;
+ /* don't allow buffer to overflow */
+ if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ cifs_dbg(FYI, "coalesced BCC exceeds buffer size (%u)\n",
+ byte_count);
+ return -ENOBUFS;
+ }
+ target_hdr->smb_buf_length = cpu_to_be32(byte_count);
+
+ /* copy second buffer into end of first buffer */
+ memcpy(data_area_of_tgt, data_area_of_src, total_in_src);
+
+ if (remaining != total_in_src) {
+ /* more responses to go */
+ cifs_dbg(FYI, "waiting for more secondary responses\n");
+ return 1;
+ }
+
+ /* we are done */
+ cifs_dbg(FYI, "found the last secondary response\n");
+ return 0;
+}
+
+static void
+cifs_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ if (set_level2)
+ cifs_set_oplock_level(cinode, OPLOCK_READ);
+ else
+ cifs_set_oplock_level(cinode, 0);
+}
+
+static bool
+cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+ char *buf, int malformed)
+{
+ if (malformed)
+ return false;
+ if (check2ndT2(buf) <= 0)
+ return false;
+ mid->multiRsp = true;
+ if (mid->resp_buf) {
+ /* merge response - fix up 1st*/
+ malformed = coalesce_t2(buf, mid->resp_buf);
+ if (malformed > 0)
+ return true;
+ /* All parts received or packet is malformed. */
+ mid->multiEnd = true;
+ dequeue_mid(mid, malformed);
+ return true;
+ }
+ if (!server->large_buf) {
+ /*FIXME: switch to already allocated largebuf?*/
+ cifs_dbg(VFS, "1st trans2 resp needs bigbuf\n");
+ } else {
+ /* Have first buffer */
+ mid->resp_buf = buf;
+ mid->large_buf = true;
+ server->bigbuf = NULL;
+ }
+ return true;
+}
+
+static bool
+cifs_need_neg(struct TCP_Server_Info *server)
+{
+ return server->maxBuf == 0;
+}
+
+static int
+cifs_negotiate(const unsigned int xid, struct cifs_ses *ses)
+{
+ int rc;
+ rc = CIFSSMBNegotiate(xid, ses);
+ if (rc == -EAGAIN) {
+ /* retry only once on 1st time connection */
+ set_credits(ses->server, 1);
+ rc = CIFSSMBNegotiate(xid, ses);
+ if (rc == -EAGAIN)
+ rc = -EHOSTDOWN;
+ }
+ return rc;
+}
+
+static unsigned int
+cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+ __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int wsize;
+
+ /* start with specified wsize, or default */
+ if (volume_info->wsize)
+ wsize = volume_info->wsize;
+ else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+ wsize = CIFS_DEFAULT_IOSIZE;
+ else
+ wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
+
+ /* can server support 24-bit write sizes? (via UNIX extensions) */
+ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+ wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE);
+
+ /*
+ * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set?
+ * Limit it to max buffer offered by the server, minus the size of the
+ * WRITEX header, not including the 4 byte RFC1001 length.
+ */
+ if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
+ (!(server->capabilities & CAP_UNIX) && server->sign))
+ wsize = min_t(unsigned int, wsize,
+ server->maxBuf - sizeof(WRITE_REQ) + 4);
+
+ /* hard limit of CIFS_MAX_WSIZE */
+ wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE);
+
+ return wsize;
+}
+
+static unsigned int
+cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+ __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int rsize, defsize;
+
+ /*
+ * Set default value...
+ *
+ * HACK alert! Ancient servers have very small buffers. Even though
+ * MS-CIFS indicates that servers are only limited by the client's
+ * bufsize for reads, testing against win98se shows that it throws
+ * INVALID_PARAMETER errors if you try to request too large a read.
+ * OS/2 just sends back short reads.
+ *
+ * If the server doesn't advertise CAP_LARGE_READ_X, then assume that
+ * it can't handle a read request larger than its MaxBufferSize either.
+ */
+ if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
+ defsize = CIFS_DEFAULT_IOSIZE;
+ else if (server->capabilities & CAP_LARGE_READ_X)
+ defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
+ else
+ defsize = server->maxBuf - sizeof(READ_RSP);
+
+ rsize = volume_info->rsize ? volume_info->rsize : defsize;
+
+ /*
+ * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
+ * the client's MaxBufferSize.
+ */
+ if (!(server->capabilities & CAP_LARGE_READ_X))
+ rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
+
+ /* hard limit of CIFS_MAX_RSIZE */
+ rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
+
+ return rsize;
+}
+
+static void
+cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ CIFSSMBQFSDeviceInfo(xid, tcon);
+ CIFSSMBQFSAttributeInfo(xid, tcon);
+}
+
+static int
+cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path)
+{
+ int rc;
+ FILE_ALL_INFO *file_info;
+
+ file_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (file_info == NULL)
+ return -ENOMEM;
+
+ rc = CIFSSMBQPathInfo(xid, tcon, full_path, file_info,
+ 0 /* not legacy */, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+ if (rc == -EOPNOTSUPP || rc == -EINVAL)
+ rc = SMBQueryInformation(xid, tcon, full_path, file_info,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ kfree(file_info);
+ return rc;
+}
+
+static int
+cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink)
+{
+ int rc;
+
+ *symlink = false;
+
+ /* could do find first instead but this returns more info */
+ rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ /*
+ * BB optimize code so we do not make the above call when server claims
+ * no NT SMB support and the above call failed at least once - set flag
+ * in tcon or mount.
+ */
+ if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
+ rc = SMBQueryInformation(xid, tcon, full_path, data,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ *adjustTZ = true;
+ }
+
+ if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) {
+ int tmprc;
+ int oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.create_options = 0;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ /* Need to check if this is a symbolic link or not */
+ tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (tmprc == -EOPNOTSUPP)
+ *symlink = true;
+ else
+ CIFSSMBClose(xid, tcon, fid.netfid);
+ }
+
+ return rc;
+}
+
+static int
+cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ u64 *uniqueid, FILE_ALL_INFO *data)
+{
+ /*
+ * We can not use the IndexNumber field by default from Windows or
+ * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA
+ * CIFS spec claims that this value is unique within the scope of a
+ * share, and the windows docs hint that it's actually unique
+ * per-machine.
+ *
+	 * There may be higher info levels that work, but are there Windows
+	 * servers or network appliances for which the IndexNumber field is
+	 * not guaranteed unique?
+ */
+ return CIFSGetSrvInodeNumber(xid, tcon, full_path, uniqueid,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+}
+
+static int
+cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid, FILE_ALL_INFO *data)
+{
+ return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
+}
+
+static void
+cifs_clear_stats(struct cifs_tcon *tcon)
+{
+#ifdef CONFIG_CIFS_STATS
+ atomic_set(&tcon->stats.cifs_stats.num_writes, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_reads, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_flushes, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_oplock_brks, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_opens, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_posixopens, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_posixmkdirs, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_closes, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_deletes, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_mkdirs, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_rmdirs, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_renames, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_t2renames, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_ffirst, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_fnext, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_fclose, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_hardlinks, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_symlinks, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_locks, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_acl_get, 0);
+ atomic_set(&tcon->stats.cifs_stats.num_acl_set, 0);
+#endif
+}
+
+static void
+cifs_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
+{
+#ifdef CONFIG_CIFS_STATS
+	seq_printf(m, " Oplock breaks: %d",
+ atomic_read(&tcon->stats.cifs_stats.num_oplock_brks));
+ seq_printf(m, "\nReads: %d Bytes: %llu",
+ atomic_read(&tcon->stats.cifs_stats.num_reads),
+ (long long)(tcon->bytes_read));
+ seq_printf(m, "\nWrites: %d Bytes: %llu",
+ atomic_read(&tcon->stats.cifs_stats.num_writes),
+ (long long)(tcon->bytes_written));
+ seq_printf(m, "\nFlushes: %d",
+ atomic_read(&tcon->stats.cifs_stats.num_flushes));
+ seq_printf(m, "\nLocks: %d HardLinks: %d Symlinks: %d",
+ atomic_read(&tcon->stats.cifs_stats.num_locks),
+ atomic_read(&tcon->stats.cifs_stats.num_hardlinks),
+ atomic_read(&tcon->stats.cifs_stats.num_symlinks));
+ seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d",
+ atomic_read(&tcon->stats.cifs_stats.num_opens),
+ atomic_read(&tcon->stats.cifs_stats.num_closes),
+ atomic_read(&tcon->stats.cifs_stats.num_deletes));
+ seq_printf(m, "\nPosix Opens: %d Posix Mkdirs: %d",
+ atomic_read(&tcon->stats.cifs_stats.num_posixopens),
+ atomic_read(&tcon->stats.cifs_stats.num_posixmkdirs));
+ seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
+ atomic_read(&tcon->stats.cifs_stats.num_mkdirs),
+ atomic_read(&tcon->stats.cifs_stats.num_rmdirs));
+ seq_printf(m, "\nRenames: %d T2 Renames %d",
+ atomic_read(&tcon->stats.cifs_stats.num_renames),
+ atomic_read(&tcon->stats.cifs_stats.num_t2renames));
+ seq_printf(m, "\nFindFirst: %d FNext %d FClose %d",
+ atomic_read(&tcon->stats.cifs_stats.num_ffirst),
+ atomic_read(&tcon->stats.cifs_stats.num_fnext),
+ atomic_read(&tcon->stats.cifs_stats.num_fclose));
+#endif
+}
+
+static void
+cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
+ struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
+ const unsigned int xid)
+{
+ FILE_BASIC_INFO info;
+ struct cifsInodeInfo *cifsInode;
+ u32 dosattrs;
+ int rc;
+
+ memset(&info, 0, sizeof(info));
+ cifsInode = CIFS_I(inode);
+ dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
+ info.Attributes = cpu_to_le32(dosattrs);
+ rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (rc == 0)
+ cifsInode->cifsAttrs = dosattrs;
+}
+
+static int
+cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+ __u32 *oplock, FILE_ALL_INFO *buf)
+{
+ if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
+ return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
+ oparms->disposition,
+ oparms->desired_access,
+ oparms->create_options,
+ &oparms->fid->netfid, oplock, buf,
+ oparms->cifs_sb->local_nls,
+ oparms->cifs_sb->mnt_cifs_flags
+ & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ return CIFS_open(xid, oparms, oplock, buf);
+}
+
+static void
+cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
+{
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ cfile->fid.netfid = fid->netfid;
+ cifs_set_oplock_level(cinode, oplock);
+ cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
+}
+
+static void
+cifs_close_file(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid)
+{
+ CIFSSMBClose(xid, tcon, fid->netfid);
+}
+
+static int
+cifs_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid)
+{
+ return CIFSSMBFlush(xid, tcon, fid->netfid);
+}
+
+static int
+cifs_sync_read(const unsigned int xid, struct cifsFileInfo *cfile,
+ struct cifs_io_parms *parms, unsigned int *bytes_read,
+ char **buf, int *buf_type)
+{
+ parms->netfid = cfile->fid.netfid;
+ return CIFSSMBRead(xid, parms, bytes_read, buf, buf_type);
+}
+
+static int
+cifs_sync_write(const unsigned int xid, struct cifsFileInfo *cfile,
+ struct cifs_io_parms *parms, unsigned int *written,
+ struct kvec *iov, unsigned long nr_segs)
+{
+
+ parms->netfid = cfile->fid.netfid;
+ return CIFSSMBWrite2(xid, parms, written, iov, nr_segs);
+}
+
+static int
+smb_set_file_info(struct inode *inode, const char *full_path,
+ FILE_BASIC_INFO *buf, const unsigned int xid)
+{
+ int oplock = 0;
+ int rc;
+ __u32 netpid;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ struct cifsFileInfo *open_file;
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct tcon_link *tlink = NULL;
+ struct cifs_tcon *tcon;
+
+ /* if the file is already open for write, just use that fileid */
+ open_file = find_writable_file(cinode, true);
+ if (open_file) {
+ fid.netfid = open_file->fid.netfid;
+ netpid = open_file->pid;
+ tcon = tlink_tcon(open_file->tlink);
+ goto set_via_filehandle;
+ }
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ tlink = NULL;
+ goto out;
+ }
+ tcon = tlink_tcon(tlink);
+
+ rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (rc == 0) {
+ cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
+ goto out;
+ } else if (rc != -EOPNOTSUPP && rc != -EINVAL) {
+ goto out;
+ }
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (rc != 0) {
+ if (rc == -EIO)
+ rc = -EINVAL;
+ goto out;
+ }
+
+ netpid = current->tgid;
+
+set_via_filehandle:
+ rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid);
+ if (!rc)
+ cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
+
+ if (open_file == NULL)
+ CIFSSMBClose(xid, tcon, fid.netfid);
+ else
+ cifsFileInfo_put(open_file);
+out:
+ if (tlink != NULL)
+ cifs_put_tlink(tlink);
+ return rc;
+}
+
+static int
+cifs_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile)
+{
+ return CIFSSMB_set_compression(xid, tcon, cfile->fid.netfid);
+}
+
+static int
+cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *path, struct cifs_sb_info *cifs_sb,
+ struct cifs_fid *fid, __u16 search_flags,
+ struct cifs_search_info *srch_inf)
+{
+ return CIFSFindFirst(xid, tcon, path, cifs_sb,
+ &fid->netfid, search_flags, srch_inf, true);
+}
+
+static int
+cifs_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid, __u16 search_flags,
+ struct cifs_search_info *srch_inf)
+{
+ return CIFSFindNext(xid, tcon, fid->netfid, search_flags, srch_inf);
+}
+
+static int
+cifs_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid)
+{
+ return CIFSFindClose(xid, tcon, fid->netfid);
+}
+
+static int
+cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
+ struct cifsInodeInfo *cinode)
+{
+ return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0,
+ LOCKING_ANDX_OPLOCK_RELEASE, false,
+ CIFS_CACHE_READ(cinode) ? 1 : 0);
+}
+
+static int
+cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
+ struct kstatfs *buf)
+{
+ int rc = -EOPNOTSUPP;
+
+ buf->f_type = CIFS_MAGIC_NUMBER;
+
+ /*
+ * We could add a second check for a QFS Unix capability bit
+ */
+ if ((tcon->ses->capabilities & CAP_UNIX) &&
+ (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability)))
+ rc = CIFSSMBQFSPosixInfo(xid, tcon, buf);
+
+ /*
+	 * Only need to call the old QFSInfo if the newer call failed,
+	 * e.g. on OS/2.
+	 */
+ if (rc && (tcon->ses->capabilities & CAP_NT_SMBS))
+ rc = CIFSSMBQFSInfo(xid, tcon, buf);
+
+ /*
+	 * Some old Windows servers also do not support level 103; retry with
+	 * the older level-one call if the server failed the previous call or
+	 * we bypassed it because we detected an older LANMAN session.
+ */
+ if (rc)
+ rc = SMBOldQFSInfo(xid, tcon, buf);
+ return rc;
+}
+
+static int
+cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
+ __u64 length, __u32 type, int lock, int unlock, bool wait)
+{
+ return CIFSSMBLock(xid, tlink_tcon(cfile->tlink), cfile->fid.netfid,
+ current->tgid, length, offset, unlock, lock,
+ (__u8)type, wait, 0);
+}
+
+static int
+cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName, char **symlinkinfo,
+ const struct nls_table *nls_codepage)
+{
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ int rc;
+ unsigned int num_referrals = 0;
+ struct dfs_info3_param *referrals = NULL;
+
+ rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage,
+ &num_referrals, &referrals, 0);
+
+ if (!rc && num_referrals > 0) {
+ *symlinkinfo = kstrndup(referrals->node_name,
+ strlen(referrals->node_name),
+ GFP_KERNEL);
+ if (!*symlinkinfo)
+ rc = -ENOMEM;
+ free_dfs_info_array(referrals, num_referrals);
+ }
+ return rc;
+#else /* No DFS support */
+ return -EREMOTE;
+#endif
+}
+
+static int
+cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *full_path, char **target_path,
+ struct cifs_sb_info *cifs_sb)
+{
+ int rc;
+ int oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+
+ /* Check for unix extensions */
+ if (cap_unix(tcon->ses)) {
+ rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
+ cifs_sb->local_nls);
+ if (rc == -EREMOTE)
+ rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
+ target_path,
+ cifs_sb->local_nls);
+
+ goto out;
+ }
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.create_options = OPEN_REPARSE_POINT;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (rc)
+ goto out;
+
+ rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path,
+ cifs_sb->local_nls);
+ if (rc)
+ goto out_close;
+
+ convert_delimiter(*target_path, '/');
+out_close:
+ CIFSSMBClose(xid, tcon, fid.netfid);
+out:
+ if (!rc)
+ cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+ return rc;
+}
+
+static bool
+cifs_is_read_op(__u32 oplock)
+{
+ return oplock == OPLOCK_READ;
+}
+
+struct smb_version_operations smb1_operations = {
+ .send_cancel = send_nt_cancel,
+ .compare_fids = cifs_compare_fids,
+ .setup_request = cifs_setup_request,
+ .setup_async_request = cifs_setup_async_request,
+ .check_receive = cifs_check_receive,
+ .add_credits = cifs_add_credits,
+ .set_credits = cifs_set_credits,
+ .get_credits_field = cifs_get_credits_field,
+ .get_credits = cifs_get_credits,
+ .get_next_mid = cifs_get_next_mid,
+ .read_data_offset = cifs_read_data_offset,
+ .read_data_length = cifs_read_data_length,
+ .map_error = map_smb_to_linux_error,
+ .find_mid = cifs_find_mid,
+ .check_message = checkSMB,
+ .dump_detail = cifs_dump_detail,
+ .clear_stats = cifs_clear_stats,
+ .print_stats = cifs_print_stats,
+ .is_oplock_break = is_valid_oplock_break,
+ .downgrade_oplock = cifs_downgrade_oplock,
+ .check_trans2 = cifs_check_trans2,
+ .need_neg = cifs_need_neg,
+ .negotiate = cifs_negotiate,
+ .negotiate_wsize = cifs_negotiate_wsize,
+ .negotiate_rsize = cifs_negotiate_rsize,
+ .sess_setup = CIFS_SessSetup,
+ .logoff = CIFSSMBLogoff,
+ .tree_connect = CIFSTCon,
+ .tree_disconnect = CIFSSMBTDis,
+ .get_dfs_refer = CIFSGetDFSRefer,
+ .qfs_tcon = cifs_qfs_tcon,
+ .is_path_accessible = cifs_is_path_accessible,
+ .query_path_info = cifs_query_path_info,
+ .query_file_info = cifs_query_file_info,
+ .get_srv_inum = cifs_get_srv_inum,
+ .set_path_size = CIFSSMBSetEOF,
+ .set_file_size = CIFSSMBSetFileSize,
+ .set_file_info = smb_set_file_info,
+ .set_compression = cifs_set_compression,
+ .echo = CIFSSMBEcho,
+ .mkdir = CIFSSMBMkDir,
+ .mkdir_setinfo = cifs_mkdir_setinfo,
+ .rmdir = CIFSSMBRmDir,
+ .unlink = CIFSSMBDelFile,
+ .rename_pending_delete = cifs_rename_pending_delete,
+ .rename = CIFSSMBRename,
+ .create_hardlink = CIFSCreateHardLink,
+ .query_symlink = cifs_query_symlink,
+ .open = cifs_open_file,
+ .set_fid = cifs_set_fid,
+ .close = cifs_close_file,
+ .flush = cifs_flush_file,
+ .async_readv = cifs_async_readv,
+ .async_writev = cifs_async_writev,
+ .sync_read = cifs_sync_read,
+ .sync_write = cifs_sync_write,
+ .query_dir_first = cifs_query_dir_first,
+ .query_dir_next = cifs_query_dir_next,
+ .close_dir = cifs_close_dir,
+ .calc_smb_size = smbCalcSize,
+ .oplock_response = cifs_oplock_response,
+ .queryfs = cifs_queryfs,
+ .mand_lock = cifs_mand_lock,
+ .mand_unlock_range = cifs_unlock_range,
+ .push_mand_locks = cifs_push_mandatory_locks,
+ .query_mf_symlink = cifs_query_mf_symlink,
+ .create_mf_symlink = cifs_create_mf_symlink,
+ .is_read_op = cifs_is_read_op,
+#ifdef CONFIG_CIFS_XATTR
+ .query_all_EAs = CIFSSMBQAllEAs,
+ .set_EA = CIFSSMBSetEA,
+#endif /* CIFS_XATTR */
+#ifdef CONFIG_CIFS_ACL
+ .get_acl = get_cifs_acl,
+ .get_acl_by_fid = get_cifs_acl_by_fid,
+ .set_acl = set_cifs_acl,
+#endif /* CIFS_ACL */
+};
+
+struct smb_version_values smb1_values = {
+ .version_string = SMB1_VERSION_STRING,
+ .large_lock_type = LOCKING_ANDX_LARGE_FILES,
+ .exclusive_lock_type = 0,
+ .shared_lock_type = LOCKING_ANDX_SHARED_LOCK,
+ .unlock_lock_type = 0,
+ .header_size = sizeof(struct smb_hdr),
+ .max_header_size = MAX_CIFS_HDR_SIZE,
+ .read_rsp_size = sizeof(READ_RSP),
+ .lock_cmd = cpu_to_le16(SMB_COM_LOCKING_ANDX),
+ .cap_unix = CAP_UNIX,
+ .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
+ .cap_large_files = CAP_LARGE_FILES,
+ .signing_enabled = SECMODE_SIGN_ENABLED,
+ .signing_required = SECMODE_SIGN_REQUIRED,
+};
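
The smb1_operations and smb1_values tables above collect every SMB1-specific entry point and constant behind a single structure, so code elsewhere in the client can stay dialect-agnostic and reach the right handler through one pointer indirection. A simplified sketch of that dispatch pattern follows; the struct layout and function names here are illustrative only, not the real TCP_Server_Info definition.

/* Stand-in for the version-dispatch pattern that smb1_operations plugs into. */
struct sketch_ops {
	int (*negotiate)(unsigned int xid, void *ses);
	int (*sess_setup)(unsigned int xid, void *ses, const void *nls);
};

struct sketch_server {
	const struct sketch_ops *ops;	/* installed once the dialect is chosen */
};

/* Generic connection code never calls CIFSSMBNegotiate or CIFS_SessSetup by
 * name; it always goes through the ops table for the negotiated dialect. */
static int sketch_connect(struct sketch_server *server, unsigned int xid,
			  void *ses, const void *nls)
{
	int rc = server->ops->negotiate(xid, ses);

	if (rc)
		return rc;
	return server->ops->sess_setup(xid, ses, nls);
}
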
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
new file mode 100644
index 00000000000..3f17b455083
--- /dev/null
+++ b/fs/cifs/smb2file.c
@@ -0,0 +1,265 @@
+/*
+ * fs/cifs/smb2file.c
+ *
+ * Copyright (C) International Business Machines Corp., 2002, 2011
+ * Author(s): Steve French (sfrench@us.ibm.com),
+ * Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+#include "fscache.h"
+#include "smb2proto.h"
+
+int
+smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+ __u32 *oplock, FILE_ALL_INFO *buf)
+{
+ int rc;
+ __le16 *smb2_path;
+ struct smb2_file_all_info *smb2_data = NULL;
+ __u8 smb2_oplock[17];
+ struct cifs_fid *fid = oparms->fid;
+
+ smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
+ if (smb2_path == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ GFP_KERNEL);
+ if (smb2_data == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ *smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
+
+ if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
+ memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);
+
+ rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data, NULL);
+ if (rc)
+ goto out;
+
+ if (buf) {
+ /* open response does not have IndexNumber field - get it */
+ rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
+ fid->volatile_fid,
+ &smb2_data->IndexNumber);
+ if (rc) {
+ /* let get_inode_info disable server inode numbers */
+ smb2_data->IndexNumber = 0;
+ rc = 0;
+ }
+ move_smb2_info_to_cifs(buf, smb2_data);
+ }
+
+ *oplock = *smb2_oplock;
+out:
+ kfree(smb2_data);
+ kfree(smb2_path);
+ return rc;
+}
+
+int
+smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
+ const unsigned int xid)
+{
+ int rc = 0, stored_rc;
+ unsigned int max_num, num = 0, max_buf;
+ struct smb2_lock_element *buf, *cur;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsLockInfo *li, *tmp;
+ __u64 length = 1 + flock->fl_end - flock->fl_start;
+ struct list_head tmp_llist;
+
+ INIT_LIST_HEAD(&tmp_llist);
+
+ /*
+ * Accessing maxBuf is racy with cifs_reconnect - need to store value
+ * and check it for zero before using.
+ */
+ max_buf = tcon->ses->server->maxBuf;
+ if (!max_buf)
+ return -EINVAL;
+
+ max_num = max_buf / sizeof(struct smb2_lock_element);
+ buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ cur = buf;
+
+ down_write(&cinode->lock_sem);
+ list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
+ if (flock->fl_start > li->offset ||
+ (flock->fl_start + length) <
+ (li->offset + li->length))
+ continue;
+ if (current->tgid != li->pid)
+ continue;
+ if (cinode->can_cache_brlcks) {
+ /*
+ * We can cache brlock requests - simply remove a lock
+ * from the file's list.
+ */
+ list_del(&li->llist);
+ cifs_del_lock_waiters(li);
+ kfree(li);
+ continue;
+ }
+ cur->Length = cpu_to_le64(li->length);
+ cur->Offset = cpu_to_le64(li->offset);
+ cur->Flags = cpu_to_le32(SMB2_LOCKFLAG_UNLOCK);
+ /*
+ * We need to save a lock here to let us add it again to the
+ * file's list if the unlock range request fails on the server.
+ */
+ list_move(&li->llist, &tmp_llist);
+ if (++num == max_num) {
+ stored_rc = smb2_lockv(xid, tcon,
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid, num, buf);
+ if (stored_rc) {
+ /*
+ * We failed on the unlock range request - add
+ * all locks from the tmp list to the head of
+ * the file's list.
+ */
+ cifs_move_llist(&tmp_llist,
+ &cfile->llist->locks);
+ rc = stored_rc;
+ } else
+ /*
+				 * The unlock range request succeeded - free the
+ * tmp list.
+ */
+ cifs_free_llist(&tmp_llist);
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
+ }
+ if (num) {
+ stored_rc = smb2_lockv(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, current->tgid,
+ num, buf);
+ if (stored_rc) {
+ cifs_move_llist(&tmp_llist, &cfile->llist->locks);
+ rc = stored_rc;
+ } else
+ cifs_free_llist(&tmp_llist);
+ }
+ up_write(&cinode->lock_sem);
+
+ kfree(buf);
+ return rc;
+}
+
+static int
+smb2_push_mand_fdlocks(struct cifs_fid_locks *fdlocks, const unsigned int xid,
+ struct smb2_lock_element *buf, unsigned int max_num)
+{
+ int rc = 0, stored_rc;
+ struct cifsFileInfo *cfile = fdlocks->cfile;
+ struct cifsLockInfo *li;
+ unsigned int num = 0;
+ struct smb2_lock_element *cur = buf;
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+
+ list_for_each_entry(li, &fdlocks->locks, llist) {
+ cur->Length = cpu_to_le64(li->length);
+ cur->Offset = cpu_to_le64(li->offset);
+ cur->Flags = cpu_to_le32(li->type |
+ SMB2_LOCKFLAG_FAIL_IMMEDIATELY);
+ if (++num == max_num) {
+ stored_rc = smb2_lockv(xid, tcon,
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid, num, buf);
+ if (stored_rc)
+ rc = stored_rc;
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
+ }
+ if (num) {
+ stored_rc = smb2_lockv(xid, tcon,
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid, num, buf);
+ if (stored_rc)
+ rc = stored_rc;
+ }
+
+ return rc;
+}
+
+int
+smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
+{
+ int rc = 0, stored_rc;
+ unsigned int xid;
+ unsigned int max_num, max_buf;
+ struct smb2_lock_element *buf;
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifs_fid_locks *fdlocks;
+
+ xid = get_xid();
+
+ /*
+ * Accessing maxBuf is racy with cifs_reconnect - need to store value
+ * and check it for zero before using.
+ */
+ max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf;
+ if (!max_buf) {
+ free_xid(xid);
+ return -EINVAL;
+ }
+
+ max_num = max_buf / sizeof(struct smb2_lock_element);
+ buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
+ if (!buf) {
+ free_xid(xid);
+ return -ENOMEM;
+ }
+
+ list_for_each_entry(fdlocks, &cinode->llist, llist) {
+ stored_rc = smb2_push_mand_fdlocks(fdlocks, xid, buf, max_num);
+ if (stored_rc)
+ rc = stored_rc;
+ }
+
+ kfree(buf);
+ free_xid(xid);
+ return rc;
+}
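
Both smb2_unlock_range() and smb2_push_mand_fdlocks() above rely on the same batching idea: pack lock elements into a buffer sized from the server's maxBuf, send a full buffer with smb2_lockv(), reset the cursor, and send whatever is left over at the end. The standalone sketch below (plain userspace C, with a stubbed send_batch() standing in for smb2_lockv(); every name in it is purely illustrative and not part of the cifs code) isolates that pattern so it can be read apart from the lock-list handling.

/*
 * Illustrative, self-contained sketch of the batching pattern used by
 * smb2_unlock_range()/smb2_push_mand_fdlocks(): accumulate up to max_num
 * elements in a fixed buffer, flush when full, flush the remainder at the
 * end. send_batch() is a stand-in for smb2_lockv(); nothing here is cifs API.
 */
#include <stdio.h>
#include <stdlib.h>

struct lock_element {                  /* stands in for struct smb2_lock_element */
	unsigned long long offset;
	unsigned long long length;
};

static int send_batch(const struct lock_element *buf, unsigned int num)
{
	printf("sending batch of %u lock element(s)\n", num);
	return 0;                      /* pretend the server accepted the batch */
}

int main(void)
{
	const unsigned int max_buf = 4096;     /* e.g. server->maxBuf */
	unsigned int max_num = max_buf / sizeof(struct lock_element);
	struct lock_element *buf = calloc(max_num, sizeof(*buf));
	struct lock_element *cur = buf;
	unsigned int num = 0;
	int rc = 0, stored_rc;

	if (!buf)
		return 1;

	for (unsigned long long i = 0; i < 1000; i++) {
		cur->offset = i * 4096;
		cur->length = 4096;
		if (++num == max_num) {        /* buffer full: flush it */
			stored_rc = send_batch(buf, num);
			if (stored_rc)
				rc = stored_rc;
			cur = buf;
			num = 0;
		} else {
			cur++;
		}
	}
	if (num) {                             /* flush the remainder */
		stored_rc = send_batch(buf, num);
		if (stored_rc)
			rc = stored_rc;
	}
	free(buf);
	return rc;
}

Keeping the flush-on-full branch and the final flush-remainder branch identical, as the driver code does, is what keeps the per-element loop body small.
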
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
new file mode 100644
index 00000000000..bc0bb9c34f7
--- /dev/null
+++ b/fs/cifs/smb2glob.h
@@ -0,0 +1,63 @@
+/*
+ * fs/cifs/smb2glob.h
+ *
+ * Definitions for various global variables and structures
+ *
+ * Copyright (C) International Business Machines Corp., 2002, 2011
+ * Etersoft, 2012
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Jeremy Allison (jra@samba.org)
+ * Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ */
+#ifndef _SMB2_GLOB_H
+#define _SMB2_GLOB_H
+
+#define SMB2_MAGIC_NUMBER 0xFE534D42
+
+/*
+ *****************************************************************
+ * Constants go here
+ *****************************************************************
+ */
+
+/*
+ * Identifiers for functions that use the open, operation, close pattern
+ * in smb2inode.c:smb2_open_op_close()
+ */
+#define SMB2_OP_SET_DELETE 1
+#define SMB2_OP_SET_INFO 2
+#define SMB2_OP_QUERY_INFO 3
+#define SMB2_OP_QUERY_DIR 4
+#define SMB2_OP_MKDIR 5
+#define SMB2_OP_RENAME 6
+#define SMB2_OP_DELETE 7
+#define SMB2_OP_HARDLINK 8
+#define SMB2_OP_SET_EOF 9
+
+/* Used when constructing chained read requests. */
+#define CHAINED_REQUEST 1
+#define START_OF_CHAIN 2
+#define END_OF_CHAIN 4
+#define RELATED_REQUEST 8
+
+#define SMB2_SIGNATURE_SIZE (16)
+#define SMB2_NTLMV2_SESSKEY_SIZE (16)
+#define SMB2_HMACSHA256_SIZE (32)
+#define SMB2_CMACAES_SIZE (16)
+#define SMB3_SIGNKEY_SIZE (16)
+
+/* Maximum buffer size value we can send with 1 credit */
+#define SMB2_MAX_BUFFER_SIZE 65536
+
+#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
new file mode 100644
index 00000000000..84c012a6aba
--- /dev/null
+++ b/fs/cifs/smb2inode.c
@@ -0,0 +1,273 @@
+/*
+ * fs/cifs/smb2inode.c
+ *
+ * Copyright (C) International Business Machines Corp., 2002, 2011
+ * Etersoft, 2012
+ * Author(s): Pavel Shilovsky (pshilovsky@samba.org),
+ * Steve French (sfrench@us.ibm.com)
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include "cifsfs.h"
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+#include "fscache.h"
+#include "smb2glob.h"
+#include "smb2pdu.h"
+#include "smb2proto.h"
+
+static int
+smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ __u32 desired_access, __u32 create_disposition,
+ __u32 create_options, void *data, int command)
+{
+ int rc, tmprc = 0;
+ __le16 *utf16_path;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+
+ utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = desired_access;
+ oparms.disposition = create_disposition;
+ oparms.create_options = create_options;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ if (rc) {
+ kfree(utf16_path);
+ return rc;
+ }
+
+ switch (command) {
+ case SMB2_OP_DELETE:
+ break;
+ case SMB2_OP_QUERY_INFO:
+ tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid,
+ (struct smb2_file_all_info *)data);
+ break;
+ case SMB2_OP_MKDIR:
+ /*
+ * Directories are created through parameters in the
+ * SMB2_open() call.
+ */
+ break;
+ case SMB2_OP_RENAME:
+ tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, (__le16 *)data);
+ break;
+ case SMB2_OP_HARDLINK:
+ tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, (__le16 *)data);
+ break;
+ case SMB2_OP_SET_EOF:
+ tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, current->tgid,
+ (__le64 *)data);
+ break;
+ case SMB2_OP_SET_INFO:
+ tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid,
+ (FILE_BASIC_INFO *)data);
+ break;
+ default:
+ cifs_dbg(VFS, "Invalid command\n");
+ break;
+ }
+
+ rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ if (tmprc)
+ rc = tmprc;
+ kfree(utf16_path);
+ return rc;
+}
+
+void
+move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src)
+{
+ memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src);
+ dst->CurrentByteOffset = src->CurrentByteOffset;
+ dst->Mode = src->Mode;
+ dst->AlignmentRequirement = src->AlignmentRequirement;
+ dst->IndexNumber1 = 0; /* we don't use it */
+}
+
+int
+smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ FILE_ALL_INFO *data, bool *adjust_tz, bool *symlink)
+{
+ int rc;
+ struct smb2_file_all_info *smb2_data;
+
+ *adjust_tz = false;
+ *symlink = false;
+
+ smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ GFP_KERNEL);
+ if (smb2_data == NULL)
+ return -ENOMEM;
+
+ rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN, 0,
+ smb2_data, SMB2_OP_QUERY_INFO);
+ if (rc == -EOPNOTSUPP) {
+ *symlink = true;
+		/* Failed on a symbolic link - query reparse point info */
+ rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ OPEN_REPARSE_POINT, smb2_data,
+ SMB2_OP_QUERY_INFO);
+ }
+ if (rc)
+ goto out;
+
+ move_smb2_info_to_cifs(data, smb2_data);
+out:
+ kfree(smb2_data);
+ return rc;
+}
+
+int
+smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+ struct cifs_sb_info *cifs_sb)
+{
+ return smb2_open_op_close(xid, tcon, cifs_sb, name,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+ CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
+}
+
+void
+smb2_mkdir_setinfo(struct inode *inode, const char *name,
+ struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
+ const unsigned int xid)
+{
+ FILE_BASIC_INFO data;
+ struct cifsInodeInfo *cifs_i;
+ u32 dosattrs;
+ int tmprc;
+
+ memset(&data, 0, sizeof(data));
+ cifs_i = CIFS_I(inode);
+ dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
+ data.Attributes = cpu_to_le32(dosattrs);
+ tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+ CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
+ if (tmprc == 0)
+ cifs_i->cifsAttrs = dosattrs;
+}
+
+int
+smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+ struct cifs_sb_info *cifs_sb)
+{
+ return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+ CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
+ NULL, SMB2_OP_DELETE);
+}
+
+int
+smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+ struct cifs_sb_info *cifs_sb)
+{
+ return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+ CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
+ NULL, SMB2_OP_DELETE);
+}
+
+static int
+smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb, __u32 access, int command)
+{
+ __le16 *smb2_to_name = NULL;
+ int rc;
+
+ smb2_to_name = cifs_convert_path_to_utf16(to_name, cifs_sb);
+ if (smb2_to_name == NULL) {
+ rc = -ENOMEM;
+ goto smb2_rename_path;
+ }
+
+ rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
+ FILE_OPEN, 0, smb2_to_name, command);
+smb2_rename_path:
+ kfree(smb2_to_name);
+ return rc;
+}
+
+int
+smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
+{
+ return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+ DELETE, SMB2_OP_RENAME);
+}
+
+int
+smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
+{
+ return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+ FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK);
+}
+
+int
+smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *full_path, __u64 size,
+ struct cifs_sb_info *cifs_sb, bool set_alloc)
+{
+ __le64 eof = cpu_to_le64(size);
+ return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
+ FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
+ SMB2_OP_SET_EOF);
+}
+
+int
+smb2_set_file_info(struct inode *inode, const char *full_path,
+ FILE_BASIC_INFO *buf, const unsigned int xid)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct tcon_link *tlink;
+ int rc;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
+ FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
+ SMB2_OP_SET_INFO);
+ cifs_put_tlink(tlink);
+ return rc;
+}
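
The next file introduces a large translation table from SMB2/NT status codes to POSIX errno values. As a minimal sketch, the self-contained program below shows one way such a table can be consulted: a linear scan keyed on the status code, falling back to -EIO for anything unmapped. The four-entry table and the lookup_posix_error() helper are illustrative assumptions only, not the cifs code being added here.

/*
 * Standalone illustration of a status -> errno lookup over a table shaped
 * like smb2_error_map_table below. The table is a tiny excerpt and
 * lookup_posix_error() is a hypothetical name, not a cifs function.
 */
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct status_to_posix_error {
	uint32_t smb2_status;
	int posix_error;
	const char *status_string;
};

static const struct status_to_posix_error error_map[] = {
	{0x00000000, 0,        "STATUS_SUCCESS"},
	{0xC0000022, -EACCES,  "STATUS_ACCESS_DENIED"},
	{0xC0000034, -ENOENT,  "STATUS_OBJECT_NAME_NOT_FOUND"},
	{0xC00000BA, -EISDIR,  "STATUS_FILE_IS_A_DIRECTORY"},
};

/* Linear scan; fall back to -EIO for any code not in the table. */
static int lookup_posix_error(uint32_t status)
{
	for (size_t i = 0; i < sizeof(error_map) / sizeof(error_map[0]); i++)
		if (error_map[i].smb2_status == status)
			return error_map[i].posix_error;
	return -EIO;
}

int main(void)
{
	printf("0xC0000022 -> %d\n", lookup_posix_error(0xC0000022)); /* -EACCES */
	printf("0xDEADBEEF -> %d\n", lookup_posix_error(0xDEADBEEF)); /* -EIO */
	return 0;
}

Keeping the textual status name alongside each entry, as the table below does, lets unusual codes be reported by name rather than only by raw value.
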
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
new file mode 100644
index 00000000000..94bd4fbb13d
--- /dev/null
+++ b/fs/cifs/smb2maperror.c
@@ -0,0 +1,2479 @@
+/*
+ * fs/cifs/smb2maperror.c
+ *
+ * Functions which do error mapping of SMB2 status codes to POSIX errors
+ *
+ * Copyright (C) International Business Machines Corp., 2009
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/errno.h>
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "smb2pdu.h"
+#include "smb2proto.h"
+#include "smb2status.h"
+
+struct status_to_posix_error {
+ __le32 smb2_status;
+ int posix_error;
+ char *status_string;
+};
+
+static const struct status_to_posix_error smb2_error_map_table[] = {
+ {STATUS_SUCCESS, 0, "STATUS_SUCCESS"},
+ {STATUS_WAIT_0, 0, "STATUS_WAIT_0"},
+ {STATUS_WAIT_1, -EIO, "STATUS_WAIT_1"},
+ {STATUS_WAIT_2, -EIO, "STATUS_WAIT_2"},
+ {STATUS_WAIT_3, -EIO, "STATUS_WAIT_3"},
+ {STATUS_WAIT_63, -EIO, "STATUS_WAIT_63"},
+ {STATUS_ABANDONED, -EIO, "STATUS_ABANDONED"},
+ {STATUS_ABANDONED_WAIT_0, -EIO, "STATUS_ABANDONED_WAIT_0"},
+ {STATUS_ABANDONED_WAIT_63, -EIO, "STATUS_ABANDONED_WAIT_63"},
+ {STATUS_USER_APC, -EIO, "STATUS_USER_APC"},
+ {STATUS_KERNEL_APC, -EIO, "STATUS_KERNEL_APC"},
+ {STATUS_ALERTED, -EIO, "STATUS_ALERTED"},
+ {STATUS_TIMEOUT, -ETIMEDOUT, "STATUS_TIMEOUT"},
+ {STATUS_PENDING, -EIO, "STATUS_PENDING"},
+ {STATUS_REPARSE, -EIO, "STATUS_REPARSE"},
+ {STATUS_MORE_ENTRIES, -EIO, "STATUS_MORE_ENTRIES"},
+ {STATUS_NOT_ALL_ASSIGNED, -EIO, "STATUS_NOT_ALL_ASSIGNED"},
+ {STATUS_SOME_NOT_MAPPED, -EIO, "STATUS_SOME_NOT_MAPPED"},
+ {STATUS_OPLOCK_BREAK_IN_PROGRESS, -EIO,
+ "STATUS_OPLOCK_BREAK_IN_PROGRESS"},
+ {STATUS_VOLUME_MOUNTED, -EIO, "STATUS_VOLUME_MOUNTED"},
+ {STATUS_RXACT_COMMITTED, -EIO, "STATUS_RXACT_COMMITTED"},
+ {STATUS_NOTIFY_CLEANUP, -EIO, "STATUS_NOTIFY_CLEANUP"},
+ {STATUS_NOTIFY_ENUM_DIR, -EIO, "STATUS_NOTIFY_ENUM_DIR"},
+ {STATUS_NO_QUOTAS_FOR_ACCOUNT, -EIO, "STATUS_NO_QUOTAS_FOR_ACCOUNT"},
+ {STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED, -EIO,
+ "STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED"},
+ {STATUS_PAGE_FAULT_TRANSITION, -EIO, "STATUS_PAGE_FAULT_TRANSITION"},
+ {STATUS_PAGE_FAULT_DEMAND_ZERO, -EIO, "STATUS_PAGE_FAULT_DEMAND_ZERO"},
+ {STATUS_PAGE_FAULT_COPY_ON_WRITE, -EIO,
+ "STATUS_PAGE_FAULT_COPY_ON_WRITE"},
+ {STATUS_PAGE_FAULT_GUARD_PAGE, -EIO, "STATUS_PAGE_FAULT_GUARD_PAGE"},
+ {STATUS_PAGE_FAULT_PAGING_FILE, -EIO, "STATUS_PAGE_FAULT_PAGING_FILE"},
+ {STATUS_CACHE_PAGE_LOCKED, -EIO, "STATUS_CACHE_PAGE_LOCKED"},
+ {STATUS_CRASH_DUMP, -EIO, "STATUS_CRASH_DUMP"},
+ {STATUS_BUFFER_ALL_ZEROS, -EIO, "STATUS_BUFFER_ALL_ZEROS"},
+ {STATUS_REPARSE_OBJECT, -EIO, "STATUS_REPARSE_OBJECT"},
+ {STATUS_RESOURCE_REQUIREMENTS_CHANGED, -EIO,
+ "STATUS_RESOURCE_REQUIREMENTS_CHANGED"},
+ {STATUS_TRANSLATION_COMPLETE, -EIO, "STATUS_TRANSLATION_COMPLETE"},
+ {STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY, -EIO,
+ "STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY"},
+ {STATUS_NOTHING_TO_TERMINATE, -EIO, "STATUS_NOTHING_TO_TERMINATE"},
+ {STATUS_PROCESS_NOT_IN_JOB, -EIO, "STATUS_PROCESS_NOT_IN_JOB"},
+ {STATUS_PROCESS_IN_JOB, -EIO, "STATUS_PROCESS_IN_JOB"},
+ {STATUS_VOLSNAP_HIBERNATE_READY, -EIO,
+ "STATUS_VOLSNAP_HIBERNATE_READY"},
+ {STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY, -EIO,
+ "STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY"},
+ {STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED, -EIO,
+ "STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED"},
+ {STATUS_INTERRUPT_STILL_CONNECTED, -EIO,
+ "STATUS_INTERRUPT_STILL_CONNECTED"},
+ {STATUS_PROCESS_CLONED, -EIO, "STATUS_PROCESS_CLONED"},
+ {STATUS_FILE_LOCKED_WITH_ONLY_READERS, -EIO,
+ "STATUS_FILE_LOCKED_WITH_ONLY_READERS"},
+ {STATUS_FILE_LOCKED_WITH_WRITERS, -EIO,
+ "STATUS_FILE_LOCKED_WITH_WRITERS"},
+ {STATUS_RESOURCEMANAGER_READ_ONLY, -EROFS,
+ "STATUS_RESOURCEMANAGER_READ_ONLY"},
+ {STATUS_WAIT_FOR_OPLOCK, -EIO, "STATUS_WAIT_FOR_OPLOCK"},
+ {DBG_EXCEPTION_HANDLED, -EIO, "DBG_EXCEPTION_HANDLED"},
+ {DBG_CONTINUE, -EIO, "DBG_CONTINUE"},
+ {STATUS_FLT_IO_COMPLETE, -EIO, "STATUS_FLT_IO_COMPLETE"},
+ {STATUS_OBJECT_NAME_EXISTS, -EIO, "STATUS_OBJECT_NAME_EXISTS"},
+ {STATUS_THREAD_WAS_SUSPENDED, -EIO, "STATUS_THREAD_WAS_SUSPENDED"},
+ {STATUS_WORKING_SET_LIMIT_RANGE, -EIO,
+ "STATUS_WORKING_SET_LIMIT_RANGE"},
+ {STATUS_IMAGE_NOT_AT_BASE, -EIO, "STATUS_IMAGE_NOT_AT_BASE"},
+ {STATUS_RXACT_STATE_CREATED, -EIO, "STATUS_RXACT_STATE_CREATED"},
+ {STATUS_SEGMENT_NOTIFICATION, -EIO, "STATUS_SEGMENT_NOTIFICATION"},
+ {STATUS_LOCAL_USER_SESSION_KEY, -EIO, "STATUS_LOCAL_USER_SESSION_KEY"},
+ {STATUS_BAD_CURRENT_DIRECTORY, -EIO, "STATUS_BAD_CURRENT_DIRECTORY"},
+ {STATUS_SERIAL_MORE_WRITES, -EIO, "STATUS_SERIAL_MORE_WRITES"},
+ {STATUS_REGISTRY_RECOVERED, -EIO, "STATUS_REGISTRY_RECOVERED"},
+ {STATUS_FT_READ_RECOVERY_FROM_BACKUP, -EIO,
+ "STATUS_FT_READ_RECOVERY_FROM_BACKUP"},
+ {STATUS_FT_WRITE_RECOVERY, -EIO, "STATUS_FT_WRITE_RECOVERY"},
+ {STATUS_SERIAL_COUNTER_TIMEOUT, -ETIMEDOUT,
+ "STATUS_SERIAL_COUNTER_TIMEOUT"},
+ {STATUS_NULL_LM_PASSWORD, -EIO, "STATUS_NULL_LM_PASSWORD"},
+ {STATUS_IMAGE_MACHINE_TYPE_MISMATCH, -EIO,
+ "STATUS_IMAGE_MACHINE_TYPE_MISMATCH"},
+ {STATUS_RECEIVE_PARTIAL, -EIO, "STATUS_RECEIVE_PARTIAL"},
+ {STATUS_RECEIVE_EXPEDITED, -EIO, "STATUS_RECEIVE_EXPEDITED"},
+ {STATUS_RECEIVE_PARTIAL_EXPEDITED, -EIO,
+ "STATUS_RECEIVE_PARTIAL_EXPEDITED"},
+ {STATUS_EVENT_DONE, -EIO, "STATUS_EVENT_DONE"},
+ {STATUS_EVENT_PENDING, -EIO, "STATUS_EVENT_PENDING"},
+ {STATUS_CHECKING_FILE_SYSTEM, -EIO, "STATUS_CHECKING_FILE_SYSTEM"},
+ {STATUS_FATAL_APP_EXIT, -EIO, "STATUS_FATAL_APP_EXIT"},
+ {STATUS_PREDEFINED_HANDLE, -EIO, "STATUS_PREDEFINED_HANDLE"},
+ {STATUS_WAS_UNLOCKED, -EIO, "STATUS_WAS_UNLOCKED"},
+ {STATUS_SERVICE_NOTIFICATION, -EIO, "STATUS_SERVICE_NOTIFICATION"},
+ {STATUS_WAS_LOCKED, -EIO, "STATUS_WAS_LOCKED"},
+ {STATUS_LOG_HARD_ERROR, -EIO, "STATUS_LOG_HARD_ERROR"},
+ {STATUS_ALREADY_WIN32, -EIO, "STATUS_ALREADY_WIN32"},
+ {STATUS_WX86_UNSIMULATE, -EIO, "STATUS_WX86_UNSIMULATE"},
+ {STATUS_WX86_CONTINUE, -EIO, "STATUS_WX86_CONTINUE"},
+ {STATUS_WX86_SINGLE_STEP, -EIO, "STATUS_WX86_SINGLE_STEP"},
+ {STATUS_WX86_BREAKPOINT, -EIO, "STATUS_WX86_BREAKPOINT"},
+ {STATUS_WX86_EXCEPTION_CONTINUE, -EIO,
+ "STATUS_WX86_EXCEPTION_CONTINUE"},
+ {STATUS_WX86_EXCEPTION_LASTCHANCE, -EIO,
+ "STATUS_WX86_EXCEPTION_LASTCHANCE"},
+ {STATUS_WX86_EXCEPTION_CHAIN, -EIO, "STATUS_WX86_EXCEPTION_CHAIN"},
+ {STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE, -EIO,
+ "STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE"},
+ {STATUS_NO_YIELD_PERFORMED, -EIO, "STATUS_NO_YIELD_PERFORMED"},
+ {STATUS_TIMER_RESUME_IGNORED, -EIO, "STATUS_TIMER_RESUME_IGNORED"},
+ {STATUS_ARBITRATION_UNHANDLED, -EIO, "STATUS_ARBITRATION_UNHANDLED"},
+ {STATUS_CARDBUS_NOT_SUPPORTED, -ENOSYS, "STATUS_CARDBUS_NOT_SUPPORTED"},
+ {STATUS_WX86_CREATEWX86TIB, -EIO, "STATUS_WX86_CREATEWX86TIB"},
+ {STATUS_MP_PROCESSOR_MISMATCH, -EIO, "STATUS_MP_PROCESSOR_MISMATCH"},
+ {STATUS_HIBERNATED, -EIO, "STATUS_HIBERNATED"},
+ {STATUS_RESUME_HIBERNATION, -EIO, "STATUS_RESUME_HIBERNATION"},
+ {STATUS_FIRMWARE_UPDATED, -EIO, "STATUS_FIRMWARE_UPDATED"},
+ {STATUS_DRIVERS_LEAKING_LOCKED_PAGES, -EIO,
+ "STATUS_DRIVERS_LEAKING_LOCKED_PAGES"},
+ {STATUS_MESSAGE_RETRIEVED, -EIO, "STATUS_MESSAGE_RETRIEVED"},
+ {STATUS_SYSTEM_POWERSTATE_TRANSITION, -EIO,
+ "STATUS_SYSTEM_POWERSTATE_TRANSITION"},
+ {STATUS_ALPC_CHECK_COMPLETION_LIST, -EIO,
+ "STATUS_ALPC_CHECK_COMPLETION_LIST"},
+ {STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION, -EIO,
+ "STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION"},
+ {STATUS_ACCESS_AUDIT_BY_POLICY, -EIO, "STATUS_ACCESS_AUDIT_BY_POLICY"},
+ {STATUS_ABANDON_HIBERFILE, -EIO, "STATUS_ABANDON_HIBERFILE"},
+ {STATUS_BIZRULES_NOT_ENABLED, -EIO, "STATUS_BIZRULES_NOT_ENABLED"},
+ {STATUS_WAKE_SYSTEM, -EIO, "STATUS_WAKE_SYSTEM"},
+ {STATUS_DS_SHUTTING_DOWN, -EIO, "STATUS_DS_SHUTTING_DOWN"},
+ {DBG_REPLY_LATER, -EIO, "DBG_REPLY_LATER"},
+ {DBG_UNABLE_TO_PROVIDE_HANDLE, -EIO, "DBG_UNABLE_TO_PROVIDE_HANDLE"},
+ {DBG_TERMINATE_THREAD, -EIO, "DBG_TERMINATE_THREAD"},
+ {DBG_TERMINATE_PROCESS, -EIO, "DBG_TERMINATE_PROCESS"},
+ {DBG_CONTROL_C, -EIO, "DBG_CONTROL_C"},
+ {DBG_PRINTEXCEPTION_C, -EIO, "DBG_PRINTEXCEPTION_C"},
+ {DBG_RIPEXCEPTION, -EIO, "DBG_RIPEXCEPTION"},
+ {DBG_CONTROL_BREAK, -EIO, "DBG_CONTROL_BREAK"},
+ {DBG_COMMAND_EXCEPTION, -EIO, "DBG_COMMAND_EXCEPTION"},
+ {RPC_NT_UUID_LOCAL_ONLY, -EIO, "RPC_NT_UUID_LOCAL_ONLY"},
+ {RPC_NT_SEND_INCOMPLETE, -EIO, "RPC_NT_SEND_INCOMPLETE"},
+ {STATUS_CTX_CDM_CONNECT, -EIO, "STATUS_CTX_CDM_CONNECT"},
+ {STATUS_CTX_CDM_DISCONNECT, -EIO, "STATUS_CTX_CDM_DISCONNECT"},
+ {STATUS_SXS_RELEASE_ACTIVATION_CONTEXT, -EIO,
+ "STATUS_SXS_RELEASE_ACTIVATION_CONTEXT"},
+ {STATUS_RECOVERY_NOT_NEEDED, -EIO, "STATUS_RECOVERY_NOT_NEEDED"},
+ {STATUS_RM_ALREADY_STARTED, -EIO, "STATUS_RM_ALREADY_STARTED"},
+ {STATUS_LOG_NO_RESTART, -EIO, "STATUS_LOG_NO_RESTART"},
+ {STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST, -EIO,
+ "STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST"},
+ {STATUS_GRAPHICS_PARTIAL_DATA_POPULATED, -EIO,
+ "STATUS_GRAPHICS_PARTIAL_DATA_POPULATED"},
+ {STATUS_GRAPHICS_DRIVER_MISMATCH, -EIO,
+ "STATUS_GRAPHICS_DRIVER_MISMATCH"},
+ {STATUS_GRAPHICS_MODE_NOT_PINNED, -EIO,
+ "STATUS_GRAPHICS_MODE_NOT_PINNED"},
+ {STATUS_GRAPHICS_NO_PREFERRED_MODE, -EIO,
+ "STATUS_GRAPHICS_NO_PREFERRED_MODE"},
+ {STATUS_GRAPHICS_DATASET_IS_EMPTY, -EIO,
+ "STATUS_GRAPHICS_DATASET_IS_EMPTY"},
+ {STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET, -EIO,
+ "STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET"},
+ {STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED, -EIO,
+ "STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED"},
+ {STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS, -EIO,
+ "STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS"},
+ {STATUS_GRAPHICS_LEADLINK_START_DEFERRED, -EIO,
+ "STATUS_GRAPHICS_LEADLINK_START_DEFERRED"},
+ {STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY, -EIO,
+ "STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY"},
+ {STATUS_GRAPHICS_START_DEFERRED, -EIO,
+ "STATUS_GRAPHICS_START_DEFERRED"},
+ {STATUS_NDIS_INDICATION_REQUIRED, -EIO,
+ "STATUS_NDIS_INDICATION_REQUIRED"},
+ {STATUS_GUARD_PAGE_VIOLATION, -EIO, "STATUS_GUARD_PAGE_VIOLATION"},
+ {STATUS_DATATYPE_MISALIGNMENT, -EIO, "STATUS_DATATYPE_MISALIGNMENT"},
+ {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
+ {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
+ {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
+ {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"},
+ {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
+ {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
+ {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"},
+ {STATUS_GUID_SUBSTITUTION_MADE, -EIO, "STATUS_GUID_SUBSTITUTION_MADE"},
+ {STATUS_PARTIAL_COPY, -EIO, "STATUS_PARTIAL_COPY"},
+ {STATUS_DEVICE_PAPER_EMPTY, -EIO, "STATUS_DEVICE_PAPER_EMPTY"},
+ {STATUS_DEVICE_POWERED_OFF, -EIO, "STATUS_DEVICE_POWERED_OFF"},
+ {STATUS_DEVICE_OFF_LINE, -EIO, "STATUS_DEVICE_OFF_LINE"},
+ {STATUS_DEVICE_BUSY, -EBUSY, "STATUS_DEVICE_BUSY"},
+ {STATUS_NO_MORE_EAS, -EIO, "STATUS_NO_MORE_EAS"},
+ {STATUS_INVALID_EA_NAME, -EINVAL, "STATUS_INVALID_EA_NAME"},
+ {STATUS_EA_LIST_INCONSISTENT, -EIO, "STATUS_EA_LIST_INCONSISTENT"},
+ {STATUS_INVALID_EA_FLAG, -EINVAL, "STATUS_INVALID_EA_FLAG"},
+ {STATUS_VERIFY_REQUIRED, -EIO, "STATUS_VERIFY_REQUIRED"},
+ {STATUS_EXTRANEOUS_INFORMATION, -EIO, "STATUS_EXTRANEOUS_INFORMATION"},
+ {STATUS_RXACT_COMMIT_NECESSARY, -EIO, "STATUS_RXACT_COMMIT_NECESSARY"},
+ {STATUS_NO_MORE_ENTRIES, -EIO, "STATUS_NO_MORE_ENTRIES"},
+ {STATUS_FILEMARK_DETECTED, -EIO, "STATUS_FILEMARK_DETECTED"},
+ {STATUS_MEDIA_CHANGED, -EIO, "STATUS_MEDIA_CHANGED"},
+ {STATUS_BUS_RESET, -EIO, "STATUS_BUS_RESET"},
+ {STATUS_END_OF_MEDIA, -EIO, "STATUS_END_OF_MEDIA"},
+ {STATUS_BEGINNING_OF_MEDIA, -EIO, "STATUS_BEGINNING_OF_MEDIA"},
+ {STATUS_MEDIA_CHECK, -EIO, "STATUS_MEDIA_CHECK"},
+ {STATUS_SETMARK_DETECTED, -EIO, "STATUS_SETMARK_DETECTED"},
+ {STATUS_NO_DATA_DETECTED, -EIO, "STATUS_NO_DATA_DETECTED"},
+ {STATUS_REDIRECTOR_HAS_OPEN_HANDLES, -EIO,
+ "STATUS_REDIRECTOR_HAS_OPEN_HANDLES"},
+ {STATUS_SERVER_HAS_OPEN_HANDLES, -EIO,
+ "STATUS_SERVER_HAS_OPEN_HANDLES"},
+ {STATUS_ALREADY_DISCONNECTED, -EIO, "STATUS_ALREADY_DISCONNECTED"},
+ {STATUS_LONGJUMP, -EIO, "STATUS_LONGJUMP"},
+ {STATUS_CLEANER_CARTRIDGE_INSTALLED, -EIO,
+ "STATUS_CLEANER_CARTRIDGE_INSTALLED"},
+ {STATUS_PLUGPLAY_QUERY_VETOED, -EIO, "STATUS_PLUGPLAY_QUERY_VETOED"},
+ {STATUS_UNWIND_CONSOLIDATE, -EIO, "STATUS_UNWIND_CONSOLIDATE"},
+ {STATUS_REGISTRY_HIVE_RECOVERED, -EIO,
+ "STATUS_REGISTRY_HIVE_RECOVERED"},
+ {STATUS_DLL_MIGHT_BE_INSECURE, -EIO, "STATUS_DLL_MIGHT_BE_INSECURE"},
+ {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
+ "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
+ {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
+ {STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
+ "STATUS_DEVICE_REQUIRES_CLEANING"},
+ {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
+ {STATUS_DATA_LOST_REPAIR, -EIO, "STATUS_DATA_LOST_REPAIR"},
+ {DBG_EXCEPTION_NOT_HANDLED, -EIO, "DBG_EXCEPTION_NOT_HANDLED"},
+ {STATUS_CLUSTER_NODE_ALREADY_UP, -EIO,
+ "STATUS_CLUSTER_NODE_ALREADY_UP"},
+ {STATUS_CLUSTER_NODE_ALREADY_DOWN, -EIO,
+ "STATUS_CLUSTER_NODE_ALREADY_DOWN"},
+ {STATUS_CLUSTER_NETWORK_ALREADY_ONLINE, -EIO,
+ "STATUS_CLUSTER_NETWORK_ALREADY_ONLINE"},
+ {STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE, -EIO,
+ "STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE"},
+ {STATUS_CLUSTER_NODE_ALREADY_MEMBER, -EIO,
+ "STATUS_CLUSTER_NODE_ALREADY_MEMBER"},
+ {STATUS_COULD_NOT_RESIZE_LOG, -EIO, "STATUS_COULD_NOT_RESIZE_LOG"},
+ {STATUS_NO_TXF_METADATA, -EIO, "STATUS_NO_TXF_METADATA"},
+ {STATUS_CANT_RECOVER_WITH_HANDLE_OPEN, -EIO,
+ "STATUS_CANT_RECOVER_WITH_HANDLE_OPEN"},
+ {STATUS_TXF_METADATA_ALREADY_PRESENT, -EIO,
+ "STATUS_TXF_METADATA_ALREADY_PRESENT"},
+ {STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET, -EIO,
+ "STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET"},
+ {STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED, -EIO,
+ "STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED"},
+ {STATUS_FLT_BUFFER_TOO_SMALL, -ENOBUFS, "STATUS_FLT_BUFFER_TOO_SMALL"},
+ {STATUS_FVE_PARTIAL_METADATA, -EIO, "STATUS_FVE_PARTIAL_METADATA"},
+ {STATUS_UNSUCCESSFUL, -EIO, "STATUS_UNSUCCESSFUL"},
+ {STATUS_NOT_IMPLEMENTED, -ENOSYS, "STATUS_NOT_IMPLEMENTED"},
+ {STATUS_INVALID_INFO_CLASS, -EIO, "STATUS_INVALID_INFO_CLASS"},
+ {STATUS_INFO_LENGTH_MISMATCH, -EIO, "STATUS_INFO_LENGTH_MISMATCH"},
+ {STATUS_ACCESS_VIOLATION, -EACCES, "STATUS_ACCESS_VIOLATION"},
+ {STATUS_IN_PAGE_ERROR, -EFAULT, "STATUS_IN_PAGE_ERROR"},
+ {STATUS_PAGEFILE_QUOTA, -EDQUOT, "STATUS_PAGEFILE_QUOTA"},
+ {STATUS_INVALID_HANDLE, -EBADF, "STATUS_INVALID_HANDLE"},
+ {STATUS_BAD_INITIAL_STACK, -EIO, "STATUS_BAD_INITIAL_STACK"},
+ {STATUS_BAD_INITIAL_PC, -EIO, "STATUS_BAD_INITIAL_PC"},
+ {STATUS_INVALID_CID, -EIO, "STATUS_INVALID_CID"},
+ {STATUS_TIMER_NOT_CANCELED, -EIO, "STATUS_TIMER_NOT_CANCELED"},
+ {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"},
+ {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"},
+ {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"},
+ {STATUS_INVALID_DEVICE_REQUEST, -EIO, "STATUS_INVALID_DEVICE_REQUEST"},
+ {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"},
+ {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"},
+ {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"},
+ {STATUS_UNRECOGNIZED_MEDIA, -EIO, "STATUS_UNRECOGNIZED_MEDIA"},
+ {STATUS_NONEXISTENT_SECTOR, -EIO, "STATUS_NONEXISTENT_SECTOR"},
+ {STATUS_MORE_PROCESSING_REQUIRED, -EIO,
+ "STATUS_MORE_PROCESSING_REQUIRED"},
+ {STATUS_NO_MEMORY, -EREMOTEIO, "STATUS_NO_MEMORY"},
+ {STATUS_CONFLICTING_ADDRESSES, -EADDRINUSE,
+ "STATUS_CONFLICTING_ADDRESSES"},
+ {STATUS_NOT_MAPPED_VIEW, -EIO, "STATUS_NOT_MAPPED_VIEW"},
+ {STATUS_UNABLE_TO_FREE_VM, -EIO, "STATUS_UNABLE_TO_FREE_VM"},
+ {STATUS_UNABLE_TO_DELETE_SECTION, -EIO,
+ "STATUS_UNABLE_TO_DELETE_SECTION"},
+ {STATUS_INVALID_SYSTEM_SERVICE, -EIO, "STATUS_INVALID_SYSTEM_SERVICE"},
+ {STATUS_ILLEGAL_INSTRUCTION, -EIO, "STATUS_ILLEGAL_INSTRUCTION"},
+ {STATUS_INVALID_LOCK_SEQUENCE, -EIO, "STATUS_INVALID_LOCK_SEQUENCE"},
+ {STATUS_INVALID_VIEW_SIZE, -EIO, "STATUS_INVALID_VIEW_SIZE"},
+ {STATUS_INVALID_FILE_FOR_SECTION, -EIO,
+ "STATUS_INVALID_FILE_FOR_SECTION"},
+ {STATUS_ALREADY_COMMITTED, -EIO, "STATUS_ALREADY_COMMITTED"},
+ {STATUS_ACCESS_DENIED, -EACCES, "STATUS_ACCESS_DENIED"},
+ {STATUS_BUFFER_TOO_SMALL, -EIO, "STATUS_BUFFER_TOO_SMALL"},
+ {STATUS_OBJECT_TYPE_MISMATCH, -EIO, "STATUS_OBJECT_TYPE_MISMATCH"},
+ {STATUS_NONCONTINUABLE_EXCEPTION, -EIO,
+ "STATUS_NONCONTINUABLE_EXCEPTION"},
+ {STATUS_INVALID_DISPOSITION, -EIO, "STATUS_INVALID_DISPOSITION"},
+ {STATUS_UNWIND, -EIO, "STATUS_UNWIND"},
+ {STATUS_BAD_STACK, -EIO, "STATUS_BAD_STACK"},
+ {STATUS_INVALID_UNWIND_TARGET, -EIO, "STATUS_INVALID_UNWIND_TARGET"},
+ {STATUS_NOT_LOCKED, -EIO, "STATUS_NOT_LOCKED"},
+ {STATUS_PARITY_ERROR, -EIO, "STATUS_PARITY_ERROR"},
+ {STATUS_UNABLE_TO_DECOMMIT_VM, -EIO, "STATUS_UNABLE_TO_DECOMMIT_VM"},
+ {STATUS_NOT_COMMITTED, -EIO, "STATUS_NOT_COMMITTED"},
+ {STATUS_INVALID_PORT_ATTRIBUTES, -EIO,
+ "STATUS_INVALID_PORT_ATTRIBUTES"},
+ {STATUS_PORT_MESSAGE_TOO_LONG, -EIO, "STATUS_PORT_MESSAGE_TOO_LONG"},
+ {STATUS_INVALID_PARAMETER_MIX, -EINVAL, "STATUS_INVALID_PARAMETER_MIX"},
+ {STATUS_INVALID_QUOTA_LOWER, -EIO, "STATUS_INVALID_QUOTA_LOWER"},
+ {STATUS_DISK_CORRUPT_ERROR, -EIO, "STATUS_DISK_CORRUPT_ERROR"},
+ {STATUS_OBJECT_NAME_INVALID, -ENOENT, "STATUS_OBJECT_NAME_INVALID"},
+ {STATUS_OBJECT_NAME_NOT_FOUND, -ENOENT, "STATUS_OBJECT_NAME_NOT_FOUND"},
+ {STATUS_OBJECT_NAME_COLLISION, -EEXIST, "STATUS_OBJECT_NAME_COLLISION"},
+ {STATUS_PORT_DISCONNECTED, -EIO, "STATUS_PORT_DISCONNECTED"},
+ {STATUS_DEVICE_ALREADY_ATTACHED, -EIO,
+ "STATUS_DEVICE_ALREADY_ATTACHED"},
+ {STATUS_OBJECT_PATH_INVALID, -ENOTDIR, "STATUS_OBJECT_PATH_INVALID"},
+ {STATUS_OBJECT_PATH_NOT_FOUND, -ENOENT, "STATUS_OBJECT_PATH_NOT_FOUND"},
+ {STATUS_OBJECT_PATH_SYNTAX_BAD, -EIO, "STATUS_OBJECT_PATH_SYNTAX_BAD"},
+ {STATUS_DATA_OVERRUN, -EIO, "STATUS_DATA_OVERRUN"},
+ {STATUS_DATA_LATE_ERROR, -EIO, "STATUS_DATA_LATE_ERROR"},
+ {STATUS_DATA_ERROR, -EIO, "STATUS_DATA_ERROR"},
+ {STATUS_CRC_ERROR, -EIO, "STATUS_CRC_ERROR"},
+ {STATUS_SECTION_TOO_BIG, -EIO, "STATUS_SECTION_TOO_BIG"},
+ {STATUS_PORT_CONNECTION_REFUSED, -ECONNREFUSED,
+ "STATUS_PORT_CONNECTION_REFUSED"},
+ {STATUS_INVALID_PORT_HANDLE, -EIO, "STATUS_INVALID_PORT_HANDLE"},
+ {STATUS_SHARING_VIOLATION, -EBUSY, "STATUS_SHARING_VIOLATION"},
+ {STATUS_QUOTA_EXCEEDED, -EDQUOT, "STATUS_QUOTA_EXCEEDED"},
+ {STATUS_INVALID_PAGE_PROTECTION, -EIO,
+ "STATUS_INVALID_PAGE_PROTECTION"},
+ {STATUS_MUTANT_NOT_OWNED, -EIO, "STATUS_MUTANT_NOT_OWNED"},
+ {STATUS_SEMAPHORE_LIMIT_EXCEEDED, -EIO,
+ "STATUS_SEMAPHORE_LIMIT_EXCEEDED"},
+ {STATUS_PORT_ALREADY_SET, -EIO, "STATUS_PORT_ALREADY_SET"},
+ {STATUS_SECTION_NOT_IMAGE, -EIO, "STATUS_SECTION_NOT_IMAGE"},
+ {STATUS_SUSPEND_COUNT_EXCEEDED, -EIO, "STATUS_SUSPEND_COUNT_EXCEEDED"},
+ {STATUS_THREAD_IS_TERMINATING, -EIO, "STATUS_THREAD_IS_TERMINATING"},
+ {STATUS_BAD_WORKING_SET_LIMIT, -EIO, "STATUS_BAD_WORKING_SET_LIMIT"},
+ {STATUS_INCOMPATIBLE_FILE_MAP, -EIO, "STATUS_INCOMPATIBLE_FILE_MAP"},
+ {STATUS_SECTION_PROTECTION, -EIO, "STATUS_SECTION_PROTECTION"},
+ {STATUS_EAS_NOT_SUPPORTED, -EOPNOTSUPP, "STATUS_EAS_NOT_SUPPORTED"},
+ {STATUS_EA_TOO_LARGE, -EIO, "STATUS_EA_TOO_LARGE"},
+ {STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"},
+ {STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"},
+ {STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"},
+ {STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"},
+ {STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"},
+ {STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"},
+ {STATUS_CTL_FILE_NOT_SUPPORTED, -ENOSYS,
+ "STATUS_CTL_FILE_NOT_SUPPORTED"},
+ {STATUS_UNKNOWN_REVISION, -EIO, "STATUS_UNKNOWN_REVISION"},
+ {STATUS_REVISION_MISMATCH, -EIO, "STATUS_REVISION_MISMATCH"},
+ {STATUS_INVALID_OWNER, -EIO, "STATUS_INVALID_OWNER"},
+ {STATUS_INVALID_PRIMARY_GROUP, -EIO, "STATUS_INVALID_PRIMARY_GROUP"},
+ {STATUS_NO_IMPERSONATION_TOKEN, -EIO, "STATUS_NO_IMPERSONATION_TOKEN"},
+ {STATUS_CANT_DISABLE_MANDATORY, -EIO, "STATUS_CANT_DISABLE_MANDATORY"},
+ {STATUS_NO_LOGON_SERVERS, -EIO, "STATUS_NO_LOGON_SERVERS"},
+ {STATUS_NO_SUCH_LOGON_SESSION, -EIO, "STATUS_NO_SUCH_LOGON_SESSION"},
+ {STATUS_NO_SUCH_PRIVILEGE, -EIO, "STATUS_NO_SUCH_PRIVILEGE"},
+ {STATUS_PRIVILEGE_NOT_HELD, -EIO, "STATUS_PRIVILEGE_NOT_HELD"},
+ {STATUS_INVALID_ACCOUNT_NAME, -EIO, "STATUS_INVALID_ACCOUNT_NAME"},
+ {STATUS_USER_EXISTS, -EIO, "STATUS_USER_EXISTS"},
+ {STATUS_NO_SUCH_USER, -EIO, "STATUS_NO_SUCH_USER"},
+ {STATUS_GROUP_EXISTS, -EIO, "STATUS_GROUP_EXISTS"},
+ {STATUS_NO_SUCH_GROUP, -EIO, "STATUS_NO_SUCH_GROUP"},
+ {STATUS_MEMBER_IN_GROUP, -EIO, "STATUS_MEMBER_IN_GROUP"},
+ {STATUS_MEMBER_NOT_IN_GROUP, -EIO, "STATUS_MEMBER_NOT_IN_GROUP"},
+ {STATUS_LAST_ADMIN, -EIO, "STATUS_LAST_ADMIN"},
+ {STATUS_WRONG_PASSWORD, -EACCES, "STATUS_WRONG_PASSWORD"},
+ {STATUS_ILL_FORMED_PASSWORD, -EINVAL, "STATUS_ILL_FORMED_PASSWORD"},
+ {STATUS_PASSWORD_RESTRICTION, -EACCES, "STATUS_PASSWORD_RESTRICTION"},
+ {STATUS_LOGON_FAILURE, -EACCES, "STATUS_LOGON_FAILURE"},
+ {STATUS_ACCOUNT_RESTRICTION, -EACCES, "STATUS_ACCOUNT_RESTRICTION"},
+ {STATUS_INVALID_LOGON_HOURS, -EACCES, "STATUS_INVALID_LOGON_HOURS"},
+ {STATUS_INVALID_WORKSTATION, -EACCES, "STATUS_INVALID_WORKSTATION"},
+ {STATUS_PASSWORD_EXPIRED, -EKEYEXPIRED, "STATUS_PASSWORD_EXPIRED"},
+ {STATUS_ACCOUNT_DISABLED, -EKEYREVOKED, "STATUS_ACCOUNT_DISABLED"},
+ {STATUS_NONE_MAPPED, -EIO, "STATUS_NONE_MAPPED"},
+ {STATUS_TOO_MANY_LUIDS_REQUESTED, -EIO,
+ "STATUS_TOO_MANY_LUIDS_REQUESTED"},
+ {STATUS_LUIDS_EXHAUSTED, -EIO, "STATUS_LUIDS_EXHAUSTED"},
+ {STATUS_INVALID_SUB_AUTHORITY, -EIO, "STATUS_INVALID_SUB_AUTHORITY"},
+ {STATUS_INVALID_ACL, -EIO, "STATUS_INVALID_ACL"},
+ {STATUS_INVALID_SID, -EIO, "STATUS_INVALID_SID"},
+ {STATUS_INVALID_SECURITY_DESCR, -EIO, "STATUS_INVALID_SECURITY_DESCR"},
+ {STATUS_PROCEDURE_NOT_FOUND, -EIO, "STATUS_PROCEDURE_NOT_FOUND"},
+ {STATUS_INVALID_IMAGE_FORMAT, -EIO, "STATUS_INVALID_IMAGE_FORMAT"},
+ {STATUS_NO_TOKEN, -EIO, "STATUS_NO_TOKEN"},
+ {STATUS_BAD_INHERITANCE_ACL, -EIO, "STATUS_BAD_INHERITANCE_ACL"},
+ {STATUS_RANGE_NOT_LOCKED, -EIO, "STATUS_RANGE_NOT_LOCKED"},
+ {STATUS_DISK_FULL, -ENOSPC, "STATUS_DISK_FULL"},
+ {STATUS_SERVER_DISABLED, -EIO, "STATUS_SERVER_DISABLED"},
+ {STATUS_SERVER_NOT_DISABLED, -EIO, "STATUS_SERVER_NOT_DISABLED"},
+ {STATUS_TOO_MANY_GUIDS_REQUESTED, -EIO,
+ "STATUS_TOO_MANY_GUIDS_REQUESTED"},
+ {STATUS_GUIDS_EXHAUSTED, -EIO, "STATUS_GUIDS_EXHAUSTED"},
+ {STATUS_INVALID_ID_AUTHORITY, -EIO, "STATUS_INVALID_ID_AUTHORITY"},
+ {STATUS_AGENTS_EXHAUSTED, -EIO, "STATUS_AGENTS_EXHAUSTED"},
+ {STATUS_INVALID_VOLUME_LABEL, -EIO, "STATUS_INVALID_VOLUME_LABEL"},
+ {STATUS_SECTION_NOT_EXTENDED, -EIO, "STATUS_SECTION_NOT_EXTENDED"},
+ {STATUS_NOT_MAPPED_DATA, -EIO, "STATUS_NOT_MAPPED_DATA"},
+ {STATUS_RESOURCE_DATA_NOT_FOUND, -EIO,
+ "STATUS_RESOURCE_DATA_NOT_FOUND"},
+ {STATUS_RESOURCE_TYPE_NOT_FOUND, -EIO,
+ "STATUS_RESOURCE_TYPE_NOT_FOUND"},
+ {STATUS_RESOURCE_NAME_NOT_FOUND, -EIO,
+ "STATUS_RESOURCE_NAME_NOT_FOUND"},
+ {STATUS_ARRAY_BOUNDS_EXCEEDED, -EIO, "STATUS_ARRAY_BOUNDS_EXCEEDED"},
+ {STATUS_FLOAT_DENORMAL_OPERAND, -EIO, "STATUS_FLOAT_DENORMAL_OPERAND"},
+ {STATUS_FLOAT_DIVIDE_BY_ZERO, -EIO, "STATUS_FLOAT_DIVIDE_BY_ZERO"},
+ {STATUS_FLOAT_INEXACT_RESULT, -EIO, "STATUS_FLOAT_INEXACT_RESULT"},
+ {STATUS_FLOAT_INVALID_OPERATION, -EIO,
+ "STATUS_FLOAT_INVALID_OPERATION"},
+ {STATUS_FLOAT_OVERFLOW, -EIO, "STATUS_FLOAT_OVERFLOW"},
+ {STATUS_FLOAT_STACK_CHECK, -EIO, "STATUS_FLOAT_STACK_CHECK"},
+ {STATUS_FLOAT_UNDERFLOW, -EIO, "STATUS_FLOAT_UNDERFLOW"},
+ {STATUS_INTEGER_DIVIDE_BY_ZERO, -EIO, "STATUS_INTEGER_DIVIDE_BY_ZERO"},
+ {STATUS_INTEGER_OVERFLOW, -EIO, "STATUS_INTEGER_OVERFLOW"},
+ {STATUS_PRIVILEGED_INSTRUCTION, -EIO, "STATUS_PRIVILEGED_INSTRUCTION"},
+ {STATUS_TOO_MANY_PAGING_FILES, -EIO, "STATUS_TOO_MANY_PAGING_FILES"},
+ {STATUS_FILE_INVALID, -EIO, "STATUS_FILE_INVALID"},
+ {STATUS_ALLOTTED_SPACE_EXCEEDED, -EIO,
+ "STATUS_ALLOTTED_SPACE_EXCEEDED"},
+ {STATUS_INSUFFICIENT_RESOURCES, -EREMOTEIO,
+ "STATUS_INSUFFICIENT_RESOURCES"},
+ {STATUS_DFS_EXIT_PATH_FOUND, -EIO, "STATUS_DFS_EXIT_PATH_FOUND"},
+ {STATUS_DEVICE_DATA_ERROR, -EIO, "STATUS_DEVICE_DATA_ERROR"},
+ {STATUS_DEVICE_NOT_CONNECTED, -EIO, "STATUS_DEVICE_NOT_CONNECTED"},
+ {STATUS_DEVICE_POWER_FAILURE, -EIO, "STATUS_DEVICE_POWER_FAILURE"},
+ {STATUS_FREE_VM_NOT_AT_BASE, -EIO, "STATUS_FREE_VM_NOT_AT_BASE"},
+ {STATUS_MEMORY_NOT_ALLOCATED, -EFAULT, "STATUS_MEMORY_NOT_ALLOCATED"},
+ {STATUS_WORKING_SET_QUOTA, -EIO, "STATUS_WORKING_SET_QUOTA"},
+ {STATUS_MEDIA_WRITE_PROTECTED, -EROFS, "STATUS_MEDIA_WRITE_PROTECTED"},
+ {STATUS_DEVICE_NOT_READY, -EIO, "STATUS_DEVICE_NOT_READY"},
+ {STATUS_INVALID_GROUP_ATTRIBUTES, -EIO,
+ "STATUS_INVALID_GROUP_ATTRIBUTES"},
+ {STATUS_BAD_IMPERSONATION_LEVEL, -EIO,
+ "STATUS_BAD_IMPERSONATION_LEVEL"},
+ {STATUS_CANT_OPEN_ANONYMOUS, -EIO, "STATUS_CANT_OPEN_ANONYMOUS"},
+ {STATUS_BAD_VALIDATION_CLASS, -EIO, "STATUS_BAD_VALIDATION_CLASS"},
+ {STATUS_BAD_TOKEN_TYPE, -EIO, "STATUS_BAD_TOKEN_TYPE"},
+ {STATUS_BAD_MASTER_BOOT_RECORD, -EIO, "STATUS_BAD_MASTER_BOOT_RECORD"},
+ {STATUS_INSTRUCTION_MISALIGNMENT, -EIO,
+ "STATUS_INSTRUCTION_MISALIGNMENT"},
+ {STATUS_INSTANCE_NOT_AVAILABLE, -EIO, "STATUS_INSTANCE_NOT_AVAILABLE"},
+ {STATUS_PIPE_NOT_AVAILABLE, -EIO, "STATUS_PIPE_NOT_AVAILABLE"},
+ {STATUS_INVALID_PIPE_STATE, -EIO, "STATUS_INVALID_PIPE_STATE"},
+ {STATUS_PIPE_BUSY, -EBUSY, "STATUS_PIPE_BUSY"},
+ {STATUS_ILLEGAL_FUNCTION, -EIO, "STATUS_ILLEGAL_FUNCTION"},
+ {STATUS_PIPE_DISCONNECTED, -EPIPE, "STATUS_PIPE_DISCONNECTED"},
+ {STATUS_PIPE_CLOSING, -EIO, "STATUS_PIPE_CLOSING"},
+ {STATUS_PIPE_CONNECTED, -EIO, "STATUS_PIPE_CONNECTED"},
+ {STATUS_PIPE_LISTENING, -EIO, "STATUS_PIPE_LISTENING"},
+ {STATUS_INVALID_READ_MODE, -EIO, "STATUS_INVALID_READ_MODE"},
+ {STATUS_IO_TIMEOUT, -ETIMEDOUT, "STATUS_IO_TIMEOUT"},
+ {STATUS_FILE_FORCED_CLOSED, -EIO, "STATUS_FILE_FORCED_CLOSED"},
+ {STATUS_PROFILING_NOT_STARTED, -EIO, "STATUS_PROFILING_NOT_STARTED"},
+ {STATUS_PROFILING_NOT_STOPPED, -EIO, "STATUS_PROFILING_NOT_STOPPED"},
+ {STATUS_COULD_NOT_INTERPRET, -EIO, "STATUS_COULD_NOT_INTERPRET"},
+ {STATUS_FILE_IS_A_DIRECTORY, -EISDIR, "STATUS_FILE_IS_A_DIRECTORY"},
+ {STATUS_NOT_SUPPORTED, -EOPNOTSUPP, "STATUS_NOT_SUPPORTED"},
+ {STATUS_REMOTE_NOT_LISTENING, -EHOSTDOWN,
+ "STATUS_REMOTE_NOT_LISTENING"},
+ {STATUS_DUPLICATE_NAME, -ENOTUNIQ, "STATUS_DUPLICATE_NAME"},
+ {STATUS_BAD_NETWORK_PATH, -EINVAL, "STATUS_BAD_NETWORK_PATH"},
+ {STATUS_NETWORK_BUSY, -EBUSY, "STATUS_NETWORK_BUSY"},
+ {STATUS_DEVICE_DOES_NOT_EXIST, -ENODEV, "STATUS_DEVICE_DOES_NOT_EXIST"},
+ {STATUS_TOO_MANY_COMMANDS, -EIO, "STATUS_TOO_MANY_COMMANDS"},
+ {STATUS_ADAPTER_HARDWARE_ERROR, -EIO, "STATUS_ADAPTER_HARDWARE_ERROR"},
+ {STATUS_INVALID_NETWORK_RESPONSE, -EIO,
+ "STATUS_INVALID_NETWORK_RESPONSE"},
+ {STATUS_UNEXPECTED_NETWORK_ERROR, -EIO,
+ "STATUS_UNEXPECTED_NETWORK_ERROR"},
+ {STATUS_BAD_REMOTE_ADAPTER, -EIO, "STATUS_BAD_REMOTE_ADAPTER"},
+ {STATUS_PRINT_QUEUE_FULL, -EIO, "STATUS_PRINT_QUEUE_FULL"},
+ {STATUS_NO_SPOOL_SPACE, -EIO, "STATUS_NO_SPOOL_SPACE"},
+ {STATUS_PRINT_CANCELLED, -EIO, "STATUS_PRINT_CANCELLED"},
+ {STATUS_NETWORK_NAME_DELETED, -EIO, "STATUS_NETWORK_NAME_DELETED"},
+ {STATUS_NETWORK_ACCESS_DENIED, -EACCES, "STATUS_NETWORK_ACCESS_DENIED"},
+ {STATUS_BAD_DEVICE_TYPE, -EIO, "STATUS_BAD_DEVICE_TYPE"},
+ {STATUS_BAD_NETWORK_NAME, -ENOENT, "STATUS_BAD_NETWORK_NAME"},
+ {STATUS_TOO_MANY_NAMES, -EIO, "STATUS_TOO_MANY_NAMES"},
+ {STATUS_TOO_MANY_SESSIONS, -EIO, "STATUS_TOO_MANY_SESSIONS"},
+ {STATUS_SHARING_PAUSED, -EIO, "STATUS_SHARING_PAUSED"},
+ {STATUS_REQUEST_NOT_ACCEPTED, -EIO, "STATUS_REQUEST_NOT_ACCEPTED"},
+ {STATUS_REDIRECTOR_PAUSED, -EIO, "STATUS_REDIRECTOR_PAUSED"},
+ {STATUS_NET_WRITE_FAULT, -EIO, "STATUS_NET_WRITE_FAULT"},
+ {STATUS_PROFILING_AT_LIMIT, -EIO, "STATUS_PROFILING_AT_LIMIT"},
+ {STATUS_NOT_SAME_DEVICE, -EXDEV, "STATUS_NOT_SAME_DEVICE"},
+ {STATUS_FILE_RENAMED, -EIO, "STATUS_FILE_RENAMED"},
+ {STATUS_VIRTUAL_CIRCUIT_CLOSED, -EIO, "STATUS_VIRTUAL_CIRCUIT_CLOSED"},
+ {STATUS_NO_SECURITY_ON_OBJECT, -EIO, "STATUS_NO_SECURITY_ON_OBJECT"},
+ {STATUS_CANT_WAIT, -EIO, "STATUS_CANT_WAIT"},
+ {STATUS_PIPE_EMPTY, -EIO, "STATUS_PIPE_EMPTY"},
+ {STATUS_CANT_ACCESS_DOMAIN_INFO, -EIO,
+ "STATUS_CANT_ACCESS_DOMAIN_INFO"},
+ {STATUS_CANT_TERMINATE_SELF, -EIO, "STATUS_CANT_TERMINATE_SELF"},
+ {STATUS_INVALID_SERVER_STATE, -EIO, "STATUS_INVALID_SERVER_STATE"},
+ {STATUS_INVALID_DOMAIN_STATE, -EIO, "STATUS_INVALID_DOMAIN_STATE"},
+ {STATUS_INVALID_DOMAIN_ROLE, -EIO, "STATUS_INVALID_DOMAIN_ROLE"},
+ {STATUS_NO_SUCH_DOMAIN, -EIO, "STATUS_NO_SUCH_DOMAIN"},
+ {STATUS_DOMAIN_EXISTS, -EIO, "STATUS_DOMAIN_EXISTS"},
+ {STATUS_DOMAIN_LIMIT_EXCEEDED, -EIO, "STATUS_DOMAIN_LIMIT_EXCEEDED"},
+ {STATUS_OPLOCK_NOT_GRANTED, -EIO, "STATUS_OPLOCK_NOT_GRANTED"},
+ {STATUS_INVALID_OPLOCK_PROTOCOL, -EIO,
+ "STATUS_INVALID_OPLOCK_PROTOCOL"},
+ {STATUS_INTERNAL_DB_CORRUPTION, -EIO, "STATUS_INTERNAL_DB_CORRUPTION"},
+ {STATUS_INTERNAL_ERROR, -EIO, "STATUS_INTERNAL_ERROR"},
+ {STATUS_GENERIC_NOT_MAPPED, -EIO, "STATUS_GENERIC_NOT_MAPPED"},
+ {STATUS_BAD_DESCRIPTOR_FORMAT, -EIO, "STATUS_BAD_DESCRIPTOR_FORMAT"},
+ {STATUS_INVALID_USER_BUFFER, -EIO, "STATUS_INVALID_USER_BUFFER"},
+ {STATUS_UNEXPECTED_IO_ERROR, -EIO, "STATUS_UNEXPECTED_IO_ERROR"},
+ {STATUS_UNEXPECTED_MM_CREATE_ERR, -EIO,
+ "STATUS_UNEXPECTED_MM_CREATE_ERR"},
+ {STATUS_UNEXPECTED_MM_MAP_ERROR, -EIO,
+ "STATUS_UNEXPECTED_MM_MAP_ERROR"},
+ {STATUS_UNEXPECTED_MM_EXTEND_ERR, -EIO,
+ "STATUS_UNEXPECTED_MM_EXTEND_ERR"},
+ {STATUS_NOT_LOGON_PROCESS, -EIO, "STATUS_NOT_LOGON_PROCESS"},
+ {STATUS_LOGON_SESSION_EXISTS, -EIO, "STATUS_LOGON_SESSION_EXISTS"},
+ {STATUS_INVALID_PARAMETER_1, -EINVAL, "STATUS_INVALID_PARAMETER_1"},
+ {STATUS_INVALID_PARAMETER_2, -EINVAL, "STATUS_INVALID_PARAMETER_2"},
+ {STATUS_INVALID_PARAMETER_3, -EINVAL, "STATUS_INVALID_PARAMETER_3"},
+ {STATUS_INVALID_PARAMETER_4, -EINVAL, "STATUS_INVALID_PARAMETER_4"},
+ {STATUS_INVALID_PARAMETER_5, -EINVAL, "STATUS_INVALID_PARAMETER_5"},
+ {STATUS_INVALID_PARAMETER_6, -EINVAL, "STATUS_INVALID_PARAMETER_6"},
+ {STATUS_INVALID_PARAMETER_7, -EINVAL, "STATUS_INVALID_PARAMETER_7"},
+ {STATUS_INVALID_PARAMETER_8, -EINVAL, "STATUS_INVALID_PARAMETER_8"},
+ {STATUS_INVALID_PARAMETER_9, -EINVAL, "STATUS_INVALID_PARAMETER_9"},
+ {STATUS_INVALID_PARAMETER_10, -EINVAL, "STATUS_INVALID_PARAMETER_10"},
+ {STATUS_INVALID_PARAMETER_11, -EINVAL, "STATUS_INVALID_PARAMETER_11"},
+ {STATUS_INVALID_PARAMETER_12, -EINVAL, "STATUS_INVALID_PARAMETER_12"},
+ {STATUS_REDIRECTOR_NOT_STARTED, -EIO, "STATUS_REDIRECTOR_NOT_STARTED"},
+ {STATUS_REDIRECTOR_STARTED, -EIO, "STATUS_REDIRECTOR_STARTED"},
+ {STATUS_STACK_OVERFLOW, -EIO, "STATUS_STACK_OVERFLOW"},
+ {STATUS_NO_SUCH_PACKAGE, -EIO, "STATUS_NO_SUCH_PACKAGE"},
+ {STATUS_BAD_FUNCTION_TABLE, -EIO, "STATUS_BAD_FUNCTION_TABLE"},
+ {STATUS_VARIABLE_NOT_FOUND, -EIO, "STATUS_VARIABLE_NOT_FOUND"},
+ {STATUS_DIRECTORY_NOT_EMPTY, -ENOTEMPTY, "STATUS_DIRECTORY_NOT_EMPTY"},
+ {STATUS_FILE_CORRUPT_ERROR, -EIO, "STATUS_FILE_CORRUPT_ERROR"},
+ {STATUS_NOT_A_DIRECTORY, -ENOTDIR, "STATUS_NOT_A_DIRECTORY"},
+ {STATUS_BAD_LOGON_SESSION_STATE, -EIO,
+ "STATUS_BAD_LOGON_SESSION_STATE"},
+ {STATUS_LOGON_SESSION_COLLISION, -EIO,
+ "STATUS_LOGON_SESSION_COLLISION"},
+ {STATUS_NAME_TOO_LONG, -ENAMETOOLONG, "STATUS_NAME_TOO_LONG"},
+ {STATUS_FILES_OPEN, -EIO, "STATUS_FILES_OPEN"},
+ {STATUS_CONNECTION_IN_USE, -EIO, "STATUS_CONNECTION_IN_USE"},
+ {STATUS_MESSAGE_NOT_FOUND, -EIO, "STATUS_MESSAGE_NOT_FOUND"},
+ {STATUS_PROCESS_IS_TERMINATING, -EIO, "STATUS_PROCESS_IS_TERMINATING"},
+ {STATUS_INVALID_LOGON_TYPE, -EIO, "STATUS_INVALID_LOGON_TYPE"},
+ {STATUS_NO_GUID_TRANSLATION, -EIO, "STATUS_NO_GUID_TRANSLATION"},
+ {STATUS_CANNOT_IMPERSONATE, -EIO, "STATUS_CANNOT_IMPERSONATE"},
+ {STATUS_IMAGE_ALREADY_LOADED, -EIO, "STATUS_IMAGE_ALREADY_LOADED"},
+ {STATUS_ABIOS_NOT_PRESENT, -EIO, "STATUS_ABIOS_NOT_PRESENT"},
+ {STATUS_ABIOS_LID_NOT_EXIST, -EIO, "STATUS_ABIOS_LID_NOT_EXIST"},
+ {STATUS_ABIOS_LID_ALREADY_OWNED, -EIO,
+ "STATUS_ABIOS_LID_ALREADY_OWNED"},
+ {STATUS_ABIOS_NOT_LID_OWNER, -EIO, "STATUS_ABIOS_NOT_LID_OWNER"},
+ {STATUS_ABIOS_INVALID_COMMAND, -EIO, "STATUS_ABIOS_INVALID_COMMAND"},
+ {STATUS_ABIOS_INVALID_LID, -EIO, "STATUS_ABIOS_INVALID_LID"},
+ {STATUS_ABIOS_SELECTOR_NOT_AVAILABLE, -EIO,
+ "STATUS_ABIOS_SELECTOR_NOT_AVAILABLE"},
+ {STATUS_ABIOS_INVALID_SELECTOR, -EIO, "STATUS_ABIOS_INVALID_SELECTOR"},
+ {STATUS_NO_LDT, -EIO, "STATUS_NO_LDT"},
+ {STATUS_INVALID_LDT_SIZE, -EIO, "STATUS_INVALID_LDT_SIZE"},
+ {STATUS_INVALID_LDT_OFFSET, -EIO, "STATUS_INVALID_LDT_OFFSET"},
+ {STATUS_INVALID_LDT_DESCRIPTOR, -EIO, "STATUS_INVALID_LDT_DESCRIPTOR"},
+ {STATUS_INVALID_IMAGE_NE_FORMAT, -EIO,
+ "STATUS_INVALID_IMAGE_NE_FORMAT"},
+ {STATUS_RXACT_INVALID_STATE, -EIO, "STATUS_RXACT_INVALID_STATE"},
+ {STATUS_RXACT_COMMIT_FAILURE, -EIO, "STATUS_RXACT_COMMIT_FAILURE"},
+ {STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"},
+ {STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"},
+ {STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"},
+ {STATUS_CANNOT_DELETE, -EIO, "STATUS_CANNOT_DELETE"},
+ {STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"},
+ {STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"},
+ {STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"},
+ {STATUS_SPECIAL_GROUP, -EIO, "STATUS_SPECIAL_GROUP"},
+ {STATUS_SPECIAL_USER, -EIO, "STATUS_SPECIAL_USER"},
+ {STATUS_MEMBERS_PRIMARY_GROUP, -EIO, "STATUS_MEMBERS_PRIMARY_GROUP"},
+ {STATUS_FILE_CLOSED, -EBADF, "STATUS_FILE_CLOSED"},
+ {STATUS_TOO_MANY_THREADS, -EIO, "STATUS_TOO_MANY_THREADS"},
+ {STATUS_THREAD_NOT_IN_PROCESS, -EIO, "STATUS_THREAD_NOT_IN_PROCESS"},
+ {STATUS_TOKEN_ALREADY_IN_USE, -EIO, "STATUS_TOKEN_ALREADY_IN_USE"},
+ {STATUS_PAGEFILE_QUOTA_EXCEEDED, -EDQUOT,
+ "STATUS_PAGEFILE_QUOTA_EXCEEDED"},
+ {STATUS_COMMITMENT_LIMIT, -EIO, "STATUS_COMMITMENT_LIMIT"},
+ {STATUS_INVALID_IMAGE_LE_FORMAT, -EIO,
+ "STATUS_INVALID_IMAGE_LE_FORMAT"},
+ {STATUS_INVALID_IMAGE_NOT_MZ, -EIO, "STATUS_INVALID_IMAGE_NOT_MZ"},
+ {STATUS_INVALID_IMAGE_PROTECT, -EIO, "STATUS_INVALID_IMAGE_PROTECT"},
+ {STATUS_INVALID_IMAGE_WIN_16, -EIO, "STATUS_INVALID_IMAGE_WIN_16"},
+ {STATUS_LOGON_SERVER_CONFLICT, -EIO, "STATUS_LOGON_SERVER_CONFLICT"},
+ {STATUS_TIME_DIFFERENCE_AT_DC, -EIO, "STATUS_TIME_DIFFERENCE_AT_DC"},
+ {STATUS_SYNCHRONIZATION_REQUIRED, -EIO,
+ "STATUS_SYNCHRONIZATION_REQUIRED"},
+ {STATUS_DLL_NOT_FOUND, -ENOENT, "STATUS_DLL_NOT_FOUND"},
+ {STATUS_OPEN_FAILED, -EIO, "STATUS_OPEN_FAILED"},
+ {STATUS_IO_PRIVILEGE_FAILED, -EIO, "STATUS_IO_PRIVILEGE_FAILED"},
+ {STATUS_ORDINAL_NOT_FOUND, -EIO, "STATUS_ORDINAL_NOT_FOUND"},
+ {STATUS_ENTRYPOINT_NOT_FOUND, -EIO, "STATUS_ENTRYPOINT_NOT_FOUND"},
+ {STATUS_CONTROL_C_EXIT, -EIO, "STATUS_CONTROL_C_EXIT"},
+ {STATUS_LOCAL_DISCONNECT, -EIO, "STATUS_LOCAL_DISCONNECT"},
+ {STATUS_REMOTE_DISCONNECT, -ESHUTDOWN, "STATUS_REMOTE_DISCONNECT"},
+ {STATUS_REMOTE_RESOURCES, -EIO, "STATUS_REMOTE_RESOURCES"},
+ {STATUS_LINK_FAILED, -EXDEV, "STATUS_LINK_FAILED"},
+ {STATUS_LINK_TIMEOUT, -ETIMEDOUT, "STATUS_LINK_TIMEOUT"},
+ {STATUS_INVALID_CONNECTION, -EIO, "STATUS_INVALID_CONNECTION"},
+ {STATUS_INVALID_ADDRESS, -EIO, "STATUS_INVALID_ADDRESS"},
+ {STATUS_DLL_INIT_FAILED, -EIO, "STATUS_DLL_INIT_FAILED"},
+ {STATUS_MISSING_SYSTEMFILE, -EIO, "STATUS_MISSING_SYSTEMFILE"},
+ {STATUS_UNHANDLED_EXCEPTION, -EIO, "STATUS_UNHANDLED_EXCEPTION"},
+ {STATUS_APP_INIT_FAILURE, -EIO, "STATUS_APP_INIT_FAILURE"},
+ {STATUS_PAGEFILE_CREATE_FAILED, -EIO, "STATUS_PAGEFILE_CREATE_FAILED"},
+ {STATUS_NO_PAGEFILE, -EIO, "STATUS_NO_PAGEFILE"},
+ {STATUS_INVALID_LEVEL, -EIO, "STATUS_INVALID_LEVEL"},
+ {STATUS_WRONG_PASSWORD_CORE, -EIO, "STATUS_WRONG_PASSWORD_CORE"},
+ {STATUS_ILLEGAL_FLOAT_CONTEXT, -EIO, "STATUS_ILLEGAL_FLOAT_CONTEXT"},
+ {STATUS_PIPE_BROKEN, -EPIPE, "STATUS_PIPE_BROKEN"},
+ {STATUS_REGISTRY_CORRUPT, -EIO, "STATUS_REGISTRY_CORRUPT"},
+ {STATUS_REGISTRY_IO_FAILED, -EIO, "STATUS_REGISTRY_IO_FAILED"},
+ {STATUS_NO_EVENT_PAIR, -EIO, "STATUS_NO_EVENT_PAIR"},
+ {STATUS_UNRECOGNIZED_VOLUME, -EIO, "STATUS_UNRECOGNIZED_VOLUME"},
+ {STATUS_SERIAL_NO_DEVICE_INITED, -EIO,
+ "STATUS_SERIAL_NO_DEVICE_INITED"},
+ {STATUS_NO_SUCH_ALIAS, -EIO, "STATUS_NO_SUCH_ALIAS"},
+ {STATUS_MEMBER_NOT_IN_ALIAS, -EIO, "STATUS_MEMBER_NOT_IN_ALIAS"},
+ {STATUS_MEMBER_IN_ALIAS, -EIO, "STATUS_MEMBER_IN_ALIAS"},
+ {STATUS_ALIAS_EXISTS, -EIO, "STATUS_ALIAS_EXISTS"},
+ {STATUS_LOGON_NOT_GRANTED, -EIO, "STATUS_LOGON_NOT_GRANTED"},
+ {STATUS_TOO_MANY_SECRETS, -EIO, "STATUS_TOO_MANY_SECRETS"},
+ {STATUS_SECRET_TOO_LONG, -EIO, "STATUS_SECRET_TOO_LONG"},
+ {STATUS_INTERNAL_DB_ERROR, -EIO, "STATUS_INTERNAL_DB_ERROR"},
+ {STATUS_FULLSCREEN_MODE, -EIO, "STATUS_FULLSCREEN_MODE"},
+ {STATUS_TOO_MANY_CONTEXT_IDS, -EIO, "STATUS_TOO_MANY_CONTEXT_IDS"},
+ {STATUS_LOGON_TYPE_NOT_GRANTED, -EIO, "STATUS_LOGON_TYPE_NOT_GRANTED"},
+ {STATUS_NOT_REGISTRY_FILE, -EIO, "STATUS_NOT_REGISTRY_FILE"},
+ {STATUS_NT_CROSS_ENCRYPTION_REQUIRED, -EIO,
+ "STATUS_NT_CROSS_ENCRYPTION_REQUIRED"},
+ {STATUS_DOMAIN_CTRLR_CONFIG_ERROR, -EIO,
+ "STATUS_DOMAIN_CTRLR_CONFIG_ERROR"},
+ {STATUS_FT_MISSING_MEMBER, -EIO, "STATUS_FT_MISSING_MEMBER"},
+ {STATUS_ILL_FORMED_SERVICE_ENTRY, -EIO,
+ "STATUS_ILL_FORMED_SERVICE_ENTRY"},
+ {STATUS_ILLEGAL_CHARACTER, -EIO, "STATUS_ILLEGAL_CHARACTER"},
+ {STATUS_UNMAPPABLE_CHARACTER, -EIO, "STATUS_UNMAPPABLE_CHARACTER"},
+ {STATUS_UNDEFINED_CHARACTER, -EIO, "STATUS_UNDEFINED_CHARACTER"},
+ {STATUS_FLOPPY_VOLUME, -EIO, "STATUS_FLOPPY_VOLUME"},
+ {STATUS_FLOPPY_ID_MARK_NOT_FOUND, -EIO,
+ "STATUS_FLOPPY_ID_MARK_NOT_FOUND"},
+ {STATUS_FLOPPY_WRONG_CYLINDER, -EIO, "STATUS_FLOPPY_WRONG_CYLINDER"},
+ {STATUS_FLOPPY_UNKNOWN_ERROR, -EIO, "STATUS_FLOPPY_UNKNOWN_ERROR"},
+ {STATUS_FLOPPY_BAD_REGISTERS, -EIO, "STATUS_FLOPPY_BAD_REGISTERS"},
+ {STATUS_DISK_RECALIBRATE_FAILED, -EIO,
+ "STATUS_DISK_RECALIBRATE_FAILED"},
+ {STATUS_DISK_OPERATION_FAILED, -EIO, "STATUS_DISK_OPERATION_FAILED"},
+ {STATUS_DISK_RESET_FAILED, -EIO, "STATUS_DISK_RESET_FAILED"},
+ {STATUS_SHARED_IRQ_BUSY, -EBUSY, "STATUS_SHARED_IRQ_BUSY"},
+ {STATUS_FT_ORPHANING, -EIO, "STATUS_FT_ORPHANING"},
+ {STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT, -EIO,
+ "STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT"},
+ {STATUS_PARTITION_FAILURE, -EIO, "STATUS_PARTITION_FAILURE"},
+ {STATUS_INVALID_BLOCK_LENGTH, -EIO, "STATUS_INVALID_BLOCK_LENGTH"},
+ {STATUS_DEVICE_NOT_PARTITIONED, -EIO, "STATUS_DEVICE_NOT_PARTITIONED"},
+ {STATUS_UNABLE_TO_LOCK_MEDIA, -EIO, "STATUS_UNABLE_TO_LOCK_MEDIA"},
+ {STATUS_UNABLE_TO_UNLOAD_MEDIA, -EIO, "STATUS_UNABLE_TO_UNLOAD_MEDIA"},
+ {STATUS_EOM_OVERFLOW, -EIO, "STATUS_EOM_OVERFLOW"},
+ {STATUS_NO_MEDIA, -EIO, "STATUS_NO_MEDIA"},
+ {STATUS_NO_SUCH_MEMBER, -EIO, "STATUS_NO_SUCH_MEMBER"},
+ {STATUS_INVALID_MEMBER, -EIO, "STATUS_INVALID_MEMBER"},
+ {STATUS_KEY_DELETED, -EIO, "STATUS_KEY_DELETED"},
+ {STATUS_NO_LOG_SPACE, -EIO, "STATUS_NO_LOG_SPACE"},
+ {STATUS_TOO_MANY_SIDS, -EIO, "STATUS_TOO_MANY_SIDS"},
+ {STATUS_LM_CROSS_ENCRYPTION_REQUIRED, -EIO,
+ "STATUS_LM_CROSS_ENCRYPTION_REQUIRED"},
+ {STATUS_KEY_HAS_CHILDREN, -EIO, "STATUS_KEY_HAS_CHILDREN"},
+ {STATUS_CHILD_MUST_BE_VOLATILE, -EIO, "STATUS_CHILD_MUST_BE_VOLATILE"},
+ {STATUS_DEVICE_CONFIGURATION_ERROR, -EIO,
+ "STATUS_DEVICE_CONFIGURATION_ERROR"},
+ {STATUS_DRIVER_INTERNAL_ERROR, -EIO, "STATUS_DRIVER_INTERNAL_ERROR"},
+ {STATUS_INVALID_DEVICE_STATE, -EIO, "STATUS_INVALID_DEVICE_STATE"},
+ {STATUS_IO_DEVICE_ERROR, -EIO, "STATUS_IO_DEVICE_ERROR"},
+ {STATUS_DEVICE_PROTOCOL_ERROR, -EIO, "STATUS_DEVICE_PROTOCOL_ERROR"},
+ {STATUS_BACKUP_CONTROLLER, -EIO, "STATUS_BACKUP_CONTROLLER"},
+ {STATUS_LOG_FILE_FULL, -EIO, "STATUS_LOG_FILE_FULL"},
+ {STATUS_TOO_LATE, -EIO, "STATUS_TOO_LATE"},
+ {STATUS_NO_TRUST_LSA_SECRET, -EIO, "STATUS_NO_TRUST_LSA_SECRET"},
+ {STATUS_NO_TRUST_SAM_ACCOUNT, -EIO, "STATUS_NO_TRUST_SAM_ACCOUNT"},
+ {STATUS_TRUSTED_DOMAIN_FAILURE, -EIO, "STATUS_TRUSTED_DOMAIN_FAILURE"},
+ {STATUS_TRUSTED_RELATIONSHIP_FAILURE, -EIO,
+ "STATUS_TRUSTED_RELATIONSHIP_FAILURE"},
+ {STATUS_EVENTLOG_FILE_CORRUPT, -EIO, "STATUS_EVENTLOG_FILE_CORRUPT"},
+ {STATUS_EVENTLOG_CANT_START, -EIO, "STATUS_EVENTLOG_CANT_START"},
+ {STATUS_TRUST_FAILURE, -EIO, "STATUS_TRUST_FAILURE"},
+ {STATUS_MUTANT_LIMIT_EXCEEDED, -EIO, "STATUS_MUTANT_LIMIT_EXCEEDED"},
+ {STATUS_NETLOGON_NOT_STARTED, -EIO, "STATUS_NETLOGON_NOT_STARTED"},
+ {STATUS_ACCOUNT_EXPIRED, -EKEYEXPIRED, "STATUS_ACCOUNT_EXPIRED"},
+ {STATUS_POSSIBLE_DEADLOCK, -EIO, "STATUS_POSSIBLE_DEADLOCK"},
+ {STATUS_NETWORK_CREDENTIAL_CONFLICT, -EIO,
+ "STATUS_NETWORK_CREDENTIAL_CONFLICT"},
+ {STATUS_REMOTE_SESSION_LIMIT, -EIO, "STATUS_REMOTE_SESSION_LIMIT"},
+ {STATUS_EVENTLOG_FILE_CHANGED, -EIO, "STATUS_EVENTLOG_FILE_CHANGED"},
+ {STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT, -EIO,
+ "STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT"},
+ {STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT, -EIO,
+ "STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT"},
+ {STATUS_NOLOGON_SERVER_TRUST_ACCOUNT, -EIO,
+ "STATUS_NOLOGON_SERVER_TRUST_ACCOUNT"},
+ {STATUS_DOMAIN_TRUST_INCONSISTENT, -EIO,
+ "STATUS_DOMAIN_TRUST_INCONSISTENT"},
+ {STATUS_FS_DRIVER_REQUIRED, -EIO, "STATUS_FS_DRIVER_REQUIRED"},
+ {STATUS_IMAGE_ALREADY_LOADED_AS_DLL, -EIO,
+ "STATUS_IMAGE_ALREADY_LOADED_AS_DLL"},
+ {STATUS_NETWORK_OPEN_RESTRICTION, -EIO,
+ "STATUS_NETWORK_OPEN_RESTRICTION"},
+ {STATUS_NO_USER_SESSION_KEY, -EIO, "STATUS_NO_USER_SESSION_KEY"},
+ {STATUS_USER_SESSION_DELETED, -EIO, "STATUS_USER_SESSION_DELETED"},
+ {STATUS_RESOURCE_LANG_NOT_FOUND, -EIO,
+ "STATUS_RESOURCE_LANG_NOT_FOUND"},
+ {STATUS_INSUFF_SERVER_RESOURCES, -EIO,
+ "STATUS_INSUFF_SERVER_RESOURCES"},
+ {STATUS_INVALID_BUFFER_SIZE, -EIO, "STATUS_INVALID_BUFFER_SIZE"},
+ {STATUS_INVALID_ADDRESS_COMPONENT, -EIO,
+ "STATUS_INVALID_ADDRESS_COMPONENT"},
+ {STATUS_INVALID_ADDRESS_WILDCARD, -EIO,
+ "STATUS_INVALID_ADDRESS_WILDCARD"},
+ {STATUS_TOO_MANY_ADDRESSES, -EIO, "STATUS_TOO_MANY_ADDRESSES"},
+ {STATUS_ADDRESS_ALREADY_EXISTS, -EADDRINUSE,
+ "STATUS_ADDRESS_ALREADY_EXISTS"},
+ {STATUS_ADDRESS_CLOSED, -EIO, "STATUS_ADDRESS_CLOSED"},
+ {STATUS_CONNECTION_DISCONNECTED, -ECONNABORTED,
+ "STATUS_CONNECTION_DISCONNECTED"},
+ {STATUS_CONNECTION_RESET, -ENETRESET, "STATUS_CONNECTION_RESET"},
+ {STATUS_TOO_MANY_NODES, -EIO, "STATUS_TOO_MANY_NODES"},
+ {STATUS_TRANSACTION_ABORTED, -EIO, "STATUS_TRANSACTION_ABORTED"},
+ {STATUS_TRANSACTION_TIMED_OUT, -EIO, "STATUS_TRANSACTION_TIMED_OUT"},
+ {STATUS_TRANSACTION_NO_RELEASE, -EIO, "STATUS_TRANSACTION_NO_RELEASE"},
+ {STATUS_TRANSACTION_NO_MATCH, -EIO, "STATUS_TRANSACTION_NO_MATCH"},
+ {STATUS_TRANSACTION_RESPONDED, -EIO, "STATUS_TRANSACTION_RESPONDED"},
+ {STATUS_TRANSACTION_INVALID_ID, -EIO, "STATUS_TRANSACTION_INVALID_ID"},
+ {STATUS_TRANSACTION_INVALID_TYPE, -EIO,
+ "STATUS_TRANSACTION_INVALID_TYPE"},
+ {STATUS_NOT_SERVER_SESSION, -EIO, "STATUS_NOT_SERVER_SESSION"},
+ {STATUS_NOT_CLIENT_SESSION, -EIO, "STATUS_NOT_CLIENT_SESSION"},
+ {STATUS_CANNOT_LOAD_REGISTRY_FILE, -EIO,
+ "STATUS_CANNOT_LOAD_REGISTRY_FILE"},
+ {STATUS_DEBUG_ATTACH_FAILED, -EIO, "STATUS_DEBUG_ATTACH_FAILED"},
+ {STATUS_SYSTEM_PROCESS_TERMINATED, -EIO,
+ "STATUS_SYSTEM_PROCESS_TERMINATED"},
+ {STATUS_DATA_NOT_ACCEPTED, -EIO, "STATUS_DATA_NOT_ACCEPTED"},
+ {STATUS_NO_BROWSER_SERVERS_FOUND, -EIO,
+ "STATUS_NO_BROWSER_SERVERS_FOUND"},
+ {STATUS_VDM_HARD_ERROR, -EIO, "STATUS_VDM_HARD_ERROR"},
+ {STATUS_DRIVER_CANCEL_TIMEOUT, -EIO, "STATUS_DRIVER_CANCEL_TIMEOUT"},
+ {STATUS_REPLY_MESSAGE_MISMATCH, -EIO, "STATUS_REPLY_MESSAGE_MISMATCH"},
+ {STATUS_MAPPED_ALIGNMENT, -EIO, "STATUS_MAPPED_ALIGNMENT"},
+ {STATUS_IMAGE_CHECKSUM_MISMATCH, -EIO,
+ "STATUS_IMAGE_CHECKSUM_MISMATCH"},
+ {STATUS_LOST_WRITEBEHIND_DATA, -EIO, "STATUS_LOST_WRITEBEHIND_DATA"},
+ {STATUS_CLIENT_SERVER_PARAMETERS_INVALID, -EIO,
+ "STATUS_CLIENT_SERVER_PARAMETERS_INVALID"},
+ {STATUS_PASSWORD_MUST_CHANGE, -EIO, "STATUS_PASSWORD_MUST_CHANGE"},
+ {STATUS_NOT_FOUND, -ENOENT, "STATUS_NOT_FOUND"},
+ {STATUS_NOT_TINY_STREAM, -EIO, "STATUS_NOT_TINY_STREAM"},
+ {STATUS_RECOVERY_FAILURE, -EIO, "STATUS_RECOVERY_FAILURE"},
+ {STATUS_STACK_OVERFLOW_READ, -EIO, "STATUS_STACK_OVERFLOW_READ"},
+ {STATUS_FAIL_CHECK, -EIO, "STATUS_FAIL_CHECK"},
+ {STATUS_DUPLICATE_OBJECTID, -EIO, "STATUS_DUPLICATE_OBJECTID"},
+ {STATUS_OBJECTID_EXISTS, -EIO, "STATUS_OBJECTID_EXISTS"},
+ {STATUS_CONVERT_TO_LARGE, -EIO, "STATUS_CONVERT_TO_LARGE"},
+ {STATUS_RETRY, -EAGAIN, "STATUS_RETRY"},
+ {STATUS_FOUND_OUT_OF_SCOPE, -EIO, "STATUS_FOUND_OUT_OF_SCOPE"},
+ {STATUS_ALLOCATE_BUCKET, -EIO, "STATUS_ALLOCATE_BUCKET"},
+ {STATUS_PROPSET_NOT_FOUND, -EIO, "STATUS_PROPSET_NOT_FOUND"},
+ {STATUS_MARSHALL_OVERFLOW, -EIO, "STATUS_MARSHALL_OVERFLOW"},
+ {STATUS_INVALID_VARIANT, -EIO, "STATUS_INVALID_VARIANT"},
+ {STATUS_DOMAIN_CONTROLLER_NOT_FOUND, -EIO,
+ "STATUS_DOMAIN_CONTROLLER_NOT_FOUND"},
+ {STATUS_ACCOUNT_LOCKED_OUT, -EIO, "STATUS_ACCOUNT_LOCKED_OUT"},
+ {STATUS_HANDLE_NOT_CLOSABLE, -EIO, "STATUS_HANDLE_NOT_CLOSABLE"},
+ {STATUS_CONNECTION_REFUSED, -EIO, "STATUS_CONNECTION_REFUSED"},
+ {STATUS_GRACEFUL_DISCONNECT, -EIO, "STATUS_GRACEFUL_DISCONNECT"},
+ {STATUS_ADDRESS_ALREADY_ASSOCIATED, -EIO,
+ "STATUS_ADDRESS_ALREADY_ASSOCIATED"},
+ {STATUS_ADDRESS_NOT_ASSOCIATED, -EIO, "STATUS_ADDRESS_NOT_ASSOCIATED"},
+ {STATUS_CONNECTION_INVALID, -EIO, "STATUS_CONNECTION_INVALID"},
+ {STATUS_CONNECTION_ACTIVE, -EIO, "STATUS_CONNECTION_ACTIVE"},
+ {STATUS_NETWORK_UNREACHABLE, -ENETUNREACH,
+ "STATUS_NETWORK_UNREACHABLE"},
+ {STATUS_HOST_UNREACHABLE, -EHOSTDOWN, "STATUS_HOST_UNREACHABLE"},
+ {STATUS_PROTOCOL_UNREACHABLE, -ENETUNREACH,
+ "STATUS_PROTOCOL_UNREACHABLE"},
+ {STATUS_PORT_UNREACHABLE, -ENETUNREACH, "STATUS_PORT_UNREACHABLE"},
+ {STATUS_REQUEST_ABORTED, -EIO, "STATUS_REQUEST_ABORTED"},
+ {STATUS_CONNECTION_ABORTED, -ECONNABORTED, "STATUS_CONNECTION_ABORTED"},
+ {STATUS_BAD_COMPRESSION_BUFFER, -EIO, "STATUS_BAD_COMPRESSION_BUFFER"},
+ {STATUS_USER_MAPPED_FILE, -EIO, "STATUS_USER_MAPPED_FILE"},
+ {STATUS_AUDIT_FAILED, -EIO, "STATUS_AUDIT_FAILED"},
+ {STATUS_TIMER_RESOLUTION_NOT_SET, -EIO,
+ "STATUS_TIMER_RESOLUTION_NOT_SET"},
+ {STATUS_CONNECTION_COUNT_LIMIT, -EIO, "STATUS_CONNECTION_COUNT_LIMIT"},
+ {STATUS_LOGIN_TIME_RESTRICTION, -EACCES,
+ "STATUS_LOGIN_TIME_RESTRICTION"},
+ {STATUS_LOGIN_WKSTA_RESTRICTION, -EACCES,
+ "STATUS_LOGIN_WKSTA_RESTRICTION"},
+ {STATUS_IMAGE_MP_UP_MISMATCH, -EIO, "STATUS_IMAGE_MP_UP_MISMATCH"},
+ {STATUS_INSUFFICIENT_LOGON_INFO, -EIO,
+ "STATUS_INSUFFICIENT_LOGON_INFO"},
+ {STATUS_BAD_DLL_ENTRYPOINT, -EIO, "STATUS_BAD_DLL_ENTRYPOINT"},
+ {STATUS_BAD_SERVICE_ENTRYPOINT, -EIO, "STATUS_BAD_SERVICE_ENTRYPOINT"},
+ {STATUS_LPC_REPLY_LOST, -EIO, "STATUS_LPC_REPLY_LOST"},
+ {STATUS_IP_ADDRESS_CONFLICT1, -EIO, "STATUS_IP_ADDRESS_CONFLICT1"},
+ {STATUS_IP_ADDRESS_CONFLICT2, -EIO, "STATUS_IP_ADDRESS_CONFLICT2"},
+ {STATUS_REGISTRY_QUOTA_LIMIT, -EDQUOT, "STATUS_REGISTRY_QUOTA_LIMIT"},
+ {STATUS_PATH_NOT_COVERED, -EREMOTE, "STATUS_PATH_NOT_COVERED"},
+ {STATUS_NO_CALLBACK_ACTIVE, -EIO, "STATUS_NO_CALLBACK_ACTIVE"},
+ {STATUS_LICENSE_QUOTA_EXCEEDED, -EACCES,
+ "STATUS_LICENSE_QUOTA_EXCEEDED"},
+ {STATUS_PWD_TOO_SHORT, -EIO, "STATUS_PWD_TOO_SHORT"},
+ {STATUS_PWD_TOO_RECENT, -EIO, "STATUS_PWD_TOO_RECENT"},
+ {STATUS_PWD_HISTORY_CONFLICT, -EIO, "STATUS_PWD_HISTORY_CONFLICT"},
+ {STATUS_PLUGPLAY_NO_DEVICE, -EIO, "STATUS_PLUGPLAY_NO_DEVICE"},
+ {STATUS_UNSUPPORTED_COMPRESSION, -EIO,
+ "STATUS_UNSUPPORTED_COMPRESSION"},
+ {STATUS_INVALID_HW_PROFILE, -EIO, "STATUS_INVALID_HW_PROFILE"},
+ {STATUS_INVALID_PLUGPLAY_DEVICE_PATH, -EIO,
+ "STATUS_INVALID_PLUGPLAY_DEVICE_PATH"},
+ {STATUS_DRIVER_ORDINAL_NOT_FOUND, -EIO,
+ "STATUS_DRIVER_ORDINAL_NOT_FOUND"},
+ {STATUS_DRIVER_ENTRYPOINT_NOT_FOUND, -EIO,
+ "STATUS_DRIVER_ENTRYPOINT_NOT_FOUND"},
+ {STATUS_RESOURCE_NOT_OWNED, -EIO, "STATUS_RESOURCE_NOT_OWNED"},
+ {STATUS_TOO_MANY_LINKS, -EMLINK, "STATUS_TOO_MANY_LINKS"},
+ {STATUS_QUOTA_LIST_INCONSISTENT, -EIO,
+ "STATUS_QUOTA_LIST_INCONSISTENT"},
+ {STATUS_FILE_IS_OFFLINE, -EIO, "STATUS_FILE_IS_OFFLINE"},
+ {STATUS_EVALUATION_EXPIRATION, -EIO, "STATUS_EVALUATION_EXPIRATION"},
+ {STATUS_ILLEGAL_DLL_RELOCATION, -EIO, "STATUS_ILLEGAL_DLL_RELOCATION"},
+ {STATUS_LICENSE_VIOLATION, -EIO, "STATUS_LICENSE_VIOLATION"},
+ {STATUS_DLL_INIT_FAILED_LOGOFF, -EIO, "STATUS_DLL_INIT_FAILED_LOGOFF"},
+ {STATUS_DRIVER_UNABLE_TO_LOAD, -EIO, "STATUS_DRIVER_UNABLE_TO_LOAD"},
+ {STATUS_DFS_UNAVAILABLE, -EIO, "STATUS_DFS_UNAVAILABLE"},
+ {STATUS_VOLUME_DISMOUNTED, -EIO, "STATUS_VOLUME_DISMOUNTED"},
+ {STATUS_WX86_INTERNAL_ERROR, -EIO, "STATUS_WX86_INTERNAL_ERROR"},
+ {STATUS_WX86_FLOAT_STACK_CHECK, -EIO, "STATUS_WX86_FLOAT_STACK_CHECK"},
+ {STATUS_VALIDATE_CONTINUE, -EIO, "STATUS_VALIDATE_CONTINUE"},
+ {STATUS_NO_MATCH, -EIO, "STATUS_NO_MATCH"},
+ {STATUS_NO_MORE_MATCHES, -EIO, "STATUS_NO_MORE_MATCHES"},
+ {STATUS_NOT_A_REPARSE_POINT, -EIO, "STATUS_NOT_A_REPARSE_POINT"},
+ {STATUS_IO_REPARSE_TAG_INVALID, -EIO, "STATUS_IO_REPARSE_TAG_INVALID"},
+ {STATUS_IO_REPARSE_TAG_MISMATCH, -EIO,
+ "STATUS_IO_REPARSE_TAG_MISMATCH"},
+ {STATUS_IO_REPARSE_DATA_INVALID, -EIO,
+ "STATUS_IO_REPARSE_DATA_INVALID"},
+ {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO,
+ "STATUS_IO_REPARSE_TAG_NOT_HANDLED"},
+ {STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO,
+ "STATUS_REPARSE_POINT_NOT_RESOLVED"},
+ {STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO,
+ "STATUS_DIRECTORY_IS_A_REPARSE_POINT"},
+ {STATUS_RANGE_LIST_CONFLICT, -EIO, "STATUS_RANGE_LIST_CONFLICT"},
+ {STATUS_SOURCE_ELEMENT_EMPTY, -EIO, "STATUS_SOURCE_ELEMENT_EMPTY"},
+ {STATUS_DESTINATION_ELEMENT_FULL, -EIO,
+ "STATUS_DESTINATION_ELEMENT_FULL"},
+ {STATUS_ILLEGAL_ELEMENT_ADDRESS, -EIO,
+ "STATUS_ILLEGAL_ELEMENT_ADDRESS"},
+ {STATUS_MAGAZINE_NOT_PRESENT, -EIO, "STATUS_MAGAZINE_NOT_PRESENT"},
+ {STATUS_REINITIALIZATION_NEEDED, -EIO,
+ "STATUS_REINITIALIZATION_NEEDED"},
+ {STATUS_ENCRYPTION_FAILED, -EIO, "STATUS_ENCRYPTION_FAILED"},
+ {STATUS_DECRYPTION_FAILED, -EIO, "STATUS_DECRYPTION_FAILED"},
+ {STATUS_RANGE_NOT_FOUND, -EIO, "STATUS_RANGE_NOT_FOUND"},
+ {STATUS_NO_RECOVERY_POLICY, -EIO, "STATUS_NO_RECOVERY_POLICY"},
+ {STATUS_NO_EFS, -EIO, "STATUS_NO_EFS"},
+ {STATUS_WRONG_EFS, -EIO, "STATUS_WRONG_EFS"},
+ {STATUS_NO_USER_KEYS, -EIO, "STATUS_NO_USER_KEYS"},
+ {STATUS_FILE_NOT_ENCRYPTED, -EIO, "STATUS_FILE_NOT_ENCRYPTED"},
+ {STATUS_NOT_EXPORT_FORMAT, -EIO, "STATUS_NOT_EXPORT_FORMAT"},
+ {STATUS_FILE_ENCRYPTED, -EIO, "STATUS_FILE_ENCRYPTED"},
+ {STATUS_WMI_GUID_NOT_FOUND, -EIO, "STATUS_WMI_GUID_NOT_FOUND"},
+ {STATUS_WMI_INSTANCE_NOT_FOUND, -EIO, "STATUS_WMI_INSTANCE_NOT_FOUND"},
+ {STATUS_WMI_ITEMID_NOT_FOUND, -EIO, "STATUS_WMI_ITEMID_NOT_FOUND"},
+ {STATUS_WMI_TRY_AGAIN, -EIO, "STATUS_WMI_TRY_AGAIN"},
+ {STATUS_SHARED_POLICY, -EIO, "STATUS_SHARED_POLICY"},
+ {STATUS_POLICY_OBJECT_NOT_FOUND, -EIO,
+ "STATUS_POLICY_OBJECT_NOT_FOUND"},
+ {STATUS_POLICY_ONLY_IN_DS, -EIO, "STATUS_POLICY_ONLY_IN_DS"},
+ {STATUS_VOLUME_NOT_UPGRADED, -EIO, "STATUS_VOLUME_NOT_UPGRADED"},
+ {STATUS_REMOTE_STORAGE_NOT_ACTIVE, -EIO,
+ "STATUS_REMOTE_STORAGE_NOT_ACTIVE"},
+ {STATUS_REMOTE_STORAGE_MEDIA_ERROR, -EIO,
+ "STATUS_REMOTE_STORAGE_MEDIA_ERROR"},
+ {STATUS_NO_TRACKING_SERVICE, -EIO, "STATUS_NO_TRACKING_SERVICE"},
+ {STATUS_SERVER_SID_MISMATCH, -EIO, "STATUS_SERVER_SID_MISMATCH"},
+ {STATUS_DS_NO_ATTRIBUTE_OR_VALUE, -EIO,
+ "STATUS_DS_NO_ATTRIBUTE_OR_VALUE"},
+ {STATUS_DS_INVALID_ATTRIBUTE_SYNTAX, -EIO,
+ "STATUS_DS_INVALID_ATTRIBUTE_SYNTAX"},
+ {STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED, -EIO,
+ "STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED"},
+ {STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS, -EIO,
+ "STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS"},
+ {STATUS_DS_BUSY, -EBUSY, "STATUS_DS_BUSY"},
+ {STATUS_DS_UNAVAILABLE, -EIO, "STATUS_DS_UNAVAILABLE"},
+ {STATUS_DS_NO_RIDS_ALLOCATED, -EIO, "STATUS_DS_NO_RIDS_ALLOCATED"},
+ {STATUS_DS_NO_MORE_RIDS, -EIO, "STATUS_DS_NO_MORE_RIDS"},
+ {STATUS_DS_INCORRECT_ROLE_OWNER, -EIO,
+ "STATUS_DS_INCORRECT_ROLE_OWNER"},
+ {STATUS_DS_RIDMGR_INIT_ERROR, -EIO, "STATUS_DS_RIDMGR_INIT_ERROR"},
+ {STATUS_DS_OBJ_CLASS_VIOLATION, -EIO, "STATUS_DS_OBJ_CLASS_VIOLATION"},
+ {STATUS_DS_CANT_ON_NON_LEAF, -EIO, "STATUS_DS_CANT_ON_NON_LEAF"},
+ {STATUS_DS_CANT_ON_RDN, -EIO, "STATUS_DS_CANT_ON_RDN"},
+ {STATUS_DS_CANT_MOD_OBJ_CLASS, -EIO, "STATUS_DS_CANT_MOD_OBJ_CLASS"},
+ {STATUS_DS_CROSS_DOM_MOVE_FAILED, -EIO,
+ "STATUS_DS_CROSS_DOM_MOVE_FAILED"},
+ {STATUS_DS_GC_NOT_AVAILABLE, -EIO, "STATUS_DS_GC_NOT_AVAILABLE"},
+ {STATUS_DIRECTORY_SERVICE_REQUIRED, -EIO,
+ "STATUS_DIRECTORY_SERVICE_REQUIRED"},
+ {STATUS_REPARSE_ATTRIBUTE_CONFLICT, -EIO,
+ "STATUS_REPARSE_ATTRIBUTE_CONFLICT"},
+ {STATUS_CANT_ENABLE_DENY_ONLY, -EIO, "STATUS_CANT_ENABLE_DENY_ONLY"},
+ {STATUS_FLOAT_MULTIPLE_FAULTS, -EIO, "STATUS_FLOAT_MULTIPLE_FAULTS"},
+ {STATUS_FLOAT_MULTIPLE_TRAPS, -EIO, "STATUS_FLOAT_MULTIPLE_TRAPS"},
+ {STATUS_DEVICE_REMOVED, -EIO, "STATUS_DEVICE_REMOVED"},
+ {STATUS_JOURNAL_DELETE_IN_PROGRESS, -EIO,
+ "STATUS_JOURNAL_DELETE_IN_PROGRESS"},
+ {STATUS_JOURNAL_NOT_ACTIVE, -EIO, "STATUS_JOURNAL_NOT_ACTIVE"},
+ {STATUS_NOINTERFACE, -EIO, "STATUS_NOINTERFACE"},
+ {STATUS_DS_ADMIN_LIMIT_EXCEEDED, -EIO,
+ "STATUS_DS_ADMIN_LIMIT_EXCEEDED"},
+ {STATUS_DRIVER_FAILED_SLEEP, -EIO, "STATUS_DRIVER_FAILED_SLEEP"},
+ {STATUS_MUTUAL_AUTHENTICATION_FAILED, -EIO,
+ "STATUS_MUTUAL_AUTHENTICATION_FAILED"},
+ {STATUS_CORRUPT_SYSTEM_FILE, -EIO, "STATUS_CORRUPT_SYSTEM_FILE"},
+ {STATUS_DATATYPE_MISALIGNMENT_ERROR, -EIO,
+ "STATUS_DATATYPE_MISALIGNMENT_ERROR"},
+ {STATUS_WMI_READ_ONLY, -EROFS, "STATUS_WMI_READ_ONLY"},
+ {STATUS_WMI_SET_FAILURE, -EIO, "STATUS_WMI_SET_FAILURE"},
+ {STATUS_COMMITMENT_MINIMUM, -EIO, "STATUS_COMMITMENT_MINIMUM"},
+ {STATUS_REG_NAT_CONSUMPTION, -EIO, "STATUS_REG_NAT_CONSUMPTION"},
+ {STATUS_TRANSPORT_FULL, -EIO, "STATUS_TRANSPORT_FULL"},
+ {STATUS_DS_SAM_INIT_FAILURE, -EIO, "STATUS_DS_SAM_INIT_FAILURE"},
+ {STATUS_ONLY_IF_CONNECTED, -EIO, "STATUS_ONLY_IF_CONNECTED"},
+ {STATUS_DS_SENSITIVE_GROUP_VIOLATION, -EIO,
+ "STATUS_DS_SENSITIVE_GROUP_VIOLATION"},
+ {STATUS_PNP_RESTART_ENUMERATION, -EIO,
+ "STATUS_PNP_RESTART_ENUMERATION"},
+ {STATUS_JOURNAL_ENTRY_DELETED, -EIO, "STATUS_JOURNAL_ENTRY_DELETED"},
+ {STATUS_DS_CANT_MOD_PRIMARYGROUPID, -EIO,
+ "STATUS_DS_CANT_MOD_PRIMARYGROUPID"},
+ {STATUS_SYSTEM_IMAGE_BAD_SIGNATURE, -EIO,
+ "STATUS_SYSTEM_IMAGE_BAD_SIGNATURE"},
+ {STATUS_PNP_REBOOT_REQUIRED, -EIO, "STATUS_PNP_REBOOT_REQUIRED"},
+ {STATUS_POWER_STATE_INVALID, -EIO, "STATUS_POWER_STATE_INVALID"},
+ {STATUS_DS_INVALID_GROUP_TYPE, -EIO, "STATUS_DS_INVALID_GROUP_TYPE"},
+ {STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN, -EIO,
+ "STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN"},
+ {STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN, -EIO,
+ "STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN"},
+ {STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER, -EIO,
+ "STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER"},
+ {STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER, -EIO,
+ "STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER"},
+ {STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER, -EIO,
+ "STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER"},
+ {STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER, -EIO,
+ "STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER"},
+ {STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER, -EIO,
+ "STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER"},
+ {STATUS_DS_HAVE_PRIMARY_MEMBERS, -EIO,
+ "STATUS_DS_HAVE_PRIMARY_MEMBERS"},
+ {STATUS_WMI_NOT_SUPPORTED, -EOPNOTSUPP, "STATUS_WMI_NOT_SUPPORTED"},
+ {STATUS_INSUFFICIENT_POWER, -EIO, "STATUS_INSUFFICIENT_POWER"},
+ {STATUS_SAM_NEED_BOOTKEY_PASSWORD, -EIO,
+ "STATUS_SAM_NEED_BOOTKEY_PASSWORD"},
+ {STATUS_SAM_NEED_BOOTKEY_FLOPPY, -EIO,
+ "STATUS_SAM_NEED_BOOTKEY_FLOPPY"},
+ {STATUS_DS_CANT_START, -EIO, "STATUS_DS_CANT_START"},
+ {STATUS_DS_INIT_FAILURE, -EIO, "STATUS_DS_INIT_FAILURE"},
+ {STATUS_SAM_INIT_FAILURE, -EIO, "STATUS_SAM_INIT_FAILURE"},
+ {STATUS_DS_GC_REQUIRED, -EIO, "STATUS_DS_GC_REQUIRED"},
+ {STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY, -EIO,
+ "STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY"},
+ {STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS, -EIO,
+ "STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS"},
+ {STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED, -EDQUOT,
+ "STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED"},
+ {STATUS_MULTIPLE_FAULT_VIOLATION, -EIO,
+ "STATUS_MULTIPLE_FAULT_VIOLATION"},
+ {STATUS_CURRENT_DOMAIN_NOT_ALLOWED, -EIO,
+ "STATUS_CURRENT_DOMAIN_NOT_ALLOWED"},
+ {STATUS_CANNOT_MAKE, -EIO, "STATUS_CANNOT_MAKE"},
+ {STATUS_SYSTEM_SHUTDOWN, -EIO, "STATUS_SYSTEM_SHUTDOWN"},
+ {STATUS_DS_INIT_FAILURE_CONSOLE, -EIO,
+ "STATUS_DS_INIT_FAILURE_CONSOLE"},
+ {STATUS_DS_SAM_INIT_FAILURE_CONSOLE, -EIO,
+ "STATUS_DS_SAM_INIT_FAILURE_CONSOLE"},
+ {STATUS_UNFINISHED_CONTEXT_DELETED, -EIO,
+ "STATUS_UNFINISHED_CONTEXT_DELETED"},
+ {STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"},
+ {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"},
+ {STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"},
+ {STATUS_WRONG_CREDENTIAL_HANDLE, -EIO,
+ "STATUS_WRONG_CREDENTIAL_HANDLE"},
+ {STATUS_CRYPTO_SYSTEM_INVALID, -EIO, "STATUS_CRYPTO_SYSTEM_INVALID"},
+ {STATUS_MAX_REFERRALS_EXCEEDED, -EIO, "STATUS_MAX_REFERRALS_EXCEEDED"},
+ {STATUS_MUST_BE_KDC, -EIO, "STATUS_MUST_BE_KDC"},
+ {STATUS_STRONG_CRYPTO_NOT_SUPPORTED, -EIO,
+ "STATUS_STRONG_CRYPTO_NOT_SUPPORTED"},
+ {STATUS_TOO_MANY_PRINCIPALS, -EIO, "STATUS_TOO_MANY_PRINCIPALS"},
+ {STATUS_NO_PA_DATA, -EIO, "STATUS_NO_PA_DATA"},
+ {STATUS_PKINIT_NAME_MISMATCH, -EIO, "STATUS_PKINIT_NAME_MISMATCH"},
+ {STATUS_SMARTCARD_LOGON_REQUIRED, -EIO,
+ "STATUS_SMARTCARD_LOGON_REQUIRED"},
+ {STATUS_KDC_INVALID_REQUEST, -EIO, "STATUS_KDC_INVALID_REQUEST"},
+ {STATUS_KDC_UNABLE_TO_REFER, -EIO, "STATUS_KDC_UNABLE_TO_REFER"},
+ {STATUS_KDC_UNKNOWN_ETYPE, -EIO, "STATUS_KDC_UNKNOWN_ETYPE"},
+ {STATUS_SHUTDOWN_IN_PROGRESS, -EIO, "STATUS_SHUTDOWN_IN_PROGRESS"},
+ {STATUS_SERVER_SHUTDOWN_IN_PROGRESS, -EIO,
+ "STATUS_SERVER_SHUTDOWN_IN_PROGRESS"},
+ {STATUS_NOT_SUPPORTED_ON_SBS, -EOPNOTSUPP,
+ "STATUS_NOT_SUPPORTED_ON_SBS"},
+ {STATUS_WMI_GUID_DISCONNECTED, -EIO, "STATUS_WMI_GUID_DISCONNECTED"},
+ {STATUS_WMI_ALREADY_DISABLED, -EIO, "STATUS_WMI_ALREADY_DISABLED"},
+ {STATUS_WMI_ALREADY_ENABLED, -EIO, "STATUS_WMI_ALREADY_ENABLED"},
+ {STATUS_MFT_TOO_FRAGMENTED, -EIO, "STATUS_MFT_TOO_FRAGMENTED"},
+ {STATUS_COPY_PROTECTION_FAILURE, -EIO,
+ "STATUS_COPY_PROTECTION_FAILURE"},
+ {STATUS_CSS_AUTHENTICATION_FAILURE, -EIO,
+ "STATUS_CSS_AUTHENTICATION_FAILURE"},
+ {STATUS_CSS_KEY_NOT_PRESENT, -EIO, "STATUS_CSS_KEY_NOT_PRESENT"},
+ {STATUS_CSS_KEY_NOT_ESTABLISHED, -EIO,
+ "STATUS_CSS_KEY_NOT_ESTABLISHED"},
+ {STATUS_CSS_SCRAMBLED_SECTOR, -EIO, "STATUS_CSS_SCRAMBLED_SECTOR"},
+ {STATUS_CSS_REGION_MISMATCH, -EIO, "STATUS_CSS_REGION_MISMATCH"},
+ {STATUS_CSS_RESETS_EXHAUSTED, -EIO, "STATUS_CSS_RESETS_EXHAUSTED"},
+ {STATUS_PKINIT_FAILURE, -EIO, "STATUS_PKINIT_FAILURE"},
+ {STATUS_SMARTCARD_SUBSYSTEM_FAILURE, -EIO,
+ "STATUS_SMARTCARD_SUBSYSTEM_FAILURE"},
+ {STATUS_NO_KERB_KEY, -EIO, "STATUS_NO_KERB_KEY"},
+ {STATUS_HOST_DOWN, -EIO, "STATUS_HOST_DOWN"},
+ {STATUS_UNSUPPORTED_PREAUTH, -EIO, "STATUS_UNSUPPORTED_PREAUTH"},
+ {STATUS_EFS_ALG_BLOB_TOO_BIG, -EIO, "STATUS_EFS_ALG_BLOB_TOO_BIG"},
+ {STATUS_PORT_NOT_SET, -EIO, "STATUS_PORT_NOT_SET"},
+ {STATUS_DEBUGGER_INACTIVE, -EIO, "STATUS_DEBUGGER_INACTIVE"},
+ {STATUS_DS_VERSION_CHECK_FAILURE, -EIO,
+ "STATUS_DS_VERSION_CHECK_FAILURE"},
+ {STATUS_AUDITING_DISABLED, -EIO, "STATUS_AUDITING_DISABLED"},
+ {STATUS_PRENT4_MACHINE_ACCOUNT, -EIO, "STATUS_PRENT4_MACHINE_ACCOUNT"},
+ {STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER, -EIO,
+ "STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER"},
+ {STATUS_INVALID_IMAGE_WIN_32, -EIO, "STATUS_INVALID_IMAGE_WIN_32"},
+ {STATUS_INVALID_IMAGE_WIN_64, -EIO, "STATUS_INVALID_IMAGE_WIN_64"},
+ {STATUS_BAD_BINDINGS, -EIO, "STATUS_BAD_BINDINGS"},
+ {STATUS_NETWORK_SESSION_EXPIRED, -EIO,
+ "STATUS_NETWORK_SESSION_EXPIRED"},
+ {STATUS_APPHELP_BLOCK, -EIO, "STATUS_APPHELP_BLOCK"},
+ {STATUS_ALL_SIDS_FILTERED, -EIO, "STATUS_ALL_SIDS_FILTERED"},
+ {STATUS_NOT_SAFE_MODE_DRIVER, -EIO, "STATUS_NOT_SAFE_MODE_DRIVER"},
+ {STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT, -EACCES,
+ "STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT"},
+ {STATUS_ACCESS_DISABLED_BY_POLICY_PATH, -EACCES,
+ "STATUS_ACCESS_DISABLED_BY_POLICY_PATH"},
+ {STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER, -EACCES,
+ "STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER"},
+ {STATUS_ACCESS_DISABLED_BY_POLICY_OTHER, -EACCES,
+ "STATUS_ACCESS_DISABLED_BY_POLICY_OTHER"},
+ {STATUS_FAILED_DRIVER_ENTRY, -EIO, "STATUS_FAILED_DRIVER_ENTRY"},
+ {STATUS_DEVICE_ENUMERATION_ERROR, -EIO,
+ "STATUS_DEVICE_ENUMERATION_ERROR"},
+ {STATUS_MOUNT_POINT_NOT_RESOLVED, -EIO,
+ "STATUS_MOUNT_POINT_NOT_RESOLVED"},
+ {STATUS_INVALID_DEVICE_OBJECT_PARAMETER, -EIO,
+ "STATUS_INVALID_DEVICE_OBJECT_PARAMETER"},
+ {STATUS_MCA_OCCURED, -EIO, "STATUS_MCA_OCCURED"},
+ {STATUS_DRIVER_BLOCKED_CRITICAL, -EIO,
+ "STATUS_DRIVER_BLOCKED_CRITICAL"},
+ {STATUS_DRIVER_BLOCKED, -EIO, "STATUS_DRIVER_BLOCKED"},
+ {STATUS_DRIVER_DATABASE_ERROR, -EIO, "STATUS_DRIVER_DATABASE_ERROR"},
+ {STATUS_SYSTEM_HIVE_TOO_LARGE, -EIO, "STATUS_SYSTEM_HIVE_TOO_LARGE"},
+ {STATUS_INVALID_IMPORT_OF_NON_DLL, -EIO,
+ "STATUS_INVALID_IMPORT_OF_NON_DLL"},
+ {STATUS_NO_SECRETS, -EIO, "STATUS_NO_SECRETS"},
+ {STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY, -EACCES,
+ "STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY"},
+ {STATUS_FAILED_STACK_SWITCH, -EIO, "STATUS_FAILED_STACK_SWITCH"},
+ {STATUS_HEAP_CORRUPTION, -EIO, "STATUS_HEAP_CORRUPTION"},
+ {STATUS_SMARTCARD_WRONG_PIN, -EIO, "STATUS_SMARTCARD_WRONG_PIN"},
+ {STATUS_SMARTCARD_CARD_BLOCKED, -EIO, "STATUS_SMARTCARD_CARD_BLOCKED"},
+ {STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED, -EIO,
+ "STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED"},
+ {STATUS_SMARTCARD_NO_CARD, -EIO, "STATUS_SMARTCARD_NO_CARD"},
+ {STATUS_SMARTCARD_NO_KEY_CONTAINER, -EIO,
+ "STATUS_SMARTCARD_NO_KEY_CONTAINER"},
+ {STATUS_SMARTCARD_NO_CERTIFICATE, -EIO,
+ "STATUS_SMARTCARD_NO_CERTIFICATE"},
+ {STATUS_SMARTCARD_NO_KEYSET, -EIO, "STATUS_SMARTCARD_NO_KEYSET"},
+ {STATUS_SMARTCARD_IO_ERROR, -EIO, "STATUS_SMARTCARD_IO_ERROR"},
+ {STATUS_DOWNGRADE_DETECTED, -EIO, "STATUS_DOWNGRADE_DETECTED"},
+ {STATUS_SMARTCARD_CERT_REVOKED, -EIO, "STATUS_SMARTCARD_CERT_REVOKED"},
+ {STATUS_ISSUING_CA_UNTRUSTED, -EIO, "STATUS_ISSUING_CA_UNTRUSTED"},
+ {STATUS_REVOCATION_OFFLINE_C, -EIO, "STATUS_REVOCATION_OFFLINE_C"},
+ {STATUS_PKINIT_CLIENT_FAILURE, -EIO, "STATUS_PKINIT_CLIENT_FAILURE"},
+ {STATUS_SMARTCARD_CERT_EXPIRED, -EIO, "STATUS_SMARTCARD_CERT_EXPIRED"},
+ {STATUS_DRIVER_FAILED_PRIOR_UNLOAD, -EIO,
+ "STATUS_DRIVER_FAILED_PRIOR_UNLOAD"},
+ {STATUS_SMARTCARD_SILENT_CONTEXT, -EIO,
+ "STATUS_SMARTCARD_SILENT_CONTEXT"},
+ {STATUS_PER_USER_TRUST_QUOTA_EXCEEDED, -EDQUOT,
+ "STATUS_PER_USER_TRUST_QUOTA_EXCEEDED"},
+ {STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED, -EDQUOT,
+ "STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED"},
+ {STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED, -EDQUOT,
+ "STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED"},
+ {STATUS_DS_NAME_NOT_UNIQUE, -EIO, "STATUS_DS_NAME_NOT_UNIQUE"},
+ {STATUS_DS_DUPLICATE_ID_FOUND, -EIO, "STATUS_DS_DUPLICATE_ID_FOUND"},
+ {STATUS_DS_GROUP_CONVERSION_ERROR, -EIO,
+ "STATUS_DS_GROUP_CONVERSION_ERROR"},
+ {STATUS_VOLSNAP_PREPARE_HIBERNATE, -EIO,
+ "STATUS_VOLSNAP_PREPARE_HIBERNATE"},
+ {STATUS_USER2USER_REQUIRED, -EIO, "STATUS_USER2USER_REQUIRED"},
+ {STATUS_STACK_BUFFER_OVERRUN, -EIO, "STATUS_STACK_BUFFER_OVERRUN"},
+ {STATUS_NO_S4U_PROT_SUPPORT, -EIO, "STATUS_NO_S4U_PROT_SUPPORT"},
+ {STATUS_CROSSREALM_DELEGATION_FAILURE, -EIO,
+ "STATUS_CROSSREALM_DELEGATION_FAILURE"},
+ {STATUS_REVOCATION_OFFLINE_KDC, -EIO, "STATUS_REVOCATION_OFFLINE_KDC"},
+ {STATUS_ISSUING_CA_UNTRUSTED_KDC, -EIO,
+ "STATUS_ISSUING_CA_UNTRUSTED_KDC"},
+ {STATUS_KDC_CERT_EXPIRED, -EIO, "STATUS_KDC_CERT_EXPIRED"},
+ {STATUS_KDC_CERT_REVOKED, -EIO, "STATUS_KDC_CERT_REVOKED"},
+ {STATUS_PARAMETER_QUOTA_EXCEEDED, -EDQUOT,
+ "STATUS_PARAMETER_QUOTA_EXCEEDED"},
+ {STATUS_HIBERNATION_FAILURE, -EIO, "STATUS_HIBERNATION_FAILURE"},
+ {STATUS_DELAY_LOAD_FAILED, -EIO, "STATUS_DELAY_LOAD_FAILED"},
+ {STATUS_AUTHENTICATION_FIREWALL_FAILED, -EIO,
+ "STATUS_AUTHENTICATION_FIREWALL_FAILED"},
+ {STATUS_VDM_DISALLOWED, -EIO, "STATUS_VDM_DISALLOWED"},
+ {STATUS_HUNG_DISPLAY_DRIVER_THREAD, -EIO,
+ "STATUS_HUNG_DISPLAY_DRIVER_THREAD"},
+ {STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE, -EIO,
+ "STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE"},
+ {STATUS_INVALID_CRUNTIME_PARAMETER, -EIO,
+ "STATUS_INVALID_CRUNTIME_PARAMETER"},
+ {STATUS_NTLM_BLOCKED, -EIO, "STATUS_NTLM_BLOCKED"},
+ {STATUS_ASSERTION_FAILURE, -EIO, "STATUS_ASSERTION_FAILURE"},
+ {STATUS_VERIFIER_STOP, -EIO, "STATUS_VERIFIER_STOP"},
+ {STATUS_CALLBACK_POP_STACK, -EIO, "STATUS_CALLBACK_POP_STACK"},
+ {STATUS_INCOMPATIBLE_DRIVER_BLOCKED, -EIO,
+ "STATUS_INCOMPATIBLE_DRIVER_BLOCKED"},
+ {STATUS_HIVE_UNLOADED, -EIO, "STATUS_HIVE_UNLOADED"},
+ {STATUS_COMPRESSION_DISABLED, -EIO, "STATUS_COMPRESSION_DISABLED"},
+ {STATUS_FILE_SYSTEM_LIMITATION, -EIO, "STATUS_FILE_SYSTEM_LIMITATION"},
+ {STATUS_INVALID_IMAGE_HASH, -EIO, "STATUS_INVALID_IMAGE_HASH"},
+ {STATUS_NOT_CAPABLE, -EIO, "STATUS_NOT_CAPABLE"},
+ {STATUS_REQUEST_OUT_OF_SEQUENCE, -EIO,
+ "STATUS_REQUEST_OUT_OF_SEQUENCE"},
+ {STATUS_IMPLEMENTATION_LIMIT, -EIO, "STATUS_IMPLEMENTATION_LIMIT"},
+ {STATUS_ELEVATION_REQUIRED, -EIO, "STATUS_ELEVATION_REQUIRED"},
+ {STATUS_BEYOND_VDL, -EIO, "STATUS_BEYOND_VDL"},
+ {STATUS_ENCOUNTERED_WRITE_IN_PROGRESS, -EIO,
+ "STATUS_ENCOUNTERED_WRITE_IN_PROGRESS"},
+ {STATUS_PTE_CHANGED, -EIO, "STATUS_PTE_CHANGED"},
+ {STATUS_PURGE_FAILED, -EIO, "STATUS_PURGE_FAILED"},
+ {STATUS_CRED_REQUIRES_CONFIRMATION, -EIO,
+ "STATUS_CRED_REQUIRES_CONFIRMATION"},
+ {STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE, -EIO,
+ "STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE"},
+ {STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER, -EIO,
+ "STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER"},
+ {STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE, -EIO,
+ "STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE"},
+ {STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE, -EIO,
+ "STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE"},
+ {STATUS_CS_ENCRYPTION_FILE_NOT_CSE, -EIO,
+ "STATUS_CS_ENCRYPTION_FILE_NOT_CSE"},
+ {STATUS_INVALID_LABEL, -EIO, "STATUS_INVALID_LABEL"},
+ {STATUS_DRIVER_PROCESS_TERMINATED, -EIO,
+ "STATUS_DRIVER_PROCESS_TERMINATED"},
+ {STATUS_AMBIGUOUS_SYSTEM_DEVICE, -EIO,
+ "STATUS_AMBIGUOUS_SYSTEM_DEVICE"},
+ {STATUS_SYSTEM_DEVICE_NOT_FOUND, -EIO,
+ "STATUS_SYSTEM_DEVICE_NOT_FOUND"},
+ {STATUS_RESTART_BOOT_APPLICATION, -EIO,
+ "STATUS_RESTART_BOOT_APPLICATION"},
+ {STATUS_INVALID_TASK_NAME, -EIO, "STATUS_INVALID_TASK_NAME"},
+ {STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"},
+ {STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"},
+ {STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"},
+ {STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"},
+ {STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"},
+ {STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"},
+ {STATUS_REQUEST_CANCELED, -EIO, "STATUS_REQUEST_CANCELED"},
+ {STATUS_RECURSIVE_DISPATCH, -EIO, "STATUS_RECURSIVE_DISPATCH"},
+ {STATUS_LPC_RECEIVE_BUFFER_EXPECTED, -EIO,
+ "STATUS_LPC_RECEIVE_BUFFER_EXPECTED"},
+ {STATUS_LPC_INVALID_CONNECTION_USAGE, -EIO,
+ "STATUS_LPC_INVALID_CONNECTION_USAGE"},
+ {STATUS_LPC_REQUESTS_NOT_ALLOWED, -EIO,
+ "STATUS_LPC_REQUESTS_NOT_ALLOWED"},
+ {STATUS_RESOURCE_IN_USE, -EIO, "STATUS_RESOURCE_IN_USE"},
+ {STATUS_HARDWARE_MEMORY_ERROR, -EIO, "STATUS_HARDWARE_MEMORY_ERROR"},
+ {STATUS_THREADPOOL_HANDLE_EXCEPTION, -EIO,
+ "STATUS_THREADPOOL_HANDLE_EXCEPTION"},
+ {STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED, -EIO,
+ "STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED"},
+ {STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED, -EIO,
+ "STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED"},
+ {STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED, -EIO,
+ "STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED"},
+ {STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED, -EIO,
+ "STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED"},
+ {STATUS_THREADPOOL_RELEASED_DURING_OPERATION, -EIO,
+ "STATUS_THREADPOOL_RELEASED_DURING_OPERATION"},
+ {STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING, -EIO,
+ "STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING"},
+ {STATUS_APC_RETURNED_WHILE_IMPERSONATING, -EIO,
+ "STATUS_APC_RETURNED_WHILE_IMPERSONATING"},
+ {STATUS_PROCESS_IS_PROTECTED, -EIO, "STATUS_PROCESS_IS_PROTECTED"},
+ {STATUS_MCA_EXCEPTION, -EIO, "STATUS_MCA_EXCEPTION"},
+ {STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE, -EIO,
+ "STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE"},
+ {STATUS_SYMLINK_CLASS_DISABLED, -EIO, "STATUS_SYMLINK_CLASS_DISABLED"},
+ {STATUS_INVALID_IDN_NORMALIZATION, -EIO,
+ "STATUS_INVALID_IDN_NORMALIZATION"},
+ {STATUS_NO_UNICODE_TRANSLATION, -EIO, "STATUS_NO_UNICODE_TRANSLATION"},
+ {STATUS_ALREADY_REGISTERED, -EIO, "STATUS_ALREADY_REGISTERED"},
+ {STATUS_CONTEXT_MISMATCH, -EIO, "STATUS_CONTEXT_MISMATCH"},
+ {STATUS_PORT_ALREADY_HAS_COMPLETION_LIST, -EIO,
+ "STATUS_PORT_ALREADY_HAS_COMPLETION_LIST"},
+ {STATUS_CALLBACK_RETURNED_THREAD_PRIORITY, -EIO,
+ "STATUS_CALLBACK_RETURNED_THREAD_PRIORITY"},
+ {STATUS_INVALID_THREAD, -EIO, "STATUS_INVALID_THREAD"},
+ {STATUS_CALLBACK_RETURNED_TRANSACTION, -EIO,
+ "STATUS_CALLBACK_RETURNED_TRANSACTION"},
+ {STATUS_CALLBACK_RETURNED_LDR_LOCK, -EIO,
+ "STATUS_CALLBACK_RETURNED_LDR_LOCK"},
+ {STATUS_CALLBACK_RETURNED_LANG, -EIO, "STATUS_CALLBACK_RETURNED_LANG"},
+ {STATUS_CALLBACK_RETURNED_PRI_BACK, -EIO,
+ "STATUS_CALLBACK_RETURNED_PRI_BACK"},
+ {STATUS_CALLBACK_RETURNED_THREAD_AFFINITY, -EIO,
+ "STATUS_CALLBACK_RETURNED_THREAD_AFFINITY"},
+ {STATUS_DISK_REPAIR_DISABLED, -EIO, "STATUS_DISK_REPAIR_DISABLED"},
+ {STATUS_DS_DOMAIN_RENAME_IN_PROGRESS, -EIO,
+ "STATUS_DS_DOMAIN_RENAME_IN_PROGRESS"},
+ {STATUS_DISK_QUOTA_EXCEEDED, -EDQUOT, "STATUS_DISK_QUOTA_EXCEEDED"},
+ {STATUS_CONTENT_BLOCKED, -EIO, "STATUS_CONTENT_BLOCKED"},
+ {STATUS_BAD_CLUSTERS, -EIO, "STATUS_BAD_CLUSTERS"},
+ {STATUS_VOLUME_DIRTY, -EIO, "STATUS_VOLUME_DIRTY"},
+ {STATUS_FILE_CHECKED_OUT, -EIO, "STATUS_FILE_CHECKED_OUT"},
+ {STATUS_CHECKOUT_REQUIRED, -EIO, "STATUS_CHECKOUT_REQUIRED"},
+ {STATUS_BAD_FILE_TYPE, -EIO, "STATUS_BAD_FILE_TYPE"},
+ {STATUS_FILE_TOO_LARGE, -EIO, "STATUS_FILE_TOO_LARGE"},
+ {STATUS_FORMS_AUTH_REQUIRED, -EIO, "STATUS_FORMS_AUTH_REQUIRED"},
+ {STATUS_VIRUS_INFECTED, -EIO, "STATUS_VIRUS_INFECTED"},
+ {STATUS_VIRUS_DELETED, -EIO, "STATUS_VIRUS_DELETED"},
+ {STATUS_BAD_MCFG_TABLE, -EIO, "STATUS_BAD_MCFG_TABLE"},
+ {STATUS_WOW_ASSERTION, -EIO, "STATUS_WOW_ASSERTION"},
+ {STATUS_INVALID_SIGNATURE, -EIO, "STATUS_INVALID_SIGNATURE"},
+ {STATUS_HMAC_NOT_SUPPORTED, -EIO, "STATUS_HMAC_NOT_SUPPORTED"},
+ {STATUS_IPSEC_QUEUE_OVERFLOW, -EIO, "STATUS_IPSEC_QUEUE_OVERFLOW"},
+ {STATUS_ND_QUEUE_OVERFLOW, -EIO, "STATUS_ND_QUEUE_OVERFLOW"},
+ {STATUS_HOPLIMIT_EXCEEDED, -EIO, "STATUS_HOPLIMIT_EXCEEDED"},
+ {STATUS_PROTOCOL_NOT_SUPPORTED, -EOPNOTSUPP,
+ "STATUS_PROTOCOL_NOT_SUPPORTED"},
+ {STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED, -EIO,
+ "STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED"},
+ {STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR, -EIO,
+ "STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR"},
+ {STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR, -EIO,
+ "STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR"},
+ {STATUS_XML_PARSE_ERROR, -EIO, "STATUS_XML_PARSE_ERROR"},
+ {STATUS_XMLDSIG_ERROR, -EIO, "STATUS_XMLDSIG_ERROR"},
+ {STATUS_WRONG_COMPARTMENT, -EIO, "STATUS_WRONG_COMPARTMENT"},
+ {STATUS_AUTHIP_FAILURE, -EIO, "STATUS_AUTHIP_FAILURE"},
+ {DBG_NO_STATE_CHANGE, -EIO, "DBG_NO_STATE_CHANGE"},
+ {DBG_APP_NOT_IDLE, -EIO, "DBG_APP_NOT_IDLE"},
+ {RPC_NT_INVALID_STRING_BINDING, -EIO, "RPC_NT_INVALID_STRING_BINDING"},
+ {RPC_NT_WRONG_KIND_OF_BINDING, -EIO, "RPC_NT_WRONG_KIND_OF_BINDING"},
+ {RPC_NT_INVALID_BINDING, -EIO, "RPC_NT_INVALID_BINDING"},
+ {RPC_NT_PROTSEQ_NOT_SUPPORTED, -EOPNOTSUPP,
+ "RPC_NT_PROTSEQ_NOT_SUPPORTED"},
+ {RPC_NT_INVALID_RPC_PROTSEQ, -EIO, "RPC_NT_INVALID_RPC_PROTSEQ"},
+ {RPC_NT_INVALID_STRING_UUID, -EIO, "RPC_NT_INVALID_STRING_UUID"},
+ {RPC_NT_INVALID_ENDPOINT_FORMAT, -EIO,
+ "RPC_NT_INVALID_ENDPOINT_FORMAT"},
+ {RPC_NT_INVALID_NET_ADDR, -EIO, "RPC_NT_INVALID_NET_ADDR"},
+ {RPC_NT_NO_ENDPOINT_FOUND, -EIO, "RPC_NT_NO_ENDPOINT_FOUND"},
+ {RPC_NT_INVALID_TIMEOUT, -EINVAL, "RPC_NT_INVALID_TIMEOUT"},
+ {RPC_NT_OBJECT_NOT_FOUND, -ENOENT, "RPC_NT_OBJECT_NOT_FOUND"},
+ {RPC_NT_ALREADY_REGISTERED, -EIO, "RPC_NT_ALREADY_REGISTERED"},
+ {RPC_NT_TYPE_ALREADY_REGISTERED, -EIO,
+ "RPC_NT_TYPE_ALREADY_REGISTERED"},
+ {RPC_NT_ALREADY_LISTENING, -EIO, "RPC_NT_ALREADY_LISTENING"},
+ {RPC_NT_NO_PROTSEQS_REGISTERED, -EIO, "RPC_NT_NO_PROTSEQS_REGISTERED"},
+ {RPC_NT_NOT_LISTENING, -EIO, "RPC_NT_NOT_LISTENING"},
+ {RPC_NT_UNKNOWN_MGR_TYPE, -EIO, "RPC_NT_UNKNOWN_MGR_TYPE"},
+ {RPC_NT_UNKNOWN_IF, -EIO, "RPC_NT_UNKNOWN_IF"},
+ {RPC_NT_NO_BINDINGS, -EIO, "RPC_NT_NO_BINDINGS"},
+ {RPC_NT_NO_PROTSEQS, -EIO, "RPC_NT_NO_PROTSEQS"},
+ {RPC_NT_CANT_CREATE_ENDPOINT, -EIO, "RPC_NT_CANT_CREATE_ENDPOINT"},
+ {RPC_NT_OUT_OF_RESOURCES, -EIO, "RPC_NT_OUT_OF_RESOURCES"},
+ {RPC_NT_SERVER_UNAVAILABLE, -EIO, "RPC_NT_SERVER_UNAVAILABLE"},
+ {RPC_NT_SERVER_TOO_BUSY, -EBUSY, "RPC_NT_SERVER_TOO_BUSY"},
+ {RPC_NT_INVALID_NETWORK_OPTIONS, -EIO,
+ "RPC_NT_INVALID_NETWORK_OPTIONS"},
+ {RPC_NT_NO_CALL_ACTIVE, -EIO, "RPC_NT_NO_CALL_ACTIVE"},
+ {RPC_NT_CALL_FAILED, -EIO, "RPC_NT_CALL_FAILED"},
+ {RPC_NT_CALL_FAILED_DNE, -EIO, "RPC_NT_CALL_FAILED_DNE"},
+ {RPC_NT_PROTOCOL_ERROR, -EIO, "RPC_NT_PROTOCOL_ERROR"},
+ {RPC_NT_UNSUPPORTED_TRANS_SYN, -EIO, "RPC_NT_UNSUPPORTED_TRANS_SYN"},
+ {RPC_NT_UNSUPPORTED_TYPE, -EIO, "RPC_NT_UNSUPPORTED_TYPE"},
+ {RPC_NT_INVALID_TAG, -EIO, "RPC_NT_INVALID_TAG"},
+ {RPC_NT_INVALID_BOUND, -EIO, "RPC_NT_INVALID_BOUND"},
+ {RPC_NT_NO_ENTRY_NAME, -EIO, "RPC_NT_NO_ENTRY_NAME"},
+ {RPC_NT_INVALID_NAME_SYNTAX, -EIO, "RPC_NT_INVALID_NAME_SYNTAX"},
+ {RPC_NT_UNSUPPORTED_NAME_SYNTAX, -EIO,
+ "RPC_NT_UNSUPPORTED_NAME_SYNTAX"},
+ {RPC_NT_UUID_NO_ADDRESS, -EIO, "RPC_NT_UUID_NO_ADDRESS"},
+ {RPC_NT_DUPLICATE_ENDPOINT, -ENOTUNIQ, "RPC_NT_DUPLICATE_ENDPOINT"},
+ {RPC_NT_UNKNOWN_AUTHN_TYPE, -EIO, "RPC_NT_UNKNOWN_AUTHN_TYPE"},
+ {RPC_NT_MAX_CALLS_TOO_SMALL, -EIO, "RPC_NT_MAX_CALLS_TOO_SMALL"},
+ {RPC_NT_STRING_TOO_LONG, -EIO, "RPC_NT_STRING_TOO_LONG"},
+ {RPC_NT_PROTSEQ_NOT_FOUND, -EIO, "RPC_NT_PROTSEQ_NOT_FOUND"},
+ {RPC_NT_PROCNUM_OUT_OF_RANGE, -EIO, "RPC_NT_PROCNUM_OUT_OF_RANGE"},
+ {RPC_NT_BINDING_HAS_NO_AUTH, -EIO, "RPC_NT_BINDING_HAS_NO_AUTH"},
+ {RPC_NT_UNKNOWN_AUTHN_SERVICE, -EIO, "RPC_NT_UNKNOWN_AUTHN_SERVICE"},
+ {RPC_NT_UNKNOWN_AUTHN_LEVEL, -EIO, "RPC_NT_UNKNOWN_AUTHN_LEVEL"},
+ {RPC_NT_INVALID_AUTH_IDENTITY, -EIO, "RPC_NT_INVALID_AUTH_IDENTITY"},
+ {RPC_NT_UNKNOWN_AUTHZ_SERVICE, -EIO, "RPC_NT_UNKNOWN_AUTHZ_SERVICE"},
+ {EPT_NT_INVALID_ENTRY, -EIO, "EPT_NT_INVALID_ENTRY"},
+ {EPT_NT_CANT_PERFORM_OP, -EIO, "EPT_NT_CANT_PERFORM_OP"},
+ {EPT_NT_NOT_REGISTERED, -EIO, "EPT_NT_NOT_REGISTERED"},
+ {RPC_NT_NOTHING_TO_EXPORT, -EIO, "RPC_NT_NOTHING_TO_EXPORT"},
+ {RPC_NT_INCOMPLETE_NAME, -EIO, "RPC_NT_INCOMPLETE_NAME"},
+ {RPC_NT_INVALID_VERS_OPTION, -EIO, "RPC_NT_INVALID_VERS_OPTION"},
+ {RPC_NT_NO_MORE_MEMBERS, -EIO, "RPC_NT_NO_MORE_MEMBERS"},
+ {RPC_NT_NOT_ALL_OBJS_UNEXPORTED, -EIO,
+ "RPC_NT_NOT_ALL_OBJS_UNEXPORTED"},
+ {RPC_NT_INTERFACE_NOT_FOUND, -EIO, "RPC_NT_INTERFACE_NOT_FOUND"},
+ {RPC_NT_ENTRY_ALREADY_EXISTS, -EIO, "RPC_NT_ENTRY_ALREADY_EXISTS"},
+ {RPC_NT_ENTRY_NOT_FOUND, -EIO, "RPC_NT_ENTRY_NOT_FOUND"},
+ {RPC_NT_NAME_SERVICE_UNAVAILABLE, -EIO,
+ "RPC_NT_NAME_SERVICE_UNAVAILABLE"},
+ {RPC_NT_INVALID_NAF_ID, -EIO, "RPC_NT_INVALID_NAF_ID"},
+ {RPC_NT_CANNOT_SUPPORT, -EOPNOTSUPP, "RPC_NT_CANNOT_SUPPORT"},
+ {RPC_NT_NO_CONTEXT_AVAILABLE, -EIO, "RPC_NT_NO_CONTEXT_AVAILABLE"},
+ {RPC_NT_INTERNAL_ERROR, -EIO, "RPC_NT_INTERNAL_ERROR"},
+ {RPC_NT_ZERO_DIVIDE, -EIO, "RPC_NT_ZERO_DIVIDE"},
+ {RPC_NT_ADDRESS_ERROR, -EIO, "RPC_NT_ADDRESS_ERROR"},
+ {RPC_NT_FP_DIV_ZERO, -EIO, "RPC_NT_FP_DIV_ZERO"},
+ {RPC_NT_FP_UNDERFLOW, -EIO, "RPC_NT_FP_UNDERFLOW"},
+ {RPC_NT_FP_OVERFLOW, -EIO, "RPC_NT_FP_OVERFLOW"},
+ {RPC_NT_CALL_IN_PROGRESS, -EIO, "RPC_NT_CALL_IN_PROGRESS"},
+ {RPC_NT_NO_MORE_BINDINGS, -EIO, "RPC_NT_NO_MORE_BINDINGS"},
+ {RPC_NT_GROUP_MEMBER_NOT_FOUND, -EIO, "RPC_NT_GROUP_MEMBER_NOT_FOUND"},
+ {EPT_NT_CANT_CREATE, -EIO, "EPT_NT_CANT_CREATE"},
+ {RPC_NT_INVALID_OBJECT, -EIO, "RPC_NT_INVALID_OBJECT"},
+ {RPC_NT_NO_INTERFACES, -EIO, "RPC_NT_NO_INTERFACES"},
+ {RPC_NT_CALL_CANCELLED, -EIO, "RPC_NT_CALL_CANCELLED"},
+ {RPC_NT_BINDING_INCOMPLETE, -EIO, "RPC_NT_BINDING_INCOMPLETE"},
+ {RPC_NT_COMM_FAILURE, -EIO, "RPC_NT_COMM_FAILURE"},
+ {RPC_NT_UNSUPPORTED_AUTHN_LEVEL, -EIO,
+ "RPC_NT_UNSUPPORTED_AUTHN_LEVEL"},
+ {RPC_NT_NO_PRINC_NAME, -EIO, "RPC_NT_NO_PRINC_NAME"},
+ {RPC_NT_NOT_RPC_ERROR, -EIO, "RPC_NT_NOT_RPC_ERROR"},
+ {RPC_NT_SEC_PKG_ERROR, -EIO, "RPC_NT_SEC_PKG_ERROR"},
+ {RPC_NT_NOT_CANCELLED, -EIO, "RPC_NT_NOT_CANCELLED"},
+ {RPC_NT_INVALID_ASYNC_HANDLE, -EIO, "RPC_NT_INVALID_ASYNC_HANDLE"},
+ {RPC_NT_INVALID_ASYNC_CALL, -EIO, "RPC_NT_INVALID_ASYNC_CALL"},
+ {RPC_NT_PROXY_ACCESS_DENIED, -EACCES, "RPC_NT_PROXY_ACCESS_DENIED"},
+ {RPC_NT_NO_MORE_ENTRIES, -EIO, "RPC_NT_NO_MORE_ENTRIES"},
+ {RPC_NT_SS_CHAR_TRANS_OPEN_FAIL, -EIO,
+ "RPC_NT_SS_CHAR_TRANS_OPEN_FAIL"},
+ {RPC_NT_SS_CHAR_TRANS_SHORT_FILE, -EIO,
+ "RPC_NT_SS_CHAR_TRANS_SHORT_FILE"},
+ {RPC_NT_SS_IN_NULL_CONTEXT, -EIO, "RPC_NT_SS_IN_NULL_CONTEXT"},
+ {RPC_NT_SS_CONTEXT_MISMATCH, -EIO, "RPC_NT_SS_CONTEXT_MISMATCH"},
+ {RPC_NT_SS_CONTEXT_DAMAGED, -EIO, "RPC_NT_SS_CONTEXT_DAMAGED"},
+ {RPC_NT_SS_HANDLES_MISMATCH, -EIO, "RPC_NT_SS_HANDLES_MISMATCH"},
+ {RPC_NT_SS_CANNOT_GET_CALL_HANDLE, -EIO,
+ "RPC_NT_SS_CANNOT_GET_CALL_HANDLE"},
+ {RPC_NT_NULL_REF_POINTER, -EIO, "RPC_NT_NULL_REF_POINTER"},
+ {RPC_NT_ENUM_VALUE_OUT_OF_RANGE, -EIO,
+ "RPC_NT_ENUM_VALUE_OUT_OF_RANGE"},
+ {RPC_NT_BYTE_COUNT_TOO_SMALL, -EIO, "RPC_NT_BYTE_COUNT_TOO_SMALL"},
+ {RPC_NT_BAD_STUB_DATA, -EIO, "RPC_NT_BAD_STUB_DATA"},
+ {RPC_NT_INVALID_ES_ACTION, -EIO, "RPC_NT_INVALID_ES_ACTION"},
+ {RPC_NT_WRONG_ES_VERSION, -EIO, "RPC_NT_WRONG_ES_VERSION"},
+ {RPC_NT_WRONG_STUB_VERSION, -EIO, "RPC_NT_WRONG_STUB_VERSION"},
+ {RPC_NT_INVALID_PIPE_OBJECT, -EIO, "RPC_NT_INVALID_PIPE_OBJECT"},
+ {RPC_NT_INVALID_PIPE_OPERATION, -EIO, "RPC_NT_INVALID_PIPE_OPERATION"},
+ {RPC_NT_WRONG_PIPE_VERSION, -EIO, "RPC_NT_WRONG_PIPE_VERSION"},
+ {RPC_NT_PIPE_CLOSED, -EIO, "RPC_NT_PIPE_CLOSED"},
+ {RPC_NT_PIPE_DISCIPLINE_ERROR, -EIO, "RPC_NT_PIPE_DISCIPLINE_ERROR"},
+ {RPC_NT_PIPE_EMPTY, -EIO, "RPC_NT_PIPE_EMPTY"},
+ {STATUS_PNP_BAD_MPS_TABLE, -EIO, "STATUS_PNP_BAD_MPS_TABLE"},
+ {STATUS_PNP_TRANSLATION_FAILED, -EIO, "STATUS_PNP_TRANSLATION_FAILED"},
+ {STATUS_PNP_IRQ_TRANSLATION_FAILED, -EIO,
+ "STATUS_PNP_IRQ_TRANSLATION_FAILED"},
+ {STATUS_PNP_INVALID_ID, -EIO, "STATUS_PNP_INVALID_ID"},
+ {STATUS_IO_REISSUE_AS_CACHED, -EIO, "STATUS_IO_REISSUE_AS_CACHED"},
+ {STATUS_CTX_WINSTATION_NAME_INVALID, -EIO,
+ "STATUS_CTX_WINSTATION_NAME_INVALID"},
+ {STATUS_CTX_INVALID_PD, -EIO, "STATUS_CTX_INVALID_PD"},
+ {STATUS_CTX_PD_NOT_FOUND, -EIO, "STATUS_CTX_PD_NOT_FOUND"},
+ {STATUS_CTX_CLOSE_PENDING, -EIO, "STATUS_CTX_CLOSE_PENDING"},
+ {STATUS_CTX_NO_OUTBUF, -EIO, "STATUS_CTX_NO_OUTBUF"},
+ {STATUS_CTX_MODEM_INF_NOT_FOUND, -EIO,
+ "STATUS_CTX_MODEM_INF_NOT_FOUND"},
+ {STATUS_CTX_INVALID_MODEMNAME, -EIO, "STATUS_CTX_INVALID_MODEMNAME"},
+ {STATUS_CTX_RESPONSE_ERROR, -EIO, "STATUS_CTX_RESPONSE_ERROR"},
+ {STATUS_CTX_MODEM_RESPONSE_TIMEOUT, -ETIMEDOUT,
+ "STATUS_CTX_MODEM_RESPONSE_TIMEOUT"},
+ {STATUS_CTX_MODEM_RESPONSE_NO_CARRIER, -EIO,
+ "STATUS_CTX_MODEM_RESPONSE_NO_CARRIER"},
+ {STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE, -EIO,
+ "STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE"},
+ {STATUS_CTX_MODEM_RESPONSE_BUSY, -EBUSY,
+ "STATUS_CTX_MODEM_RESPONSE_BUSY"},
+ {STATUS_CTX_MODEM_RESPONSE_VOICE, -EIO,
+ "STATUS_CTX_MODEM_RESPONSE_VOICE"},
+ {STATUS_CTX_TD_ERROR, -EIO, "STATUS_CTX_TD_ERROR"},
+ {STATUS_CTX_LICENSE_CLIENT_INVALID, -EIO,
+ "STATUS_CTX_LICENSE_CLIENT_INVALID"},
+ {STATUS_CTX_LICENSE_NOT_AVAILABLE, -EIO,
+ "STATUS_CTX_LICENSE_NOT_AVAILABLE"},
+ {STATUS_CTX_LICENSE_EXPIRED, -EIO, "STATUS_CTX_LICENSE_EXPIRED"},
+ {STATUS_CTX_WINSTATION_NOT_FOUND, -EIO,
+ "STATUS_CTX_WINSTATION_NOT_FOUND"},
+ {STATUS_CTX_WINSTATION_NAME_COLLISION, -EIO,
+ "STATUS_CTX_WINSTATION_NAME_COLLISION"},
+ {STATUS_CTX_WINSTATION_BUSY, -EBUSY, "STATUS_CTX_WINSTATION_BUSY"},
+ {STATUS_CTX_BAD_VIDEO_MODE, -EIO, "STATUS_CTX_BAD_VIDEO_MODE"},
+ {STATUS_CTX_GRAPHICS_INVALID, -EIO, "STATUS_CTX_GRAPHICS_INVALID"},
+ {STATUS_CTX_NOT_CONSOLE, -EIO, "STATUS_CTX_NOT_CONSOLE"},
+ {STATUS_CTX_CLIENT_QUERY_TIMEOUT, -EIO,
+ "STATUS_CTX_CLIENT_QUERY_TIMEOUT"},
+ {STATUS_CTX_CONSOLE_DISCONNECT, -EIO, "STATUS_CTX_CONSOLE_DISCONNECT"},
+ {STATUS_CTX_CONSOLE_CONNECT, -EIO, "STATUS_CTX_CONSOLE_CONNECT"},
+ {STATUS_CTX_SHADOW_DENIED, -EIO, "STATUS_CTX_SHADOW_DENIED"},
+ {STATUS_CTX_WINSTATION_ACCESS_DENIED, -EACCES,
+ "STATUS_CTX_WINSTATION_ACCESS_DENIED"},
+ {STATUS_CTX_INVALID_WD, -EIO, "STATUS_CTX_INVALID_WD"},
+ {STATUS_CTX_WD_NOT_FOUND, -EIO, "STATUS_CTX_WD_NOT_FOUND"},
+ {STATUS_CTX_SHADOW_INVALID, -EIO, "STATUS_CTX_SHADOW_INVALID"},
+ {STATUS_CTX_SHADOW_DISABLED, -EIO, "STATUS_CTX_SHADOW_DISABLED"},
+ {STATUS_RDP_PROTOCOL_ERROR, -EIO, "STATUS_RDP_PROTOCOL_ERROR"},
+ {STATUS_CTX_CLIENT_LICENSE_NOT_SET, -EIO,
+ "STATUS_CTX_CLIENT_LICENSE_NOT_SET"},
+ {STATUS_CTX_CLIENT_LICENSE_IN_USE, -EIO,
+ "STATUS_CTX_CLIENT_LICENSE_IN_USE"},
+ {STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE, -EIO,
+ "STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE"},
+ {STATUS_CTX_SHADOW_NOT_RUNNING, -EIO, "STATUS_CTX_SHADOW_NOT_RUNNING"},
+ {STATUS_CTX_LOGON_DISABLED, -EIO, "STATUS_CTX_LOGON_DISABLED"},
+ {STATUS_CTX_SECURITY_LAYER_ERROR, -EIO,
+ "STATUS_CTX_SECURITY_LAYER_ERROR"},
+ {STATUS_TS_INCOMPATIBLE_SESSIONS, -EIO,
+ "STATUS_TS_INCOMPATIBLE_SESSIONS"},
+ {STATUS_MUI_FILE_NOT_FOUND, -EIO, "STATUS_MUI_FILE_NOT_FOUND"},
+ {STATUS_MUI_INVALID_FILE, -EIO, "STATUS_MUI_INVALID_FILE"},
+ {STATUS_MUI_INVALID_RC_CONFIG, -EIO, "STATUS_MUI_INVALID_RC_CONFIG"},
+ {STATUS_MUI_INVALID_LOCALE_NAME, -EIO,
+ "STATUS_MUI_INVALID_LOCALE_NAME"},
+ {STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME, -EIO,
+ "STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME"},
+ {STATUS_MUI_FILE_NOT_LOADED, -EIO, "STATUS_MUI_FILE_NOT_LOADED"},
+ {STATUS_RESOURCE_ENUM_USER_STOP, -EIO,
+ "STATUS_RESOURCE_ENUM_USER_STOP"},
+ {STATUS_CLUSTER_INVALID_NODE, -EIO, "STATUS_CLUSTER_INVALID_NODE"},
+ {STATUS_CLUSTER_NODE_EXISTS, -EIO, "STATUS_CLUSTER_NODE_EXISTS"},
+ {STATUS_CLUSTER_JOIN_IN_PROGRESS, -EIO,
+ "STATUS_CLUSTER_JOIN_IN_PROGRESS"},
+ {STATUS_CLUSTER_NODE_NOT_FOUND, -EIO, "STATUS_CLUSTER_NODE_NOT_FOUND"},
+ {STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND, -EIO,
+ "STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND"},
+ {STATUS_CLUSTER_NETWORK_EXISTS, -EIO, "STATUS_CLUSTER_NETWORK_EXISTS"},
+ {STATUS_CLUSTER_NETWORK_NOT_FOUND, -EIO,
+ "STATUS_CLUSTER_NETWORK_NOT_FOUND"},
+ {STATUS_CLUSTER_NETINTERFACE_EXISTS, -EIO,
+ "STATUS_CLUSTER_NETINTERFACE_EXISTS"},
+ {STATUS_CLUSTER_NETINTERFACE_NOT_FOUND, -EIO,
+ "STATUS_CLUSTER_NETINTERFACE_NOT_FOUND"},
+ {STATUS_CLUSTER_INVALID_REQUEST, -EIO,
+ "STATUS_CLUSTER_INVALID_REQUEST"},
+ {STATUS_CLUSTER_INVALID_NETWORK_PROVIDER, -EIO,
+ "STATUS_CLUSTER_INVALID_NETWORK_PROVIDER"},
+ {STATUS_CLUSTER_NODE_DOWN, -EIO, "STATUS_CLUSTER_NODE_DOWN"},
+ {STATUS_CLUSTER_NODE_UNREACHABLE, -EIO,
+ "STATUS_CLUSTER_NODE_UNREACHABLE"},
+ {STATUS_CLUSTER_NODE_NOT_MEMBER, -EIO,
+ "STATUS_CLUSTER_NODE_NOT_MEMBER"},
+ {STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS, -EIO,
+ "STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS"},
+ {STATUS_CLUSTER_INVALID_NETWORK, -EIO,
+ "STATUS_CLUSTER_INVALID_NETWORK"},
+ {STATUS_CLUSTER_NO_NET_ADAPTERS, -EIO,
+ "STATUS_CLUSTER_NO_NET_ADAPTERS"},
+ {STATUS_CLUSTER_NODE_UP, -EIO, "STATUS_CLUSTER_NODE_UP"},
+ {STATUS_CLUSTER_NODE_PAUSED, -EIO, "STATUS_CLUSTER_NODE_PAUSED"},
+ {STATUS_CLUSTER_NODE_NOT_PAUSED, -EIO,
+ "STATUS_CLUSTER_NODE_NOT_PAUSED"},
+ {STATUS_CLUSTER_NO_SECURITY_CONTEXT, -EIO,
+ "STATUS_CLUSTER_NO_SECURITY_CONTEXT"},
+ {STATUS_CLUSTER_NETWORK_NOT_INTERNAL, -EIO,
+ "STATUS_CLUSTER_NETWORK_NOT_INTERNAL"},
+ {STATUS_CLUSTER_POISONED, -EIO, "STATUS_CLUSTER_POISONED"},
+ {STATUS_ACPI_INVALID_OPCODE, -EIO, "STATUS_ACPI_INVALID_OPCODE"},
+ {STATUS_ACPI_STACK_OVERFLOW, -EIO, "STATUS_ACPI_STACK_OVERFLOW"},
+ {STATUS_ACPI_ASSERT_FAILED, -EIO, "STATUS_ACPI_ASSERT_FAILED"},
+ {STATUS_ACPI_INVALID_INDEX, -EIO, "STATUS_ACPI_INVALID_INDEX"},
+ {STATUS_ACPI_INVALID_ARGUMENT, -EIO, "STATUS_ACPI_INVALID_ARGUMENT"},
+ {STATUS_ACPI_FATAL, -EIO, "STATUS_ACPI_FATAL"},
+ {STATUS_ACPI_INVALID_SUPERNAME, -EIO, "STATUS_ACPI_INVALID_SUPERNAME"},
+ {STATUS_ACPI_INVALID_ARGTYPE, -EIO, "STATUS_ACPI_INVALID_ARGTYPE"},
+ {STATUS_ACPI_INVALID_OBJTYPE, -EIO, "STATUS_ACPI_INVALID_OBJTYPE"},
+ {STATUS_ACPI_INVALID_TARGETTYPE, -EIO,
+ "STATUS_ACPI_INVALID_TARGETTYPE"},
+ {STATUS_ACPI_INCORRECT_ARGUMENT_COUNT, -EIO,
+ "STATUS_ACPI_INCORRECT_ARGUMENT_COUNT"},
+ {STATUS_ACPI_ADDRESS_NOT_MAPPED, -EIO,
+ "STATUS_ACPI_ADDRESS_NOT_MAPPED"},
+ {STATUS_ACPI_INVALID_EVENTTYPE, -EIO, "STATUS_ACPI_INVALID_EVENTTYPE"},
+ {STATUS_ACPI_HANDLER_COLLISION, -EIO, "STATUS_ACPI_HANDLER_COLLISION"},
+ {STATUS_ACPI_INVALID_DATA, -EIO, "STATUS_ACPI_INVALID_DATA"},
+ {STATUS_ACPI_INVALID_REGION, -EIO, "STATUS_ACPI_INVALID_REGION"},
+ {STATUS_ACPI_INVALID_ACCESS_SIZE, -EIO,
+ "STATUS_ACPI_INVALID_ACCESS_SIZE"},
+ {STATUS_ACPI_ACQUIRE_GLOBAL_LOCK, -EIO,
+ "STATUS_ACPI_ACQUIRE_GLOBAL_LOCK"},
+ {STATUS_ACPI_ALREADY_INITIALIZED, -EIO,
+ "STATUS_ACPI_ALREADY_INITIALIZED"},
+ {STATUS_ACPI_NOT_INITIALIZED, -EIO, "STATUS_ACPI_NOT_INITIALIZED"},
+ {STATUS_ACPI_INVALID_MUTEX_LEVEL, -EIO,
+ "STATUS_ACPI_INVALID_MUTEX_LEVEL"},
+ {STATUS_ACPI_MUTEX_NOT_OWNED, -EIO, "STATUS_ACPI_MUTEX_NOT_OWNED"},
+ {STATUS_ACPI_MUTEX_NOT_OWNER, -EIO, "STATUS_ACPI_MUTEX_NOT_OWNER"},
+ {STATUS_ACPI_RS_ACCESS, -EIO, "STATUS_ACPI_RS_ACCESS"},
+ {STATUS_ACPI_INVALID_TABLE, -EIO, "STATUS_ACPI_INVALID_TABLE"},
+ {STATUS_ACPI_REG_HANDLER_FAILED, -EIO,
+ "STATUS_ACPI_REG_HANDLER_FAILED"},
+ {STATUS_ACPI_POWER_REQUEST_FAILED, -EIO,
+ "STATUS_ACPI_POWER_REQUEST_FAILED"},
+ {STATUS_SXS_SECTION_NOT_FOUND, -EIO, "STATUS_SXS_SECTION_NOT_FOUND"},
+ {STATUS_SXS_CANT_GEN_ACTCTX, -EIO, "STATUS_SXS_CANT_GEN_ACTCTX"},
+ {STATUS_SXS_INVALID_ACTCTXDATA_FORMAT, -EIO,
+ "STATUS_SXS_INVALID_ACTCTXDATA_FORMAT"},
+ {STATUS_SXS_ASSEMBLY_NOT_FOUND, -EIO, "STATUS_SXS_ASSEMBLY_NOT_FOUND"},
+ {STATUS_SXS_MANIFEST_FORMAT_ERROR, -EIO,
+ "STATUS_SXS_MANIFEST_FORMAT_ERROR"},
+ {STATUS_SXS_MANIFEST_PARSE_ERROR, -EIO,
+ "STATUS_SXS_MANIFEST_PARSE_ERROR"},
+ {STATUS_SXS_ACTIVATION_CONTEXT_DISABLED, -EIO,
+ "STATUS_SXS_ACTIVATION_CONTEXT_DISABLED"},
+ {STATUS_SXS_KEY_NOT_FOUND, -EIO, "STATUS_SXS_KEY_NOT_FOUND"},
+ {STATUS_SXS_VERSION_CONFLICT, -EIO, "STATUS_SXS_VERSION_CONFLICT"},
+ {STATUS_SXS_WRONG_SECTION_TYPE, -EIO, "STATUS_SXS_WRONG_SECTION_TYPE"},
+ {STATUS_SXS_THREAD_QUERIES_DISABLED, -EIO,
+ "STATUS_SXS_THREAD_QUERIES_DISABLED"},
+ {STATUS_SXS_ASSEMBLY_MISSING, -EIO, "STATUS_SXS_ASSEMBLY_MISSING"},
+ {STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET, -EIO,
+ "STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET"},
+ {STATUS_SXS_EARLY_DEACTIVATION, -EIO, "STATUS_SXS_EARLY_DEACTIVATION"},
+ {STATUS_SXS_INVALID_DEACTIVATION, -EIO,
+ "STATUS_SXS_INVALID_DEACTIVATION"},
+ {STATUS_SXS_MULTIPLE_DEACTIVATION, -EIO,
+ "STATUS_SXS_MULTIPLE_DEACTIVATION"},
+ {STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY, -EIO,
+ "STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY"},
+ {STATUS_SXS_PROCESS_TERMINATION_REQUESTED, -EIO,
+ "STATUS_SXS_PROCESS_TERMINATION_REQUESTED"},
+ {STATUS_SXS_CORRUPT_ACTIVATION_STACK, -EIO,
+ "STATUS_SXS_CORRUPT_ACTIVATION_STACK"},
+ {STATUS_SXS_CORRUPTION, -EIO, "STATUS_SXS_CORRUPTION"},
+ {STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE, -EIO,
+ "STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE"},
+ {STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME, -EIO,
+ "STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME"},
+ {STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE, -EIO,
+ "STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE"},
+ {STATUS_SXS_IDENTITY_PARSE_ERROR, -EIO,
+ "STATUS_SXS_IDENTITY_PARSE_ERROR"},
+ {STATUS_SXS_COMPONENT_STORE_CORRUPT, -EIO,
+ "STATUS_SXS_COMPONENT_STORE_CORRUPT"},
+ {STATUS_SXS_FILE_HASH_MISMATCH, -EIO, "STATUS_SXS_FILE_HASH_MISMATCH"},
+ {STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT, -EIO,
+ "STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT"},
+ {STATUS_SXS_IDENTITIES_DIFFERENT, -EIO,
+ "STATUS_SXS_IDENTITIES_DIFFERENT"},
+ {STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT, -EIO,
+ "STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT"},
+ {STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY, -EIO,
+ "STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY"},
+ {STATUS_ADVANCED_INSTALLER_FAILED, -EIO,
+ "STATUS_ADVANCED_INSTALLER_FAILED"},
+ {STATUS_XML_ENCODING_MISMATCH, -EIO, "STATUS_XML_ENCODING_MISMATCH"},
+ {STATUS_SXS_MANIFEST_TOO_BIG, -EIO, "STATUS_SXS_MANIFEST_TOO_BIG"},
+ {STATUS_SXS_SETTING_NOT_REGISTERED, -EIO,
+ "STATUS_SXS_SETTING_NOT_REGISTERED"},
+ {STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE, -EIO,
+ "STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE"},
+ {STATUS_SMI_PRIMITIVE_INSTALLER_FAILED, -EIO,
+ "STATUS_SMI_PRIMITIVE_INSTALLER_FAILED"},
+ {STATUS_GENERIC_COMMAND_FAILED, -EIO, "STATUS_GENERIC_COMMAND_FAILED"},
+ {STATUS_SXS_FILE_HASH_MISSING, -EIO, "STATUS_SXS_FILE_HASH_MISSING"},
+ {STATUS_TRANSACTIONAL_CONFLICT, -EIO, "STATUS_TRANSACTIONAL_CONFLICT"},
+ {STATUS_INVALID_TRANSACTION, -EIO, "STATUS_INVALID_TRANSACTION"},
+ {STATUS_TRANSACTION_NOT_ACTIVE, -EIO, "STATUS_TRANSACTION_NOT_ACTIVE"},
+ {STATUS_TM_INITIALIZATION_FAILED, -EIO,
+ "STATUS_TM_INITIALIZATION_FAILED"},
+ {STATUS_RM_NOT_ACTIVE, -EIO, "STATUS_RM_NOT_ACTIVE"},
+ {STATUS_RM_METADATA_CORRUPT, -EIO, "STATUS_RM_METADATA_CORRUPT"},
+ {STATUS_TRANSACTION_NOT_JOINED, -EIO, "STATUS_TRANSACTION_NOT_JOINED"},
+ {STATUS_DIRECTORY_NOT_RM, -EIO, "STATUS_DIRECTORY_NOT_RM"},
+ {STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE, -EIO,
+ "STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE"},
+ {STATUS_LOG_RESIZE_INVALID_SIZE, -EIO,
+ "STATUS_LOG_RESIZE_INVALID_SIZE"},
+ {STATUS_REMOTE_FILE_VERSION_MISMATCH, -EIO,
+ "STATUS_REMOTE_FILE_VERSION_MISMATCH"},
+ {STATUS_CRM_PROTOCOL_ALREADY_EXISTS, -EIO,
+ "STATUS_CRM_PROTOCOL_ALREADY_EXISTS"},
+ {STATUS_TRANSACTION_PROPAGATION_FAILED, -EIO,
+ "STATUS_TRANSACTION_PROPAGATION_FAILED"},
+ {STATUS_CRM_PROTOCOL_NOT_FOUND, -EIO, "STATUS_CRM_PROTOCOL_NOT_FOUND"},
+ {STATUS_TRANSACTION_SUPERIOR_EXISTS, -EIO,
+ "STATUS_TRANSACTION_SUPERIOR_EXISTS"},
+ {STATUS_TRANSACTION_REQUEST_NOT_VALID, -EIO,
+ "STATUS_TRANSACTION_REQUEST_NOT_VALID"},
+ {STATUS_TRANSACTION_NOT_REQUESTED, -EIO,
+ "STATUS_TRANSACTION_NOT_REQUESTED"},
+ {STATUS_TRANSACTION_ALREADY_ABORTED, -EIO,
+ "STATUS_TRANSACTION_ALREADY_ABORTED"},
+ {STATUS_TRANSACTION_ALREADY_COMMITTED, -EIO,
+ "STATUS_TRANSACTION_ALREADY_COMMITTED"},
+ {STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER, -EIO,
+ "STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER"},
+ {STATUS_CURRENT_TRANSACTION_NOT_VALID, -EIO,
+ "STATUS_CURRENT_TRANSACTION_NOT_VALID"},
+ {STATUS_LOG_GROWTH_FAILED, -EIO, "STATUS_LOG_GROWTH_FAILED"},
+ {STATUS_OBJECT_NO_LONGER_EXISTS, -EIO,
+ "STATUS_OBJECT_NO_LONGER_EXISTS"},
+ {STATUS_STREAM_MINIVERSION_NOT_FOUND, -EIO,
+ "STATUS_STREAM_MINIVERSION_NOT_FOUND"},
+ {STATUS_STREAM_MINIVERSION_NOT_VALID, -EIO,
+ "STATUS_STREAM_MINIVERSION_NOT_VALID"},
+ {STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION, -EIO,
+ "STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION"},
+ {STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT, -EIO,
+ "STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT"},
+ {STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS, -EIO,
+ "STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS"},
+ {STATUS_HANDLE_NO_LONGER_VALID, -EIO, "STATUS_HANDLE_NO_LONGER_VALID"},
+ {STATUS_LOG_CORRUPTION_DETECTED, -EIO,
+ "STATUS_LOG_CORRUPTION_DETECTED"},
+ {STATUS_RM_DISCONNECTED, -EIO, "STATUS_RM_DISCONNECTED"},
+ {STATUS_ENLISTMENT_NOT_SUPERIOR, -EIO,
+ "STATUS_ENLISTMENT_NOT_SUPERIOR"},
+ {STATUS_FILE_IDENTITY_NOT_PERSISTENT, -EIO,
+ "STATUS_FILE_IDENTITY_NOT_PERSISTENT"},
+ {STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY, -EIO,
+ "STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY"},
+ {STATUS_CANT_CROSS_RM_BOUNDARY, -EIO, "STATUS_CANT_CROSS_RM_BOUNDARY"},
+ {STATUS_TXF_DIR_NOT_EMPTY, -EIO, "STATUS_TXF_DIR_NOT_EMPTY"},
+ {STATUS_INDOUBT_TRANSACTIONS_EXIST, -EIO,
+ "STATUS_INDOUBT_TRANSACTIONS_EXIST"},
+ {STATUS_TM_VOLATILE, -EIO, "STATUS_TM_VOLATILE"},
+ {STATUS_ROLLBACK_TIMER_EXPIRED, -EIO, "STATUS_ROLLBACK_TIMER_EXPIRED"},
+ {STATUS_TXF_ATTRIBUTE_CORRUPT, -EIO, "STATUS_TXF_ATTRIBUTE_CORRUPT"},
+ {STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION, -EIO,
+ "STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION"},
+ {STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED, -EIO,
+ "STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED"},
+ {STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE, -EIO,
+ "STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE"},
+ {STATUS_TRANSACTION_REQUIRED_PROMOTION, -EIO,
+ "STATUS_TRANSACTION_REQUIRED_PROMOTION"},
+ {STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION, -EIO,
+ "STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION"},
+ {STATUS_TRANSACTIONS_NOT_FROZEN, -EIO,
+ "STATUS_TRANSACTIONS_NOT_FROZEN"},
+ {STATUS_TRANSACTION_FREEZE_IN_PROGRESS, -EIO,
+ "STATUS_TRANSACTION_FREEZE_IN_PROGRESS"},
+ {STATUS_NOT_SNAPSHOT_VOLUME, -EIO, "STATUS_NOT_SNAPSHOT_VOLUME"},
+ {STATUS_NO_SAVEPOINT_WITH_OPEN_FILES, -EIO,
+ "STATUS_NO_SAVEPOINT_WITH_OPEN_FILES"},
+ {STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION, -EIO,
+ "STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION"},
+ {STATUS_TM_IDENTITY_MISMATCH, -EIO, "STATUS_TM_IDENTITY_MISMATCH"},
+ {STATUS_FLOATED_SECTION, -EIO, "STATUS_FLOATED_SECTION"},
+ {STATUS_CANNOT_ACCEPT_TRANSACTED_WORK, -EIO,
+ "STATUS_CANNOT_ACCEPT_TRANSACTED_WORK"},
+ {STATUS_CANNOT_ABORT_TRANSACTIONS, -EIO,
+ "STATUS_CANNOT_ABORT_TRANSACTIONS"},
+ {STATUS_TRANSACTION_NOT_FOUND, -EIO, "STATUS_TRANSACTION_NOT_FOUND"},
+ {STATUS_RESOURCEMANAGER_NOT_FOUND, -EIO,
+ "STATUS_RESOURCEMANAGER_NOT_FOUND"},
+ {STATUS_ENLISTMENT_NOT_FOUND, -EIO, "STATUS_ENLISTMENT_NOT_FOUND"},
+ {STATUS_TRANSACTIONMANAGER_NOT_FOUND, -EIO,
+ "STATUS_TRANSACTIONMANAGER_NOT_FOUND"},
+ {STATUS_TRANSACTIONMANAGER_NOT_ONLINE, -EIO,
+ "STATUS_TRANSACTIONMANAGER_NOT_ONLINE"},
+ {STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION, -EIO,
+ "STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION"},
+ {STATUS_TRANSACTION_NOT_ROOT, -EIO, "STATUS_TRANSACTION_NOT_ROOT"},
+ {STATUS_TRANSACTION_OBJECT_EXPIRED, -EIO,
+ "STATUS_TRANSACTION_OBJECT_EXPIRED"},
+ {STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION, -EIO,
+ "STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION"},
+ {STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED, -EIO,
+ "STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED"},
+ {STATUS_TRANSACTION_RECORD_TOO_LONG, -EIO,
+ "STATUS_TRANSACTION_RECORD_TOO_LONG"},
+ {STATUS_NO_LINK_TRACKING_IN_TRANSACTION, -EIO,
+ "STATUS_NO_LINK_TRACKING_IN_TRANSACTION"},
+ {STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION, -EOPNOTSUPP,
+ "STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION"},
+ {STATUS_TRANSACTION_INTEGRITY_VIOLATED, -EIO,
+ "STATUS_TRANSACTION_INTEGRITY_VIOLATED"},
+ {STATUS_LOG_SECTOR_INVALID, -EIO, "STATUS_LOG_SECTOR_INVALID"},
+ {STATUS_LOG_SECTOR_PARITY_INVALID, -EIO,
+ "STATUS_LOG_SECTOR_PARITY_INVALID"},
+ {STATUS_LOG_SECTOR_REMAPPED, -EIO, "STATUS_LOG_SECTOR_REMAPPED"},
+ {STATUS_LOG_BLOCK_INCOMPLETE, -EIO, "STATUS_LOG_BLOCK_INCOMPLETE"},
+ {STATUS_LOG_INVALID_RANGE, -EIO, "STATUS_LOG_INVALID_RANGE"},
+ {STATUS_LOG_BLOCKS_EXHAUSTED, -EIO, "STATUS_LOG_BLOCKS_EXHAUSTED"},
+ {STATUS_LOG_READ_CONTEXT_INVALID, -EIO,
+ "STATUS_LOG_READ_CONTEXT_INVALID"},
+ {STATUS_LOG_RESTART_INVALID, -EIO, "STATUS_LOG_RESTART_INVALID"},
+ {STATUS_LOG_BLOCK_VERSION, -EIO, "STATUS_LOG_BLOCK_VERSION"},
+ {STATUS_LOG_BLOCK_INVALID, -EIO, "STATUS_LOG_BLOCK_INVALID"},
+ {STATUS_LOG_READ_MODE_INVALID, -EIO, "STATUS_LOG_READ_MODE_INVALID"},
+ {STATUS_LOG_METADATA_CORRUPT, -EIO, "STATUS_LOG_METADATA_CORRUPT"},
+ {STATUS_LOG_METADATA_INVALID, -EIO, "STATUS_LOG_METADATA_INVALID"},
+ {STATUS_LOG_METADATA_INCONSISTENT, -EIO,
+ "STATUS_LOG_METADATA_INCONSISTENT"},
+ {STATUS_LOG_RESERVATION_INVALID, -EIO,
+ "STATUS_LOG_RESERVATION_INVALID"},
+ {STATUS_LOG_CANT_DELETE, -EIO, "STATUS_LOG_CANT_DELETE"},
+ {STATUS_LOG_CONTAINER_LIMIT_EXCEEDED, -EIO,
+ "STATUS_LOG_CONTAINER_LIMIT_EXCEEDED"},
+ {STATUS_LOG_START_OF_LOG, -EIO, "STATUS_LOG_START_OF_LOG"},
+ {STATUS_LOG_POLICY_ALREADY_INSTALLED, -EIO,
+ "STATUS_LOG_POLICY_ALREADY_INSTALLED"},
+ {STATUS_LOG_POLICY_NOT_INSTALLED, -EIO,
+ "STATUS_LOG_POLICY_NOT_INSTALLED"},
+ {STATUS_LOG_POLICY_INVALID, -EIO, "STATUS_LOG_POLICY_INVALID"},
+ {STATUS_LOG_POLICY_CONFLICT, -EIO, "STATUS_LOG_POLICY_CONFLICT"},
+ {STATUS_LOG_PINNED_ARCHIVE_TAIL, -EIO,
+ "STATUS_LOG_PINNED_ARCHIVE_TAIL"},
+ {STATUS_LOG_RECORD_NONEXISTENT, -EIO, "STATUS_LOG_RECORD_NONEXISTENT"},
+ {STATUS_LOG_RECORDS_RESERVED_INVALID, -EIO,
+ "STATUS_LOG_RECORDS_RESERVED_INVALID"},
+ {STATUS_LOG_SPACE_RESERVED_INVALID, -EIO,
+ "STATUS_LOG_SPACE_RESERVED_INVALID"},
+ {STATUS_LOG_TAIL_INVALID, -EIO, "STATUS_LOG_TAIL_INVALID"},
+ {STATUS_LOG_FULL, -EIO, "STATUS_LOG_FULL"},
+ {STATUS_LOG_MULTIPLEXED, -EIO, "STATUS_LOG_MULTIPLEXED"},
+ {STATUS_LOG_DEDICATED, -EIO, "STATUS_LOG_DEDICATED"},
+ {STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS, -EIO,
+ "STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS"},
+ {STATUS_LOG_ARCHIVE_IN_PROGRESS, -EIO,
+ "STATUS_LOG_ARCHIVE_IN_PROGRESS"},
+ {STATUS_LOG_EPHEMERAL, -EIO, "STATUS_LOG_EPHEMERAL"},
+ {STATUS_LOG_NOT_ENOUGH_CONTAINERS, -EIO,
+ "STATUS_LOG_NOT_ENOUGH_CONTAINERS"},
+ {STATUS_LOG_CLIENT_ALREADY_REGISTERED, -EIO,
+ "STATUS_LOG_CLIENT_ALREADY_REGISTERED"},
+ {STATUS_LOG_CLIENT_NOT_REGISTERED, -EIO,
+ "STATUS_LOG_CLIENT_NOT_REGISTERED"},
+ {STATUS_LOG_FULL_HANDLER_IN_PROGRESS, -EIO,
+ "STATUS_LOG_FULL_HANDLER_IN_PROGRESS"},
+ {STATUS_LOG_CONTAINER_READ_FAILED, -EIO,
+ "STATUS_LOG_CONTAINER_READ_FAILED"},
+ {STATUS_LOG_CONTAINER_WRITE_FAILED, -EIO,
+ "STATUS_LOG_CONTAINER_WRITE_FAILED"},
+ {STATUS_LOG_CONTAINER_OPEN_FAILED, -EIO,
+ "STATUS_LOG_CONTAINER_OPEN_FAILED"},
+ {STATUS_LOG_CONTAINER_STATE_INVALID, -EIO,
+ "STATUS_LOG_CONTAINER_STATE_INVALID"},
+ {STATUS_LOG_STATE_INVALID, -EIO, "STATUS_LOG_STATE_INVALID"},
+ {STATUS_LOG_PINNED, -EIO, "STATUS_LOG_PINNED"},
+ {STATUS_LOG_METADATA_FLUSH_FAILED, -EIO,
+ "STATUS_LOG_METADATA_FLUSH_FAILED"},
+ {STATUS_LOG_INCONSISTENT_SECURITY, -EIO,
+ "STATUS_LOG_INCONSISTENT_SECURITY"},
+ {STATUS_LOG_APPENDED_FLUSH_FAILED, -EIO,
+ "STATUS_LOG_APPENDED_FLUSH_FAILED"},
+ {STATUS_LOG_PINNED_RESERVATION, -EIO, "STATUS_LOG_PINNED_RESERVATION"},
+ {STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD, -EIO,
+ "STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD"},
+ {STATUS_FLT_NO_HANDLER_DEFINED, -EIO, "STATUS_FLT_NO_HANDLER_DEFINED"},
+ {STATUS_FLT_CONTEXT_ALREADY_DEFINED, -EIO,
+ "STATUS_FLT_CONTEXT_ALREADY_DEFINED"},
+ {STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST, -EIO,
+ "STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST"},
+ {STATUS_FLT_DISALLOW_FAST_IO, -EIO, "STATUS_FLT_DISALLOW_FAST_IO"},
+ {STATUS_FLT_INVALID_NAME_REQUEST, -EIO,
+ "STATUS_FLT_INVALID_NAME_REQUEST"},
+ {STATUS_FLT_NOT_SAFE_TO_POST_OPERATION, -EIO,
+ "STATUS_FLT_NOT_SAFE_TO_POST_OPERATION"},
+ {STATUS_FLT_NOT_INITIALIZED, -EIO, "STATUS_FLT_NOT_INITIALIZED"},
+ {STATUS_FLT_FILTER_NOT_READY, -EIO, "STATUS_FLT_FILTER_NOT_READY"},
+ {STATUS_FLT_POST_OPERATION_CLEANUP, -EIO,
+ "STATUS_FLT_POST_OPERATION_CLEANUP"},
+ {STATUS_FLT_INTERNAL_ERROR, -EIO, "STATUS_FLT_INTERNAL_ERROR"},
+ {STATUS_FLT_DELETING_OBJECT, -EIO, "STATUS_FLT_DELETING_OBJECT"},
+ {STATUS_FLT_MUST_BE_NONPAGED_POOL, -EIO,
+ "STATUS_FLT_MUST_BE_NONPAGED_POOL"},
+ {STATUS_FLT_DUPLICATE_ENTRY, -EIO, "STATUS_FLT_DUPLICATE_ENTRY"},
+ {STATUS_FLT_CBDQ_DISABLED, -EIO, "STATUS_FLT_CBDQ_DISABLED"},
+ {STATUS_FLT_DO_NOT_ATTACH, -EIO, "STATUS_FLT_DO_NOT_ATTACH"},
+ {STATUS_FLT_DO_NOT_DETACH, -EIO, "STATUS_FLT_DO_NOT_DETACH"},
+ {STATUS_FLT_INSTANCE_ALTITUDE_COLLISION, -EIO,
+ "STATUS_FLT_INSTANCE_ALTITUDE_COLLISION"},
+ {STATUS_FLT_INSTANCE_NAME_COLLISION, -EIO,
+ "STATUS_FLT_INSTANCE_NAME_COLLISION"},
+ {STATUS_FLT_FILTER_NOT_FOUND, -EIO, "STATUS_FLT_FILTER_NOT_FOUND"},
+ {STATUS_FLT_VOLUME_NOT_FOUND, -EIO, "STATUS_FLT_VOLUME_NOT_FOUND"},
+ {STATUS_FLT_INSTANCE_NOT_FOUND, -EIO, "STATUS_FLT_INSTANCE_NOT_FOUND"},
+ {STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND, -EIO,
+ "STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND"},
+ {STATUS_FLT_INVALID_CONTEXT_REGISTRATION, -EIO,
+ "STATUS_FLT_INVALID_CONTEXT_REGISTRATION"},
+ {STATUS_FLT_NAME_CACHE_MISS, -EIO, "STATUS_FLT_NAME_CACHE_MISS"},
+ {STATUS_FLT_NO_DEVICE_OBJECT, -EIO, "STATUS_FLT_NO_DEVICE_OBJECT"},
+ {STATUS_FLT_VOLUME_ALREADY_MOUNTED, -EIO,
+ "STATUS_FLT_VOLUME_ALREADY_MOUNTED"},
+ {STATUS_FLT_ALREADY_ENLISTED, -EIO, "STATUS_FLT_ALREADY_ENLISTED"},
+ {STATUS_FLT_CONTEXT_ALREADY_LINKED, -EIO,
+ "STATUS_FLT_CONTEXT_ALREADY_LINKED"},
+ {STATUS_FLT_NO_WAITER_FOR_REPLY, -EIO,
+ "STATUS_FLT_NO_WAITER_FOR_REPLY"},
+ {STATUS_MONITOR_NO_DESCRIPTOR, -EIO, "STATUS_MONITOR_NO_DESCRIPTOR"},
+ {STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT, -EIO,
+ "STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT"},
+ {STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM, -EIO,
+ "STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM"},
+ {STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK, -EIO,
+ "STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK"},
+ {STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED, -EIO,
+ "STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED"},
+ {STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK, -EIO,
+ "STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK"},
+ {STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK, -EIO,
+ "STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK"},
+ {STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA, -EIO,
+ "STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA"},
+ {STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK, -EIO,
+ "STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK"},
+ {STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER, -EIO,
+ "STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER"},
+ {STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER, -EIO,
+ "STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER"},
+ {STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER, -EIO,
+ "STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER"},
+ {STATUS_GRAPHICS_ADAPTER_WAS_RESET, -EIO,
+ "STATUS_GRAPHICS_ADAPTER_WAS_RESET"},
+ {STATUS_GRAPHICS_INVALID_DRIVER_MODEL, -EIO,
+ "STATUS_GRAPHICS_INVALID_DRIVER_MODEL"},
+ {STATUS_GRAPHICS_PRESENT_MODE_CHANGED, -EIO,
+ "STATUS_GRAPHICS_PRESENT_MODE_CHANGED"},
+ {STATUS_GRAPHICS_PRESENT_OCCLUDED, -EIO,
+ "STATUS_GRAPHICS_PRESENT_OCCLUDED"},
+ {STATUS_GRAPHICS_PRESENT_DENIED, -EIO,
+ "STATUS_GRAPHICS_PRESENT_DENIED"},
+ {STATUS_GRAPHICS_CANNOTCOLORCONVERT, -EIO,
+ "STATUS_GRAPHICS_CANNOTCOLORCONVERT"},
+ {STATUS_GRAPHICS_NO_VIDEO_MEMORY, -EIO,
+ "STATUS_GRAPHICS_NO_VIDEO_MEMORY"},
+ {STATUS_GRAPHICS_CANT_LOCK_MEMORY, -EIO,
+ "STATUS_GRAPHICS_CANT_LOCK_MEMORY"},
+ {STATUS_GRAPHICS_ALLOCATION_BUSY, -EBUSY,
+ "STATUS_GRAPHICS_ALLOCATION_BUSY"},
+ {STATUS_GRAPHICS_TOO_MANY_REFERENCES, -EIO,
+ "STATUS_GRAPHICS_TOO_MANY_REFERENCES"},
+ {STATUS_GRAPHICS_TRY_AGAIN_LATER, -EIO,
+ "STATUS_GRAPHICS_TRY_AGAIN_LATER"},
+ {STATUS_GRAPHICS_TRY_AGAIN_NOW, -EIO, "STATUS_GRAPHICS_TRY_AGAIN_NOW"},
+ {STATUS_GRAPHICS_ALLOCATION_INVALID, -EIO,
+ "STATUS_GRAPHICS_ALLOCATION_INVALID"},
+ {STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE, -EIO,
+ "STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE"},
+ {STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED, -EIO,
+ "STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED"},
+ {STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION, -EIO,
+ "STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION"},
+ {STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE, -EIO,
+ "STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE"},
+ {STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION, -EIO,
+ "STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION"},
+ {STATUS_GRAPHICS_ALLOCATION_CLOSED, -EIO,
+ "STATUS_GRAPHICS_ALLOCATION_CLOSED"},
+ {STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE, -EIO,
+ "STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE"},
+ {STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE, -EIO,
+ "STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE"},
+ {STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE, -EIO,
+ "STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE"},
+ {STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST, -EIO,
+ "STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST"},
+ {STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE, -EIO,
+ "STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE"},
+ {STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY"},
+ {STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_INVALID_VIDPN, -EIO, "STATUS_GRAPHICS_INVALID_VIDPN"},
+ {STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE"},
+ {STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET"},
+ {STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET"},
+ {STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET"},
+ {STATUS_GRAPHICS_INVALID_FREQUENCY, -EIO,
+ "STATUS_GRAPHICS_INVALID_FREQUENCY"},
+ {STATUS_GRAPHICS_INVALID_ACTIVE_REGION, -EIO,
+ "STATUS_GRAPHICS_INVALID_ACTIVE_REGION"},
+ {STATUS_GRAPHICS_INVALID_TOTAL_REGION, -EIO,
+ "STATUS_GRAPHICS_INVALID_TOTAL_REGION"},
+ {STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE"},
+ {STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE"},
+ {STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET, -EIO,
+ "STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET"},
+ {STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY, -EIO,
+ "STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY"},
+ {STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET, -EIO,
+ "STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET"},
+ {STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET"},
+ {STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET"},
+ {STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET, -EIO,
+ "STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET"},
+ {STATUS_GRAPHICS_TARGET_ALREADY_IN_SET, -EIO,
+ "STATUS_GRAPHICS_TARGET_ALREADY_IN_SET"},
+ {STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH"},
+ {STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY, -EIO,
+ "STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY"},
+ {STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET, -EIO,
+ "STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET"},
+ {STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE, -EIO,
+ "STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE"},
+ {STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET, -EIO,
+ "STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET"},
+ {STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET, -EIO,
+ "STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET"},
+ {STATUS_GRAPHICS_STALE_MODESET, -EIO, "STATUS_GRAPHICS_STALE_MODESET"},
+ {STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET, -EIO,
+ "STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET"},
+ {STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE, -EIO,
+ "STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE"},
+ {STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN, -EIO,
+ "STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN"},
+ {STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE, -EIO,
+ "STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE"},
+ {STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION, -EIO,
+ "STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION"},
+ {STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES, -EIO,
+ "STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES"},
+ {STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY, -EIO,
+ "STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY"},
+ {STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE, -EIO,
+ "STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE"},
+ {STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET, -EIO,
+ "STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET"},
+ {STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET, -EIO,
+ "STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET"},
+ {STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR, -EIO,
+ "STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR"},
+ {STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET, -EIO,
+ "STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET"},
+ {STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET, -EIO,
+ "STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET"},
+ {STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE, -EIO,
+ "STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE"},
+ {STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE"},
+ {STATUS_GRAPHICS_RESOURCES_NOT_RELATED, -EIO,
+ "STATUS_GRAPHICS_RESOURCES_NOT_RELATED"},
+ {STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE, -EIO,
+ "STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE"},
+ {STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE, -EIO,
+ "STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE"},
+ {STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET, -EIO,
+ "STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET"},
+ {STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER, -EIO,
+ "STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER"},
+ {STATUS_GRAPHICS_NO_VIDPNMGR, -EIO, "STATUS_GRAPHICS_NO_VIDPNMGR"},
+ {STATUS_GRAPHICS_NO_ACTIVE_VIDPN, -EIO,
+ "STATUS_GRAPHICS_NO_ACTIVE_VIDPN"},
+ {STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY, -EIO,
+ "STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY"},
+ {STATUS_GRAPHICS_MONITOR_NOT_CONNECTED, -EIO,
+ "STATUS_GRAPHICS_MONITOR_NOT_CONNECTED"},
+ {STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY, -EIO,
+ "STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY"},
+ {STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE, -EIO,
+ "STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE"},
+ {STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE, -EIO,
+ "STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE"},
+ {STATUS_GRAPHICS_INVALID_STRIDE, -EIO,
+ "STATUS_GRAPHICS_INVALID_STRIDE"},
+ {STATUS_GRAPHICS_INVALID_PIXELFORMAT, -EIO,
+ "STATUS_GRAPHICS_INVALID_PIXELFORMAT"},
+ {STATUS_GRAPHICS_INVALID_COLORBASIS, -EIO,
+ "STATUS_GRAPHICS_INVALID_COLORBASIS"},
+ {STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE, -EIO,
+ "STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE"},
+ {STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY, -EIO,
+ "STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY"},
+ {STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT, -EIO,
+ "STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT"},
+ {STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE, -EIO,
+ "STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE"},
+ {STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN, -EIO,
+ "STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN"},
+ {STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL, -EIO,
+ "STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL"},
+ {STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION, -EIO,
+ "STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION"},
+ {STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED,
+ -EIO,
+ "STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_INVALID_GAMMA_RAMP, -EIO,
+ "STATUS_GRAPHICS_INVALID_GAMMA_RAMP"},
+ {STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_MODE_NOT_IN_MODESET, -EIO,
+ "STATUS_GRAPHICS_MODE_NOT_IN_MODESET"},
+ {STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON, -EIO,
+ "STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON"},
+ {STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE, -EIO,
+ "STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE"},
+ {STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE, -EIO,
+ "STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE"},
+ {STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS, -EIO,
+ "STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS"},
+ {STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING, -EIO,
+ "STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING"},
+ {STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED, -EIO,
+ "STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED"},
+ {STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS, -EIO,
+ "STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS"},
+ {STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT, -EIO,
+ "STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT"},
+ {STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM, -EIO,
+ "STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM"},
+ {STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN, -EIO,
+ "STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN"},
+ {STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT, -EIO,
+ "STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT"},
+ {STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED, -EIO,
+ "STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED"},
+ {STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION, -EIO,
+ "STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION"},
+ {STATUS_GRAPHICS_INVALID_CLIENT_TYPE, -EIO,
+ "STATUS_GRAPHICS_INVALID_CLIENT_TYPE"},
+ {STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET, -EIO,
+ "STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET"},
+ {STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED, -EIO,
+ "STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED"},
+ {STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER, -EIO,
+ "STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER"},
+ {STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED, -EIO,
+ "STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED"},
+ {STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED, -EIO,
+ "STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED"},
+ {STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY, -EIO,
+ "STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY"},
+ {STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED, -EIO,
+ "STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED"},
+ {STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON, -EIO,
+ "STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON"},
+ {STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE, -EIO,
+ "STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE"},
+ {STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER, -EIO,
+ "STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER"},
+ {STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED, -EIO,
+ "STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED"},
+ {STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS,
+ -EIO,
+ "STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS"},
+ {STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST, -EIO,
+ "STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST"},
+ {STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR, -EIO,
+ "STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR"},
+ {STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS, -EIO,
+ "STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS"},
+ {STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST, -EIO,
+ "STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST"},
+ {STATUS_GRAPHICS_OPM_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_OPM_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_COPP_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_COPP_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_UAB_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_UAB_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS, -EIO,
+ "STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS"},
+ {STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL, -EIO,
+ "STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL"},
+ {STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST, -EIO,
+ "STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST"},
+ {STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME, -EIO,
+ "STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME"},
+ {STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP, -EIO,
+ "STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP"},
+ {STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_OPM_INVALID_POINTER, -EIO,
+ "STATUS_GRAPHICS_OPM_INVALID_POINTER"},
+ {STATUS_GRAPHICS_OPM_INTERNAL_ERROR, -EIO,
+ "STATUS_GRAPHICS_OPM_INTERNAL_ERROR"},
+ {STATUS_GRAPHICS_OPM_INVALID_HANDLE, -EIO,
+ "STATUS_GRAPHICS_OPM_INVALID_HANDLE"},
+ {STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE, -EIO,
+ "STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE"},
+ {STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH, -EIO,
+ "STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH"},
+ {STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED, -EIO,
+ "STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED"},
+ {STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED, -EIO,
+ "STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED"},
+ {STATUS_GRAPHICS_PVP_HFS_FAILED, -EIO,
+ "STATUS_GRAPHICS_PVP_HFS_FAILED"},
+ {STATUS_GRAPHICS_OPM_INVALID_SRM, -EIO,
+ "STATUS_GRAPHICS_OPM_INVALID_SRM"},
+ {STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP, -EIO,
+ "STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP"},
+ {STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP, -EIO,
+ "STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP"},
+ {STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA, -EIO,
+ "STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA"},
+ {STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET, -EIO,
+ "STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET"},
+ {STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH, -EIO,
+ "STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH"},
+ {STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE, -EIO,
+ "STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE"},
+ {STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS, -EIO,
+ "STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS"},
+ {STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS, -EIO,
+ "STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS"},
+ {STATUS_GRAPHICS_I2C_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_I2C_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST, -EIO,
+ "STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST"},
+ {STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA, -EIO,
+ "STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA"},
+ {STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA, -EIO,
+ "STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA"},
+ {STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_DDCCI_INVALID_DATA, -EIO,
+ "STATUS_GRAPHICS_DDCCI_INVALID_DATA"},
+ {STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE,
+ -EIO,
+ "STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE"},
+ {STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING, -EIO,
+ "STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING"},
+ {STATUS_GRAPHICS_MCA_INTERNAL_ERROR, -EIO,
+ "STATUS_GRAPHICS_MCA_INTERNAL_ERROR"},
+ {STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND, -EIO,
+ "STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND"},
+ {STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH, -EIO,
+ "STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH"},
+ {STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM, -EIO,
+ "STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM"},
+ {STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE, -EIO,
+ "STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE"},
+ {STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS, -EIO,
+ "STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS"},
+ {STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED"},
+ {STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME, -EIO,
+ "STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME"},
+ {STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP, -EIO,
+ "STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP"},
+ {STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED, -EIO,
+ "STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED"},
+ {STATUS_GRAPHICS_INVALID_POINTER, -EIO,
+ "STATUS_GRAPHICS_INVALID_POINTER"},
+ {STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE, -EIO,
+ "STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE"},
+ {STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL, -EIO,
+ "STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL"},
+ {STATUS_GRAPHICS_INTERNAL_ERROR, -EIO,
+ "STATUS_GRAPHICS_INTERNAL_ERROR"},
+ {STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS, -EIO,
+ "STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS"},
+ {STATUS_FVE_LOCKED_VOLUME, -EIO, "STATUS_FVE_LOCKED_VOLUME"},
+ {STATUS_FVE_NOT_ENCRYPTED, -EIO, "STATUS_FVE_NOT_ENCRYPTED"},
+ {STATUS_FVE_BAD_INFORMATION, -EIO, "STATUS_FVE_BAD_INFORMATION"},
+ {STATUS_FVE_TOO_SMALL, -EIO, "STATUS_FVE_TOO_SMALL"},
+ {STATUS_FVE_FAILED_WRONG_FS, -EIO, "STATUS_FVE_FAILED_WRONG_FS"},
+ {STATUS_FVE_FAILED_BAD_FS, -EIO, "STATUS_FVE_FAILED_BAD_FS"},
+ {STATUS_FVE_FS_NOT_EXTENDED, -EIO, "STATUS_FVE_FS_NOT_EXTENDED"},
+ {STATUS_FVE_FS_MOUNTED, -EIO, "STATUS_FVE_FS_MOUNTED"},
+ {STATUS_FVE_NO_LICENSE, -EIO, "STATUS_FVE_NO_LICENSE"},
+ {STATUS_FVE_ACTION_NOT_ALLOWED, -EIO, "STATUS_FVE_ACTION_NOT_ALLOWED"},
+ {STATUS_FVE_BAD_DATA, -EIO, "STATUS_FVE_BAD_DATA"},
+ {STATUS_FVE_VOLUME_NOT_BOUND, -EIO, "STATUS_FVE_VOLUME_NOT_BOUND"},
+ {STATUS_FVE_NOT_DATA_VOLUME, -EIO, "STATUS_FVE_NOT_DATA_VOLUME"},
+ {STATUS_FVE_CONV_READ_ERROR, -EIO, "STATUS_FVE_CONV_READ_ERROR"},
+ {STATUS_FVE_CONV_WRITE_ERROR, -EIO, "STATUS_FVE_CONV_WRITE_ERROR"},
+ {STATUS_FVE_OVERLAPPED_UPDATE, -EIO, "STATUS_FVE_OVERLAPPED_UPDATE"},
+ {STATUS_FVE_FAILED_SECTOR_SIZE, -EIO, "STATUS_FVE_FAILED_SECTOR_SIZE"},
+ {STATUS_FVE_FAILED_AUTHENTICATION, -EIO,
+ "STATUS_FVE_FAILED_AUTHENTICATION"},
+ {STATUS_FVE_NOT_OS_VOLUME, -EIO, "STATUS_FVE_NOT_OS_VOLUME"},
+ {STATUS_FVE_KEYFILE_NOT_FOUND, -EIO, "STATUS_FVE_KEYFILE_NOT_FOUND"},
+ {STATUS_FVE_KEYFILE_INVALID, -EIO, "STATUS_FVE_KEYFILE_INVALID"},
+ {STATUS_FVE_KEYFILE_NO_VMK, -EIO, "STATUS_FVE_KEYFILE_NO_VMK"},
+ {STATUS_FVE_TPM_DISABLED, -EIO, "STATUS_FVE_TPM_DISABLED"},
+ {STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO, -EIO,
+ "STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO"},
+ {STATUS_FVE_TPM_INVALID_PCR, -EIO, "STATUS_FVE_TPM_INVALID_PCR"},
+ {STATUS_FVE_TPM_NO_VMK, -EIO, "STATUS_FVE_TPM_NO_VMK"},
+ {STATUS_FVE_PIN_INVALID, -EIO, "STATUS_FVE_PIN_INVALID"},
+ {STATUS_FVE_AUTH_INVALID_APPLICATION, -EIO,
+ "STATUS_FVE_AUTH_INVALID_APPLICATION"},
+ {STATUS_FVE_AUTH_INVALID_CONFIG, -EIO,
+ "STATUS_FVE_AUTH_INVALID_CONFIG"},
+ {STATUS_FVE_DEBUGGER_ENABLED, -EIO, "STATUS_FVE_DEBUGGER_ENABLED"},
+ {STATUS_FVE_DRY_RUN_FAILED, -EIO, "STATUS_FVE_DRY_RUN_FAILED"},
+ {STATUS_FVE_BAD_METADATA_POINTER, -EIO,
+ "STATUS_FVE_BAD_METADATA_POINTER"},
+ {STATUS_FVE_OLD_METADATA_COPY, -EIO, "STATUS_FVE_OLD_METADATA_COPY"},
+ {STATUS_FVE_REBOOT_REQUIRED, -EIO, "STATUS_FVE_REBOOT_REQUIRED"},
+ {STATUS_FVE_RAW_ACCESS, -EIO, "STATUS_FVE_RAW_ACCESS"},
+ {STATUS_FVE_RAW_BLOCKED, -EIO, "STATUS_FVE_RAW_BLOCKED"},
+ {STATUS_FWP_CALLOUT_NOT_FOUND, -EIO, "STATUS_FWP_CALLOUT_NOT_FOUND"},
+ {STATUS_FWP_CONDITION_NOT_FOUND, -EIO,
+ "STATUS_FWP_CONDITION_NOT_FOUND"},
+ {STATUS_FWP_FILTER_NOT_FOUND, -EIO, "STATUS_FWP_FILTER_NOT_FOUND"},
+ {STATUS_FWP_LAYER_NOT_FOUND, -EIO, "STATUS_FWP_LAYER_NOT_FOUND"},
+ {STATUS_FWP_PROVIDER_NOT_FOUND, -EIO, "STATUS_FWP_PROVIDER_NOT_FOUND"},
+ {STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND, -EIO,
+ "STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND"},
+ {STATUS_FWP_SUBLAYER_NOT_FOUND, -EIO, "STATUS_FWP_SUBLAYER_NOT_FOUND"},
+ {STATUS_FWP_NOT_FOUND, -EIO, "STATUS_FWP_NOT_FOUND"},
+ {STATUS_FWP_ALREADY_EXISTS, -EIO, "STATUS_FWP_ALREADY_EXISTS"},
+ {STATUS_FWP_IN_USE, -EIO, "STATUS_FWP_IN_USE"},
+ {STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS, -EIO,
+ "STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS"},
+ {STATUS_FWP_WRONG_SESSION, -EIO, "STATUS_FWP_WRONG_SESSION"},
+ {STATUS_FWP_NO_TXN_IN_PROGRESS, -EIO, "STATUS_FWP_NO_TXN_IN_PROGRESS"},
+ {STATUS_FWP_TXN_IN_PROGRESS, -EIO, "STATUS_FWP_TXN_IN_PROGRESS"},
+ {STATUS_FWP_TXN_ABORTED, -EIO, "STATUS_FWP_TXN_ABORTED"},
+ {STATUS_FWP_SESSION_ABORTED, -EIO, "STATUS_FWP_SESSION_ABORTED"},
+ {STATUS_FWP_INCOMPATIBLE_TXN, -EIO, "STATUS_FWP_INCOMPATIBLE_TXN"},
+ {STATUS_FWP_TIMEOUT, -ETIMEDOUT, "STATUS_FWP_TIMEOUT"},
+ {STATUS_FWP_NET_EVENTS_DISABLED, -EIO,
+ "STATUS_FWP_NET_EVENTS_DISABLED"},
+ {STATUS_FWP_INCOMPATIBLE_LAYER, -EIO, "STATUS_FWP_INCOMPATIBLE_LAYER"},
+ {STATUS_FWP_KM_CLIENTS_ONLY, -EIO, "STATUS_FWP_KM_CLIENTS_ONLY"},
+ {STATUS_FWP_LIFETIME_MISMATCH, -EIO, "STATUS_FWP_LIFETIME_MISMATCH"},
+ {STATUS_FWP_BUILTIN_OBJECT, -EIO, "STATUS_FWP_BUILTIN_OBJECT"},
+ {STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS, -EIO,
+ "STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS"},
+ {STATUS_FWP_TOO_MANY_CALLOUTS, -EIO, "STATUS_FWP_TOO_MANY_CALLOUTS"},
+ {STATUS_FWP_NOTIFICATION_DROPPED, -EIO,
+ "STATUS_FWP_NOTIFICATION_DROPPED"},
+ {STATUS_FWP_TRAFFIC_MISMATCH, -EIO, "STATUS_FWP_TRAFFIC_MISMATCH"},
+ {STATUS_FWP_INCOMPATIBLE_SA_STATE, -EIO,
+ "STATUS_FWP_INCOMPATIBLE_SA_STATE"},
+ {STATUS_FWP_NULL_POINTER, -EIO, "STATUS_FWP_NULL_POINTER"},
+ {STATUS_FWP_INVALID_ENUMERATOR, -EIO, "STATUS_FWP_INVALID_ENUMERATOR"},
+ {STATUS_FWP_INVALID_FLAGS, -EIO, "STATUS_FWP_INVALID_FLAGS"},
+ {STATUS_FWP_INVALID_NET_MASK, -EIO, "STATUS_FWP_INVALID_NET_MASK"},
+ {STATUS_FWP_INVALID_RANGE, -EIO, "STATUS_FWP_INVALID_RANGE"},
+ {STATUS_FWP_INVALID_INTERVAL, -EIO, "STATUS_FWP_INVALID_INTERVAL"},
+ {STATUS_FWP_ZERO_LENGTH_ARRAY, -EIO, "STATUS_FWP_ZERO_LENGTH_ARRAY"},
+ {STATUS_FWP_NULL_DISPLAY_NAME, -EIO, "STATUS_FWP_NULL_DISPLAY_NAME"},
+ {STATUS_FWP_INVALID_ACTION_TYPE, -EIO,
+ "STATUS_FWP_INVALID_ACTION_TYPE"},
+ {STATUS_FWP_INVALID_WEIGHT, -EIO, "STATUS_FWP_INVALID_WEIGHT"},
+ {STATUS_FWP_MATCH_TYPE_MISMATCH, -EIO,
+ "STATUS_FWP_MATCH_TYPE_MISMATCH"},
+ {STATUS_FWP_TYPE_MISMATCH, -EIO, "STATUS_FWP_TYPE_MISMATCH"},
+ {STATUS_FWP_OUT_OF_BOUNDS, -EIO, "STATUS_FWP_OUT_OF_BOUNDS"},
+ {STATUS_FWP_RESERVED, -EIO, "STATUS_FWP_RESERVED"},
+ {STATUS_FWP_DUPLICATE_CONDITION, -EIO,
+ "STATUS_FWP_DUPLICATE_CONDITION"},
+ {STATUS_FWP_DUPLICATE_KEYMOD, -EIO, "STATUS_FWP_DUPLICATE_KEYMOD"},
+ {STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER, -EIO,
+ "STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER"},
+ {STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER, -EIO,
+ "STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER"},
+ {STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER, -EIO,
+ "STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER"},
+ {STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT, -EIO,
+ "STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT"},
+ {STATUS_FWP_INCOMPATIBLE_AUTH_METHOD, -EIO,
+ "STATUS_FWP_INCOMPATIBLE_AUTH_METHOD"},
+ {STATUS_FWP_INCOMPATIBLE_DH_GROUP, -EIO,
+ "STATUS_FWP_INCOMPATIBLE_DH_GROUP"},
+ {STATUS_FWP_EM_NOT_SUPPORTED, -EOPNOTSUPP,
+ "STATUS_FWP_EM_NOT_SUPPORTED"},
+ {STATUS_FWP_NEVER_MATCH, -EIO, "STATUS_FWP_NEVER_MATCH"},
+ {STATUS_FWP_PROVIDER_CONTEXT_MISMATCH, -EIO,
+ "STATUS_FWP_PROVIDER_CONTEXT_MISMATCH"},
+ {STATUS_FWP_INVALID_PARAMETER, -EIO, "STATUS_FWP_INVALID_PARAMETER"},
+ {STATUS_FWP_TOO_MANY_SUBLAYERS, -EIO, "STATUS_FWP_TOO_MANY_SUBLAYERS"},
+ {STATUS_FWP_CALLOUT_NOTIFICATION_FAILED, -EIO,
+ "STATUS_FWP_CALLOUT_NOTIFICATION_FAILED"},
+ {STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG, -EIO,
+ "STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG"},
+ {STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG, -EIO,
+ "STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG"},
+ {STATUS_FWP_TCPIP_NOT_READY, -EIO, "STATUS_FWP_TCPIP_NOT_READY"},
+ {STATUS_FWP_INJECT_HANDLE_CLOSING, -EIO,
+ "STATUS_FWP_INJECT_HANDLE_CLOSING"},
+ {STATUS_FWP_INJECT_HANDLE_STALE, -EIO,
+ "STATUS_FWP_INJECT_HANDLE_STALE"},
+ {STATUS_FWP_CANNOT_PEND, -EIO, "STATUS_FWP_CANNOT_PEND"},
+ {STATUS_NDIS_CLOSING, -EIO, "STATUS_NDIS_CLOSING"},
+ {STATUS_NDIS_BAD_VERSION, -EIO, "STATUS_NDIS_BAD_VERSION"},
+ {STATUS_NDIS_BAD_CHARACTERISTICS, -EIO,
+ "STATUS_NDIS_BAD_CHARACTERISTICS"},
+ {STATUS_NDIS_ADAPTER_NOT_FOUND, -EIO, "STATUS_NDIS_ADAPTER_NOT_FOUND"},
+ {STATUS_NDIS_OPEN_FAILED, -EIO, "STATUS_NDIS_OPEN_FAILED"},
+ {STATUS_NDIS_DEVICE_FAILED, -EIO, "STATUS_NDIS_DEVICE_FAILED"},
+ {STATUS_NDIS_MULTICAST_FULL, -EIO, "STATUS_NDIS_MULTICAST_FULL"},
+ {STATUS_NDIS_MULTICAST_EXISTS, -EIO, "STATUS_NDIS_MULTICAST_EXISTS"},
+ {STATUS_NDIS_MULTICAST_NOT_FOUND, -EIO,
+ "STATUS_NDIS_MULTICAST_NOT_FOUND"},
+ {STATUS_NDIS_REQUEST_ABORTED, -EIO, "STATUS_NDIS_REQUEST_ABORTED"},
+ {STATUS_NDIS_RESET_IN_PROGRESS, -EIO, "STATUS_NDIS_RESET_IN_PROGRESS"},
+ {STATUS_NDIS_INVALID_PACKET, -EIO, "STATUS_NDIS_INVALID_PACKET"},
+ {STATUS_NDIS_INVALID_DEVICE_REQUEST, -EIO,
+ "STATUS_NDIS_INVALID_DEVICE_REQUEST"},
+ {STATUS_NDIS_ADAPTER_NOT_READY, -EIO, "STATUS_NDIS_ADAPTER_NOT_READY"},
+ {STATUS_NDIS_INVALID_LENGTH, -EIO, "STATUS_NDIS_INVALID_LENGTH"},
+ {STATUS_NDIS_INVALID_DATA, -EIO, "STATUS_NDIS_INVALID_DATA"},
+ {STATUS_NDIS_BUFFER_TOO_SHORT, -ENOBUFS,
+ "STATUS_NDIS_BUFFER_TOO_SHORT"},
+ {STATUS_NDIS_INVALID_OID, -EIO, "STATUS_NDIS_INVALID_OID"},
+ {STATUS_NDIS_ADAPTER_REMOVED, -EIO, "STATUS_NDIS_ADAPTER_REMOVED"},
+ {STATUS_NDIS_UNSUPPORTED_MEDIA, -EIO, "STATUS_NDIS_UNSUPPORTED_MEDIA"},
+ {STATUS_NDIS_GROUP_ADDRESS_IN_USE, -EIO,
+ "STATUS_NDIS_GROUP_ADDRESS_IN_USE"},
+ {STATUS_NDIS_FILE_NOT_FOUND, -EIO, "STATUS_NDIS_FILE_NOT_FOUND"},
+ {STATUS_NDIS_ERROR_READING_FILE, -EIO,
+ "STATUS_NDIS_ERROR_READING_FILE"},
+ {STATUS_NDIS_ALREADY_MAPPED, -EIO, "STATUS_NDIS_ALREADY_MAPPED"},
+ {STATUS_NDIS_RESOURCE_CONFLICT, -EIO, "STATUS_NDIS_RESOURCE_CONFLICT"},
+ {STATUS_NDIS_MEDIA_DISCONNECTED, -EIO,
+ "STATUS_NDIS_MEDIA_DISCONNECTED"},
+ {STATUS_NDIS_INVALID_ADDRESS, -EIO, "STATUS_NDIS_INVALID_ADDRESS"},
+ {STATUS_NDIS_PAUSED, -EIO, "STATUS_NDIS_PAUSED"},
+ {STATUS_NDIS_INTERFACE_NOT_FOUND, -EIO,
+ "STATUS_NDIS_INTERFACE_NOT_FOUND"},
+ {STATUS_NDIS_UNSUPPORTED_REVISION, -EIO,
+ "STATUS_NDIS_UNSUPPORTED_REVISION"},
+ {STATUS_NDIS_INVALID_PORT, -EIO, "STATUS_NDIS_INVALID_PORT"},
+ {STATUS_NDIS_INVALID_PORT_STATE, -EIO,
+ "STATUS_NDIS_INVALID_PORT_STATE"},
+ {STATUS_NDIS_LOW_POWER_STATE, -EIO, "STATUS_NDIS_LOW_POWER_STATE"},
+ {STATUS_NDIS_NOT_SUPPORTED, -ENOSYS, "STATUS_NDIS_NOT_SUPPORTED"},
+ {STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED, -EIO,
+ "STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED"},
+ {STATUS_NDIS_DOT11_MEDIA_IN_USE, -EIO,
+ "STATUS_NDIS_DOT11_MEDIA_IN_USE"},
+ {STATUS_NDIS_DOT11_POWER_STATE_INVALID, -EIO,
+ "STATUS_NDIS_DOT11_POWER_STATE_INVALID"},
+ {STATUS_IPSEC_BAD_SPI, -EIO, "STATUS_IPSEC_BAD_SPI"},
+ {STATUS_IPSEC_SA_LIFETIME_EXPIRED, -EIO,
+ "STATUS_IPSEC_SA_LIFETIME_EXPIRED"},
+ {STATUS_IPSEC_WRONG_SA, -EIO, "STATUS_IPSEC_WRONG_SA"},
+ {STATUS_IPSEC_REPLAY_CHECK_FAILED, -EIO,
+ "STATUS_IPSEC_REPLAY_CHECK_FAILED"},
+ {STATUS_IPSEC_INVALID_PACKET, -EIO, "STATUS_IPSEC_INVALID_PACKET"},
+ {STATUS_IPSEC_INTEGRITY_CHECK_FAILED, -EIO,
+ "STATUS_IPSEC_INTEGRITY_CHECK_FAILED"},
+ {STATUS_IPSEC_CLEAR_TEXT_DROP, -EIO, "STATUS_IPSEC_CLEAR_TEXT_DROP"},
+ {0, 0, NULL}
+};
+
+/*****************************************************************************
+ Print an error message from the status code
+ *****************************************************************************/
+static void
+smb2_print_status(__le32 status)
+{
+ int idx = 0;
+
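+	/* walk the map table; the terminating {0, 0, NULL} entry ends the scan */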
+ while (smb2_error_map_table[idx].status_string != NULL) {
+ if ((smb2_error_map_table[idx].smb2_status) == status) {
+ pr_notice("Status code returned 0x%08x %s\n", status,
+ smb2_error_map_table[idx].status_string);
+ }
+ idx++;
+ }
+ return;
+}
+
+int
+map_smb2_to_linux_error(char *buf, bool log_err)
+{
+ struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+ unsigned int i;
+ int rc = -EIO;
+ __le32 smb2err = hdr->Status;
+
+ if (smb2err == 0)
+ return 0;
+
+ /* mask facility */
+ if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) &&
+ (smb2err != STATUS_END_OF_FILE))
+ smb2_print_status(smb2err);
+ else if (cifsFYI & CIFS_RC)
+ smb2_print_status(smb2err);
+
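+	/* scan the table for an exact match on the SMB2 status code */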
+ for (i = 0; i < sizeof(smb2_error_map_table) /
+ sizeof(struct status_to_posix_error); i++) {
+ if (smb2_error_map_table[i].smb2_status == smb2err) {
+ rc = smb2_error_map_table[i].posix_error;
+ break;
+ }
+ }
+
+ /* on error mapping not found - return EIO */
+
+ cifs_dbg(FYI, "Mapping SMB2 status code %d to POSIX err %d\n",
+ smb2err, rc);
+
+ return rc;
+}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
new file mode 100644
index 00000000000..b8021fde987
--- /dev/null
+++ b/fs/cifs/smb2misc.c
@@ -0,0 +1,609 @@
+/*
+ * fs/cifs/smb2misc.c
+ *
+ * Copyright (C) International Business Machines Corp., 2002,2011
+ * Etersoft, 2012
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/ctype.h>
+#include "smb2pdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "smb2proto.h"
+#include "cifs_debug.h"
+#include "cifs_unicode.h"
+#include "smb2status.h"
+
+static int
+check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
+{
+ /*
+ * Make sure that this really is an SMB, that it is a response,
+ * and that the message ids match.
+ */
+ if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
+ (mid == hdr->MessageId)) {
+ if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
+ return 0;
+ else {
+			/* the only valid case where the server sends us a request */
+ if (hdr->Command == SMB2_OPLOCK_BREAK)
+ return 0;
+ else
+ cifs_dbg(VFS, "Received Request not response\n");
+ }
+ } else { /* bad signature or mid */
+ if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
+ cifs_dbg(VFS, "Bad protocol string signature header %x\n",
+ *(unsigned int *) hdr->ProtocolId);
+ if (mid != hdr->MessageId)
+ cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
+ mid, hdr->MessageId);
+ }
+ cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", hdr->MessageId);
+ return 1;
+}
+
+/*
+ * The following table defines the expected "StructureSize" of SMB2 responses
+ * in order by SMB2 command. This is similar to "wct" in SMB/CIFS responses.
+ *
+ * Note that commands are defined in smb2pdu.h as le16 values, but the array
+ * below is indexed by command in host byte order.
+ */
+static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
+ /* SMB2_NEGOTIATE */ __constant_cpu_to_le16(65),
+ /* SMB2_SESSION_SETUP */ __constant_cpu_to_le16(9),
+ /* SMB2_LOGOFF */ __constant_cpu_to_le16(4),
+ /* SMB2_TREE_CONNECT */ __constant_cpu_to_le16(16),
+ /* SMB2_TREE_DISCONNECT */ __constant_cpu_to_le16(4),
+ /* SMB2_CREATE */ __constant_cpu_to_le16(89),
+ /* SMB2_CLOSE */ __constant_cpu_to_le16(60),
+ /* SMB2_FLUSH */ __constant_cpu_to_le16(4),
+ /* SMB2_READ */ __constant_cpu_to_le16(17),
+ /* SMB2_WRITE */ __constant_cpu_to_le16(17),
+ /* SMB2_LOCK */ __constant_cpu_to_le16(4),
+ /* SMB2_IOCTL */ __constant_cpu_to_le16(49),
+ /* BB CHECK this ... not listed in documentation */
+ /* SMB2_CANCEL */ __constant_cpu_to_le16(0),
+ /* SMB2_ECHO */ __constant_cpu_to_le16(4),
+ /* SMB2_QUERY_DIRECTORY */ __constant_cpu_to_le16(9),
+ /* SMB2_CHANGE_NOTIFY */ __constant_cpu_to_le16(9),
+ /* SMB2_QUERY_INFO */ __constant_cpu_to_le16(9),
+ /* SMB2_SET_INFO */ __constant_cpu_to_le16(2),
+ /* BB FIXME can also be 44 for lease break */
+ /* SMB2_OPLOCK_BREAK */ __constant_cpu_to_le16(24)
+};
+
+int
+smb2_check_message(char *buf, unsigned int length)
+{
+ struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+ struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
+ __u64 mid = hdr->MessageId;
+ __u32 len = get_rfc1002_length(buf);
+ __u32 clc_len; /* calculated length */
+ int command;
+
+ /* BB disable following printk later */
+ cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n",
+ __func__, length, len);
+
+ /*
+	 * Add function to do table lookup of StructureSize by command,
+	 * i.e. validate the wct via the smb2_rsp_struct_sizes table above.
+ */
+
+ if (length < sizeof(struct smb2_pdu)) {
+ if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) {
+ pdu->StructureSize2 = 0;
+ /*
+			 * As with SMB/CIFS, in some error cases servers may
+			 * not return the wct properly.
+ */
+ return 0;
+ } else {
+ cifs_dbg(VFS, "Length less than SMB header size\n");
+ }
+ return 1;
+ }
+ if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) {
+ cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n",
+ mid);
+ return 1;
+ }
+
+ if (check_smb2_hdr(hdr, mid))
+ return 1;
+
+ if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
+ cifs_dbg(VFS, "Illegal structure size %u\n",
+ le16_to_cpu(hdr->StructureSize));
+ return 1;
+ }
+
+ command = le16_to_cpu(hdr->Command);
+ if (command >= NUMBER_OF_SMB2_COMMANDS) {
+ cifs_dbg(VFS, "Illegal SMB2 command %d\n", command);
+ return 1;
+ }
+
+ if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) {
+ if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 ||
+ pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) {
+ /* error packets have 9 byte structure size */
+ cifs_dbg(VFS, "Illegal response size %u for command %d\n",
+ le16_to_cpu(pdu->StructureSize2), command);
+ return 1;
+ } else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0)
+ && (le16_to_cpu(pdu->StructureSize2) != 44)
+ && (le16_to_cpu(pdu->StructureSize2) != 36)) {
+ /* special case for SMB2.1 lease break message */
+ cifs_dbg(VFS, "Illegal response size %d for oplock break\n",
+ le16_to_cpu(pdu->StructureSize2));
+ return 1;
+ }
+ }
+
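+	/* the RFC1002 length plus its 4-byte prefix must match the bytes received */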
+ if (4 + len != length) {
+ cifs_dbg(VFS, "Total length %u RFC1002 length %u mismatch mid %llu\n",
+ length, 4 + len, mid);
+ return 1;
+ }
+
+ clc_len = smb2_calc_size(hdr);
+
+ if (4 + len != clc_len) {
+ cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
+ clc_len, 4 + len, mid);
+ /* create failed on symlink */
+ if (command == SMB2_CREATE_HE &&
+ hdr->Status == STATUS_STOPPED_ON_SYMLINK)
+ return 0;
+ /* Windows 7 server returns 24 bytes more */
+ if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
+ return 0;
+ /* server can return one byte more */
+ if (clc_len == 4 + len + 1)
+ return 0;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * The size of the variable-length area is given by offset and length fields
+ * that are located in different places in the various SMB2 responses. SMB2
+ * responses with no variable-length data report an offset of zero.
+ */
+static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
+ /* SMB2_NEGOTIATE */ true,
+ /* SMB2_SESSION_SETUP */ true,
+ /* SMB2_LOGOFF */ false,
+ /* SMB2_TREE_CONNECT */ false,
+ /* SMB2_TREE_DISCONNECT */ false,
+ /* SMB2_CREATE */ true,
+ /* SMB2_CLOSE */ false,
+ /* SMB2_FLUSH */ false,
+ /* SMB2_READ */ true,
+ /* SMB2_WRITE */ false,
+ /* SMB2_LOCK */ false,
+ /* SMB2_IOCTL */ true,
+ /* SMB2_CANCEL */ false, /* BB CHECK this not listed in documentation */
+ /* SMB2_ECHO */ false,
+ /* SMB2_QUERY_DIRECTORY */ true,
+ /* SMB2_CHANGE_NOTIFY */ true,
+ /* SMB2_QUERY_INFO */ true,
+ /* SMB2_SET_INFO */ false,
+ /* SMB2_OPLOCK_BREAK */ false
+};
+
+/*
+ * Returns the pointer to the beginning of the data area. Length of the data
+ * area and the offset to it (from the beginning of the SMB) are also returned.
+ */
+char *
+smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
+{
+ *off = 0;
+ *len = 0;
+
+ /* error responses do not have data area */
+ if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
+ (((struct smb2_err_rsp *)hdr)->StructureSize) ==
+ SMB2_ERROR_STRUCTURE_SIZE2)
+ return NULL;
+
+ /*
+ * Following commands have data areas so we have to get the location
+ * of the data buffer offset and data buffer length for the particular
+ * command.
+ */
+ switch (hdr->Command) {
+ case SMB2_NEGOTIATE:
+ *off = le16_to_cpu(
+ ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset);
+ *len = le16_to_cpu(
+ ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferLength);
+ break;
+ case SMB2_SESSION_SETUP:
+ *off = le16_to_cpu(
+ ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferOffset);
+ *len = le16_to_cpu(
+ ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferLength);
+ break;
+ case SMB2_CREATE:
+ *off = le32_to_cpu(
+ ((struct smb2_create_rsp *)hdr)->CreateContextsOffset);
+ *len = le32_to_cpu(
+ ((struct smb2_create_rsp *)hdr)->CreateContextsLength);
+ break;
+ case SMB2_QUERY_INFO:
+ *off = le16_to_cpu(
+ ((struct smb2_query_info_rsp *)hdr)->OutputBufferOffset);
+ *len = le32_to_cpu(
+ ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength);
+ break;
+ case SMB2_READ:
+ *off = ((struct smb2_read_rsp *)hdr)->DataOffset;
+ *len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength);
+ break;
+ case SMB2_QUERY_DIRECTORY:
+ *off = le16_to_cpu(
+ ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset);
+ *len = le32_to_cpu(
+ ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
+ break;
+ case SMB2_IOCTL:
+ *off = le32_to_cpu(
+ ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
+ *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+ break;
+ case SMB2_CHANGE_NOTIFY:
+ default:
+ /* BB FIXME for unimplemented cases above */
+ cifs_dbg(VFS, "no length check for command\n");
+ break;
+ }
+
+ /*
+ * Invalid length or offset probably means data area is invalid, but
+ * we have little choice but to ignore the data area in this case.
+ */
+ if (*off > 4096) {
+ cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off);
+ *len = 0;
+ *off = 0;
+ } else if (*off < 0) {
+ cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n",
+ *off);
+ *off = 0;
+ *len = 0;
+ } else if (*len < 0) {
+ cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n",
+ *len);
+ *len = 0;
+ } else if (*len > 128 * 1024) {
+ cifs_dbg(VFS, "data area larger than 128K: %d\n", *len);
+ *len = 0;
+ }
+
+	/* return pointer to beginning of data area, i.e. offset from SMB start */
+ if ((*off != 0) && (*len != 0))
+ return hdr->ProtocolId + *off;
+ else
+ return NULL;
+}
+
+/*
+ * Calculate the size of the SMB message based on the fixed header
+ * portion, the number of word parameters and the data portion of the message.
+ */
+unsigned int
+smb2_calc_size(void *buf)
+{
+ struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+ struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
+ int offset; /* the offset from the beginning of SMB to data area */
+ int data_length; /* the length of the variable length data area */
+ /* Structure Size has already been checked to make sure it is 64 */
+ int len = 4 + le16_to_cpu(pdu->hdr.StructureSize);
+
+ /*
+	 * StructureSize2, i.e. the length of the fixed parameter area, has
+	 * already been checked to make sure it is the correct length.
+ */
+ len += le16_to_cpu(pdu->StructureSize2);
+
+ if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false)
+ goto calc_size_exit;
+
+ smb2_get_data_area_len(&offset, &data_length, hdr);
+ cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset);
+
+ if (data_length > 0) {
+ /*
+		 * Check to make sure that the data area begins after the fixed
+		 * area. Note that the last byte of the fixed area is part of the
+		 * data area for some commands, typically those with an odd
+		 * StructureSize, so we must add one to the calculation (and 4
+		 * to account for the size of the RFC1001 hdr).
+ */
+ if (offset + 4 + 1 < len) {
+ cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n",
+ offset + 4 + 1, len);
+ data_length = 0;
+ } else {
+ len = 4 + offset + data_length;
+ }
+ }
+calc_size_exit:
+ cifs_dbg(FYI, "SMB2 len %d\n", len);
+ return len;
+}
+
+/* Note: caller must free return buffer */
+__le16 *
+cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
+{
+ int len;
+ const char *start_of_path;
+ __le16 *to;
+
+ /* Windows doesn't allow paths beginning with \ */
+ if (from[0] == '\\')
+ start_of_path = from + 1;
+ else
+ start_of_path = from;
+ to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ return to;
+}
+
+__le32
+smb2_get_lease_state(struct cifsInodeInfo *cinode)
+{
+ __le32 lease = 0;
+
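+	/* translate cached inode state bits into SMB2 lease flag bits */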
+ if (CIFS_CACHE_WRITE(cinode))
+ lease |= SMB2_LEASE_WRITE_CACHING;
+ if (CIFS_CACHE_HANDLE(cinode))
+ lease |= SMB2_LEASE_HANDLE_CACHING;
+ if (CIFS_CACHE_READ(cinode))
+ lease |= SMB2_LEASE_READ_CACHING;
+ return lease;
+}
+
+struct smb2_lease_break_work {
+ struct work_struct lease_break;
+ struct tcon_link *tlink;
+ __u8 lease_key[16];
+ __le32 lease_state;
+};
+
+static void
+cifs_ses_oplock_break(struct work_struct *work)
+{
+ struct smb2_lease_break_work *lw = container_of(work,
+ struct smb2_lease_break_work, lease_break);
+ int rc;
+
+ rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key,
+ lw->lease_state);
+ cifs_dbg(FYI, "Lease release rc %d\n", rc);
+ cifs_put_tlink(lw->tlink);
+ kfree(lw);
+}
+
+static bool
+smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
+ struct smb2_lease_break_work *lw)
+{
+ bool found;
+ __u8 lease_state;
+ struct list_head *tmp;
+ struct cifsFileInfo *cfile;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ struct cifs_pending_open *open;
+ struct cifsInodeInfo *cinode;
+ int ack_req = le32_to_cpu(rsp->Flags &
+ SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
+
+ lease_state = le32_to_cpu(rsp->NewLeaseState);
+
+ list_for_each(tmp, &tcon->openFileList) {
+ cfile = list_entry(tmp, struct cifsFileInfo, tlist);
+ cinode = CIFS_I(cfile->dentry->d_inode);
+
+ if (memcmp(cinode->lease_key, rsp->LeaseKey,
+ SMB2_LEASE_KEY_SIZE))
+ continue;
+
+ cifs_dbg(FYI, "found in the open list\n");
+ cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+ le32_to_cpu(rsp->NewLeaseState));
+
+ server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
+
+ if (ack_req)
+ cfile->oplock_break_cancelled = false;
+ else
+ cfile->oplock_break_cancelled = true;
+
+ queue_work(cifsiod_wq, &cfile->oplock_break);
+ kfree(lw);
+ return true;
+ }
+
+ found = false;
+ list_for_each_entry(open, &tcon->pending_opens, olist) {
+ if (memcmp(open->lease_key, rsp->LeaseKey,
+ SMB2_LEASE_KEY_SIZE))
+ continue;
+
+ if (!found && ack_req) {
+ found = true;
+ memcpy(lw->lease_key, open->lease_key,
+ SMB2_LEASE_KEY_SIZE);
+ lw->tlink = cifs_get_tlink(open->tlink);
+ queue_work(cifsiod_wq, &lw->lease_break);
+ }
+
+ cifs_dbg(FYI, "found in the pending open list\n");
+ cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+ le32_to_cpu(rsp->NewLeaseState));
+
+ open->oplock = lease_state;
+ }
+ return found;
+}
+
+static bool
+smb2_is_valid_lease_break(char *buffer)
+{
+ struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
+ struct list_head *tmp, *tmp1, *tmp2;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct smb2_lease_break_work *lw;
+
+ lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
+ if (!lw)
+ return false;
+
+ INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
+ lw->lease_state = rsp->NewLeaseState;
+
+ cifs_dbg(FYI, "Checking for lease break\n");
+
+ /* look up tcon based on tid & uid */
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each(tmp, &cifs_tcp_ses_list) {
+ server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list);
+
+ list_for_each(tmp1, &server->smb_ses_list) {
+ ses = list_entry(tmp1, struct cifs_ses, smb_ses_list);
+
+ spin_lock(&cifs_file_list_lock);
+ list_for_each(tmp2, &ses->tcon_list) {
+ tcon = list_entry(tmp2, struct cifs_tcon,
+ tcon_list);
+ cifs_stats_inc(
+ &tcon->stats.cifs_stats.num_oplock_brks);
+ if (smb2_tcon_has_lease(tcon, rsp, lw)) {
+ spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return true;
+ }
+ }
+ spin_unlock(&cifs_file_list_lock);
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+ kfree(lw);
+ cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
+ return false;
+}
+
+bool
+smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
+{
+ struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
+ struct list_head *tmp, *tmp1, *tmp2;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct cifsInodeInfo *cinode;
+ struct cifsFileInfo *cfile;
+
+ cifs_dbg(FYI, "Checking for oplock break\n");
+
+ if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
+ return false;
+
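+	/* a 44-byte StructureSize means this is a lease break, not an oplock break */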
+ if (rsp->StructureSize !=
+ smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
+ if (le16_to_cpu(rsp->StructureSize) == 44)
+ return smb2_is_valid_lease_break(buffer);
+ else
+ return false;
+ }
+
+ cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel);
+
+ /* look up tcon based on tid & uid */
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each(tmp, &server->smb_ses_list) {
+ ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+ list_for_each(tmp1, &ses->tcon_list) {
+ tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
+ spin_lock(&cifs_file_list_lock);
+ list_for_each(tmp2, &tcon->openFileList) {
+ cfile = list_entry(tmp2, struct cifsFileInfo,
+ tlist);
+ if (rsp->PersistentFid !=
+ cfile->fid.persistent_fid ||
+ rsp->VolatileFid !=
+ cfile->fid.volatile_fid)
+ continue;
+
+ cifs_dbg(FYI, "file id match, oplock break\n");
+ cinode = CIFS_I(cfile->dentry->d_inode);
+
+ if (!CIFS_CACHE_WRITE(cinode) &&
+ rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE)
+ cfile->oplock_break_cancelled = true;
+ else
+ cfile->oplock_break_cancelled = false;
+
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+ &cinode->flags);
+
+ /*
+				 * Set the flag if the server downgrades the
+				 * oplock to level II, otherwise clear it.
+ */
+ if (rsp->OplockLevel)
+ set_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+ else
+ clear_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+
+ queue_work(cifsiod_wq, &cfile->oplock_break);
+
+ spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return true;
+ }
+ spin_unlock(&cifs_file_list_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ cifs_dbg(FYI, "No matching file for oplock break\n");
+ return true;
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+ cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
+ return false;
+}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
new file mode 100644
index 00000000000..787844bde38
--- /dev/null
+++ b/fs/cifs/smb2ops.c
@@ -0,0 +1,1413 @@
+/*
+ * SMB2 version specific operations
+ *
+ * Copyright (c) 2012, Jeff Layton <jlayton@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2 as published
+ * by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/pagemap.h>
+#include <linux/vfs.h>
+#include "cifsglob.h"
+#include "smb2pdu.h"
+#include "smb2proto.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifs_unicode.h"
+#include "smb2status.h"
+#include "smb2glob.h"
+
+static int
+change_conf(struct TCP_Server_Info *server)
+{
+ server->credits += server->echo_credits + server->oplock_credits;
+ server->oplock_credits = server->echo_credits = 0;
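+	/* pool all credits here, then re-reserve echo and oplock credits from the total */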
+ switch (server->credits) {
+ case 0:
+ return -1;
+ case 1:
+ server->echoes = false;
+ server->oplocks = false;
+ cifs_dbg(VFS, "disabling echoes and oplocks\n");
+ break;
+ case 2:
+ server->echoes = true;
+ server->oplocks = false;
+ server->echo_credits = 1;
+ cifs_dbg(FYI, "disabling oplocks\n");
+ break;
+ default:
+ server->echoes = true;
+ server->oplocks = true;
+ server->echo_credits = 1;
+ server->oplock_credits = 1;
+ }
+ server->credits -= server->echo_credits + server->oplock_credits;
+ return 0;
+}
+
+static void
+smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
+ const int optype)
+{
+ int *val, rc = 0;
+ spin_lock(&server->req_lock);
+ val = server->ops->get_credits_field(server, optype);
+ *val += add;
+ server->in_flight--;
+ if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP)
+ rc = change_conf(server);
+ /*
+ * Sometimes server returns 0 credits on oplock break ack - we need to
+ * rebalance credits in this case.
+ */
+ else if (server->in_flight > 0 && server->oplock_credits == 0 &&
+ server->oplocks) {
+ if (server->credits > 1) {
+ server->credits--;
+ server->oplock_credits++;
+ }
+ }
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ if (rc)
+ cifs_reconnect(server);
+}
+
+static void
+smb2_set_credits(struct TCP_Server_Info *server, const int val)
+{
+ spin_lock(&server->req_lock);
+ server->credits = val;
+ spin_unlock(&server->req_lock);
+}
+
+static int *
+smb2_get_credits_field(struct TCP_Server_Info *server, const int optype)
+{
+ switch (optype) {
+ case CIFS_ECHO_OP:
+ return &server->echo_credits;
+ case CIFS_OBREAK_OP:
+ return &server->oplock_credits;
+ default:
+ return &server->credits;
+ }
+}
+
+static unsigned int
+smb2_get_credits(struct mid_q_entry *mid)
+{
+ return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest);
+}
+
+static __u64
+smb2_get_next_mid(struct TCP_Server_Info *server)
+{
+ __u64 mid;
+ /* for SMB2 we need the current value */
+ spin_lock(&GlobalMid_Lock);
+ mid = server->CurrentMid++;
+ spin_unlock(&GlobalMid_Lock);
+ return mid;
+}
+
+static struct mid_q_entry *
+smb2_find_mid(struct TCP_Server_Info *server, char *buf)
+{
+ struct mid_q_entry *mid;
+ struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+
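+	/* match on message id, request state and command under the global mid lock */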
+ spin_lock(&GlobalMid_Lock);
+ list_for_each_entry(mid, &server->pending_mid_q, qhead) {
+ if ((mid->mid == hdr->MessageId) &&
+ (mid->mid_state == MID_REQUEST_SUBMITTED) &&
+ (mid->command == hdr->Command)) {
+ spin_unlock(&GlobalMid_Lock);
+ return mid;
+ }
+ }
+ spin_unlock(&GlobalMid_Lock);
+ return NULL;
+}
+
+static void
+smb2_dump_detail(void *buf)
+{
+#ifdef CONFIG_CIFS_DEBUG2
+ struct smb2_hdr *smb = (struct smb2_hdr *)buf;
+
+ cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
+ smb->Command, smb->Status, smb->Flags, smb->MessageId,
+ smb->ProcessId);
+ cifs_dbg(VFS, "smb buf %p len %u\n", smb, smb2_calc_size(smb));
+#endif
+}
+
+static bool
+smb2_need_neg(struct TCP_Server_Info *server)
+{
+ return server->max_read == 0;
+}
+
+static int
+smb2_negotiate(const unsigned int xid, struct cifs_ses *ses)
+{
+ int rc;
+ ses->server->CurrentMid = 0;
+ rc = SMB2_negotiate(xid, ses);
+ /* BB we probably don't need to retry with modern servers */
+ if (rc == -EAGAIN)
+ rc = -EHOSTDOWN;
+ return rc;
+}
+
+static unsigned int
+smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int wsize;
+
+ /* start with specified wsize, or default */
+ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
+ wsize = min_t(unsigned int, wsize, server->max_write);
+ /* set it to the maximum buffer size value we can send with 1 credit */
+ wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
+
+ return wsize;
+}
+
+static unsigned int
+smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int rsize;
+
+ /* start with specified rsize, or default */
+ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
+ rsize = min_t(unsigned int, rsize, server->max_read);
+ /* set it to the maximum buffer size value we can send with 1 credit */
+ rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
+
+ return rsize;
+}
+
+#ifdef CONFIG_CIFS_STATS2
+static int
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ int rc;
+ unsigned int ret_data_len = 0;
+ struct network_interface_info_ioctl_rsp *out_buf;
+
+ rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+ FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
+ NULL /* no data input */, 0 /* no data input */,
+ (char **)&out_buf, &ret_data_len);
+
+ if ((rc == 0) && (ret_data_len > 0)) {
+ /* Dump info on first interface */
+ cifs_dbg(FYI, "Adapter Capability 0x%x\t",
+ le32_to_cpu(out_buf->Capability));
+ cifs_dbg(FYI, "Link Speed %lld\n",
+ le64_to_cpu(out_buf->LinkSpeed));
+ } else
+ cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
+
+ return rc;
+}
+#endif /* STATS2 */
+
+static void
+smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ int rc;
+ __le16 srch_path = 0; /* Null - open root of share */
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+ if (rc)
+ return;
+
+#ifdef CONFIG_CIFS_STATS2
+ SMB3_request_interfaces(xid, tcon);
+#endif /* STATS2 */
+
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_ATTRIBUTE_INFORMATION);
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_DEVICE_INFORMATION);
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ return;
+}
+
+static void
+smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ int rc;
+ __le16 srch_path = 0; /* Null - open root of share */
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+ if (rc)
+ return;
+
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_ATTRIBUTE_INFORMATION);
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_DEVICE_INFORMATION);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ return;
+}
+
+static int
+smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path)
+{
+ int rc;
+ __le16 *utf16_path;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+
+ utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ if (rc) {
+ kfree(utf16_path);
+ return rc;
+ }
+
+ rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ kfree(utf16_path);
+ return rc;
+}
+
+static int
+smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ u64 *uniqueid, FILE_ALL_INFO *data)
+{
+ *uniqueid = le64_to_cpu(data->IndexNumber);
+ return 0;
+}
+
+static int
+smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid, FILE_ALL_INFO *data)
+{
+ int rc;
+ struct smb2_file_all_info *smb2_data;
+
+ smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ GFP_KERNEL);
+ if (smb2_data == NULL)
+ return -ENOMEM;
+
+ rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid,
+ smb2_data);
+ if (!rc)
+ move_smb2_info_to_cifs(data, smb2_data);
+ kfree(smb2_data);
+ return rc;
+}
+
+static bool
+smb2_can_echo(struct TCP_Server_Info *server)
+{
+ return server->echoes;
+}
+
+static void
+smb2_clear_stats(struct cifs_tcon *tcon)
+{
+#ifdef CONFIG_CIFS_STATS
+ int i;
+ for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) {
+ atomic_set(&tcon->stats.smb2_stats.smb2_com_sent[i], 0);
+ atomic_set(&tcon->stats.smb2_stats.smb2_com_failed[i], 0);
+ }
+#endif
+}
+
+static void
+smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
+{
+ seq_puts(m, "\n\tShare Capabilities:");
+ if (tcon->capabilities & SMB2_SHARE_CAP_DFS)
+ seq_puts(m, " DFS,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+ seq_puts(m, " CONTINUOUS AVAILABILITY,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT)
+ seq_puts(m, " SCALEOUT,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER)
+ seq_puts(m, " CLUSTER,");
+ if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC)
+ seq_puts(m, " ASYMMETRIC,");
+ if (tcon->capabilities == 0)
+ seq_puts(m, " None");
+ if (tcon->ss_flags & SSINFO_FLAGS_ALIGNED_DEVICE)
+ seq_puts(m, " Aligned,");
+ if (tcon->ss_flags & SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE)
+ seq_puts(m, " Partition Aligned,");
+ if (tcon->ss_flags & SSINFO_FLAGS_NO_SEEK_PENALTY)
+ seq_puts(m, " SSD,");
+ if (tcon->ss_flags & SSINFO_FLAGS_TRIM_ENABLED)
+ seq_puts(m, " TRIM-support,");
+
+ seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+ if (tcon->perf_sector_size)
+ seq_printf(m, "\tOptimal sector size: 0x%x",
+ tcon->perf_sector_size);
+}
+
+static void
+smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
+{
+#ifdef CONFIG_CIFS_STATS
+ atomic_t *sent = tcon->stats.smb2_stats.smb2_com_sent;
+ atomic_t *failed = tcon->stats.smb2_stats.smb2_com_failed;
+ seq_printf(m, "\nNegotiates: %d sent %d failed",
+ atomic_read(&sent[SMB2_NEGOTIATE_HE]),
+ atomic_read(&failed[SMB2_NEGOTIATE_HE]));
+ seq_printf(m, "\nSessionSetups: %d sent %d failed",
+ atomic_read(&sent[SMB2_SESSION_SETUP_HE]),
+ atomic_read(&failed[SMB2_SESSION_SETUP_HE]));
+ seq_printf(m, "\nLogoffs: %d sent %d failed",
+ atomic_read(&sent[SMB2_LOGOFF_HE]),
+ atomic_read(&failed[SMB2_LOGOFF_HE]));
+ seq_printf(m, "\nTreeConnects: %d sent %d failed",
+ atomic_read(&sent[SMB2_TREE_CONNECT_HE]),
+ atomic_read(&failed[SMB2_TREE_CONNECT_HE]));
+ seq_printf(m, "\nTreeDisconnects: %d sent %d failed",
+ atomic_read(&sent[SMB2_TREE_DISCONNECT_HE]),
+ atomic_read(&failed[SMB2_TREE_DISCONNECT_HE]));
+ seq_printf(m, "\nCreates: %d sent %d failed",
+ atomic_read(&sent[SMB2_CREATE_HE]),
+ atomic_read(&failed[SMB2_CREATE_HE]));
+ seq_printf(m, "\nCloses: %d sent %d failed",
+ atomic_read(&sent[SMB2_CLOSE_HE]),
+ atomic_read(&failed[SMB2_CLOSE_HE]));
+ seq_printf(m, "\nFlushes: %d sent %d failed",
+ atomic_read(&sent[SMB2_FLUSH_HE]),
+ atomic_read(&failed[SMB2_FLUSH_HE]));
+ seq_printf(m, "\nReads: %d sent %d failed",
+ atomic_read(&sent[SMB2_READ_HE]),
+ atomic_read(&failed[SMB2_READ_HE]));
+ seq_printf(m, "\nWrites: %d sent %d failed",
+ atomic_read(&sent[SMB2_WRITE_HE]),
+ atomic_read(&failed[SMB2_WRITE_HE]));
+ seq_printf(m, "\nLocks: %d sent %d failed",
+ atomic_read(&sent[SMB2_LOCK_HE]),
+ atomic_read(&failed[SMB2_LOCK_HE]));
+ seq_printf(m, "\nIOCTLs: %d sent %d failed",
+ atomic_read(&sent[SMB2_IOCTL_HE]),
+ atomic_read(&failed[SMB2_IOCTL_HE]));
+ seq_printf(m, "\nCancels: %d sent %d failed",
+ atomic_read(&sent[SMB2_CANCEL_HE]),
+ atomic_read(&failed[SMB2_CANCEL_HE]));
+ seq_printf(m, "\nEchos: %d sent %d failed",
+ atomic_read(&sent[SMB2_ECHO_HE]),
+ atomic_read(&failed[SMB2_ECHO_HE]));
+ seq_printf(m, "\nQueryDirectories: %d sent %d failed",
+ atomic_read(&sent[SMB2_QUERY_DIRECTORY_HE]),
+ atomic_read(&failed[SMB2_QUERY_DIRECTORY_HE]));
+ seq_printf(m, "\nChangeNotifies: %d sent %d failed",
+ atomic_read(&sent[SMB2_CHANGE_NOTIFY_HE]),
+ atomic_read(&failed[SMB2_CHANGE_NOTIFY_HE]));
+ seq_printf(m, "\nQueryInfos: %d sent %d failed",
+ atomic_read(&sent[SMB2_QUERY_INFO_HE]),
+ atomic_read(&failed[SMB2_QUERY_INFO_HE]));
+ seq_printf(m, "\nSetInfos: %d sent %d failed",
+ atomic_read(&sent[SMB2_SET_INFO_HE]),
+ atomic_read(&failed[SMB2_SET_INFO_HE]));
+ seq_printf(m, "\nOplockBreaks: %d sent %d failed",
+ atomic_read(&sent[SMB2_OPLOCK_BREAK_HE]),
+ atomic_read(&failed[SMB2_OPLOCK_BREAK_HE]));
+#endif
+}
+
+static void
+smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
+{
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+
+ cfile->fid.persistent_fid = fid->persistent_fid;
+ cfile->fid.volatile_fid = fid->volatile_fid;
+ server->ops->set_oplock_level(cinode, oplock, fid->epoch,
+ &fid->purge_cache);
+ cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
+}
+
+static void
+smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid)
+{
+ SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+}
+
+static int
+SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ struct copychunk_ioctl *pcchunk)
+{
+ int rc;
+ unsigned int ret_data_len;
+ struct resume_key_req *res_key;
+
+ rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
+ FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
+ NULL, 0 /* no input */,
+ (char **)&res_key, &ret_data_len);
+
+ if (rc) {
+ cifs_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
+ goto req_res_key_exit;
+ }
+ if (ret_data_len < sizeof(struct resume_key_req)) {
+ cifs_dbg(VFS, "Invalid refcopy resume key length\n");
+ rc = -EINVAL;
+ goto req_res_key_exit;
+ }
+ memcpy(pcchunk->SourceKey, res_key->ResumeKey, COPY_CHUNK_RES_KEY_SIZE);
+
+req_res_key_exit:
+ kfree(res_key);
+ return rc;
+}
+
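+/*
+ * Server side copy: get a resume key identifying the source file, then send
+ * FSCTL_SRV_COPYCHUNK_WRITE requests against the target, copying at most
+ * tcon->max_bytes_chunk per chunk. If the server rejects our sizes with
+ * -EINVAL on the first attempt, adopt the chunk size it reports and retry.
+ */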
+static int
+smb2_clone_range(const unsigned int xid,
+ struct cifsFileInfo *srcfile,
+ struct cifsFileInfo *trgtfile, u64 src_off,
+ u64 len, u64 dest_off)
+{
+ int rc;
+ unsigned int ret_data_len;
+ struct copychunk_ioctl *pcchunk;
+ struct copychunk_ioctl_rsp *retbuf = NULL;
+ struct cifs_tcon *tcon;
+ int chunks_copied = 0;
+ bool chunk_sizes_updated = false;
+
+ pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
+
+ if (pcchunk == NULL)
+ return -ENOMEM;
+
+ cifs_dbg(FYI, "in smb2_clone_range - about to call request res key\n");
+ /* Request a key from the server to identify the source of the copy */
+ rc = SMB2_request_res_key(xid, tlink_tcon(srcfile->tlink),
+ srcfile->fid.persistent_fid,
+ srcfile->fid.volatile_fid, pcchunk);
+
+	/* Note: request_res_key sets res_key null only if rc != 0 */
+ if (rc)
+ goto cchunk_out;
+
+	/* For now the array is only one chunk long; make more flexible later */
+ pcchunk->ChunkCount = __constant_cpu_to_le32(1);
+ pcchunk->Reserved = 0;
+ pcchunk->Reserved2 = 0;
+
+ tcon = tlink_tcon(trgtfile->tlink);
+
+ while (len > 0) {
+ pcchunk->SourceOffset = cpu_to_le64(src_off);
+ pcchunk->TargetOffset = cpu_to_le64(dest_off);
+ pcchunk->Length =
+ cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
+
+ /* Request server copy to target from src identified by key */
+ rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
+ trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
+ true /* is_fsctl */, (char *)pcchunk,
+ sizeof(struct copychunk_ioctl), (char **)&retbuf,
+ &ret_data_len);
+ if (rc == 0) {
+ if (ret_data_len !=
+ sizeof(struct copychunk_ioctl_rsp)) {
+ cifs_dbg(VFS, "invalid cchunk response size\n");
+ rc = -EIO;
+ goto cchunk_out;
+ }
+ if (retbuf->TotalBytesWritten == 0) {
+ cifs_dbg(FYI, "no bytes copied\n");
+ rc = -EIO;
+ goto cchunk_out;
+ }
+ /*
+ * Check if server claimed to write more than we asked
+ */
+ if (le32_to_cpu(retbuf->TotalBytesWritten) >
+ le32_to_cpu(pcchunk->Length)) {
+ cifs_dbg(VFS, "invalid copy chunk response\n");
+ rc = -EIO;
+ goto cchunk_out;
+ }
+ if (le32_to_cpu(retbuf->ChunksWritten) != 1) {
+ cifs_dbg(VFS, "invalid num chunks written\n");
+ rc = -EIO;
+ goto cchunk_out;
+ }
+ chunks_copied++;
+
+ src_off += le32_to_cpu(retbuf->TotalBytesWritten);
+ dest_off += le32_to_cpu(retbuf->TotalBytesWritten);
+ len -= le32_to_cpu(retbuf->TotalBytesWritten);
+
+ cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n",
+ le32_to_cpu(retbuf->ChunksWritten),
+ le32_to_cpu(retbuf->ChunkBytesWritten),
+ le32_to_cpu(retbuf->TotalBytesWritten));
+ } else if (rc == -EINVAL) {
+ if (ret_data_len != sizeof(struct copychunk_ioctl_rsp))
+ goto cchunk_out;
+
+ cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n",
+ le32_to_cpu(retbuf->ChunksWritten),
+ le32_to_cpu(retbuf->ChunkBytesWritten),
+ le32_to_cpu(retbuf->TotalBytesWritten));
+
+			/*
+			 * Check if this is the first request using these
+			 * sizes, i.e. check whether the copy succeeded once
+			 * with the original sizes and whether the server gave
+			 * us different sizes after we already updated the max
+			 * sizes on a previous request. If not, why is the
+			 * server returning an error now?
+			 */
+ if ((chunks_copied != 0) || chunk_sizes_updated)
+ goto cchunk_out;
+
+ /* Check that server is not asking us to grow size */
+ if (le32_to_cpu(retbuf->ChunkBytesWritten) <
+ tcon->max_bytes_chunk)
+ tcon->max_bytes_chunk =
+ le32_to_cpu(retbuf->ChunkBytesWritten);
+ else
+ goto cchunk_out; /* server gave us bogus size */
+
+ /* No need to change MaxChunks since already set to 1 */
+ chunk_sizes_updated = true;
+ }
+ }
+
+cchunk_out:
+ kfree(pcchunk);
+ return rc;
+}
+
+static int
+smb2_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid)
+{
+ return SMB2_flush(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+}
+
+static unsigned int
+smb2_read_data_offset(char *buf)
+{
+ struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
+ return rsp->DataOffset;
+}
+
+static unsigned int
+smb2_read_data_length(char *buf)
+{
+ struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
+ return le32_to_cpu(rsp->DataLength);
+}
+
+
+static int
+smb2_sync_read(const unsigned int xid, struct cifsFileInfo *cfile,
+ struct cifs_io_parms *parms, unsigned int *bytes_read,
+ char **buf, int *buf_type)
+{
+ parms->persistent_fid = cfile->fid.persistent_fid;
+ parms->volatile_fid = cfile->fid.volatile_fid;
+ return SMB2_read(xid, parms, bytes_read, buf, buf_type);
+}
+
+static int
+smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile,
+ struct cifs_io_parms *parms, unsigned int *written,
+ struct kvec *iov, unsigned long nr_segs)
+{
+
+ parms->persistent_fid = cfile->fid.persistent_fid;
+ parms->volatile_fid = cfile->fid.volatile_fid;
+ return SMB2_write(xid, parms, written, iov, nr_segs);
+}
+
+static int
+smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, __u64 size, bool set_alloc)
+{
+ __le64 eof = cpu_to_le64(size);
+ return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, cfile->pid, &eof);
+}
+
+static int
+smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile)
+{
+ return SMB2_set_compression(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid);
+}
+
+static int
+smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *path, struct cifs_sb_info *cifs_sb,
+ struct cifs_fid *fid, __u16 search_flags,
+ struct cifs_search_info *srch_inf)
+{
+ __le16 *utf16_path;
+ int rc;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ kfree(utf16_path);
+ if (rc) {
+ cifs_dbg(VFS, "open dir failed\n");
+ return rc;
+ }
+
+ srch_inf->entries_in_buffer = 0;
+ srch_inf->index_of_last_entry = 0;
+
+ rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
+ fid->volatile_fid, 0, srch_inf);
+ if (rc) {
+ cifs_dbg(VFS, "query directory failed\n");
+ SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+ }
+ return rc;
+}
+
+static int
+smb2_query_dir_next(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid, __u16 search_flags,
+ struct cifs_search_info *srch_inf)
+{
+ return SMB2_query_directory(xid, tcon, fid->persistent_fid,
+ fid->volatile_fid, 0, srch_inf);
+}
+
+static int
+smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *fid)
+{
+ return SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+}
+
+/*
+ * If we negotiated the SMB2 protocol and got STATUS_PENDING, update the
+ * number of credits and return true. Otherwise return false.
+ */
+static bool
+smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
+{
+ struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+
+ if (hdr->Status != STATUS_PENDING)
+ return false;
+
+ if (!length) {
+ spin_lock(&server->req_lock);
+ server->credits += le16_to_cpu(hdr->CreditRequest);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ }
+
+ return true;
+}
+
+static int
+smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
+ struct cifsInodeInfo *cinode)
+{
+ if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
+ return SMB2_lease_break(0, tcon, cinode->lease_key,
+ smb2_get_lease_state(cinode));
+
+ return SMB2_oplock_break(0, tcon, fid->persistent_fid,
+ fid->volatile_fid,
+ CIFS_CACHE_READ(cinode) ? 1 : 0);
+}
+
+static int
+smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
+ struct kstatfs *buf)
+{
+ int rc;
+ __le16 srch_path = 0; /* Null - open root of share */
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+ if (rc)
+ return rc;
+ buf->f_type = SMB2_MAGIC_NUMBER;
+ rc = SMB2_QFS_info(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ buf);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ return rc;
+}
+
+static bool
+smb2_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2)
+{
+ return ob1->fid.persistent_fid == ob2->fid.persistent_fid &&
+ ob1->fid.volatile_fid == ob2->fid.volatile_fid;
+}
+
+static int
+smb2_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
+ __u64 length, __u32 type, int lock, int unlock, bool wait)
+{
+ if (unlock && !lock)
+ type = SMB2_LOCKFLAG_UNLOCK;
+ return SMB2_lock(xid, tlink_tcon(cfile->tlink),
+ cfile->fid.persistent_fid, cfile->fid.volatile_fid,
+ current->tgid, length, offset, type, wait);
+}
+
+static void
+smb2_get_lease_key(struct inode *inode, struct cifs_fid *fid)
+{
+ memcpy(fid->lease_key, CIFS_I(inode)->lease_key, SMB2_LEASE_KEY_SIZE);
+}
+
+static void
+smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid)
+{
+ memcpy(CIFS_I(inode)->lease_key, fid->lease_key, SMB2_LEASE_KEY_SIZE);
+}
+
+static void
+smb2_new_lease_key(struct cifs_fid *fid)
+{
+ get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE);
+}
+
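+/*
+ * SMB2 has no readlink call: a symlink is detected by the open failing with a
+ * symlink error response, from which the substitute name (the target path)
+ * is parsed.
+ */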
+static int
+smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *full_path, char **target_path,
+ struct cifs_sb_info *cifs_sb)
+{
+ int rc;
+ __le16 *utf16_path;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+ struct smb2_err_rsp *err_buf = NULL;
+ struct smb2_symlink_err_rsp *symlink;
+ unsigned int sub_len, sub_offset;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+
+ utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf);
+
+ if (!rc || !err_buf) {
+ kfree(utf16_path);
+ return -ENOENT;
+ }
+ /* open must fail on symlink - reset rc */
+ rc = 0;
+ symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
+ sub_len = le16_to_cpu(symlink->SubstituteNameLength);
+ sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
+ *target_path = cifs_strndup_from_utf16(
+ (char *)symlink->PathBuffer + sub_offset,
+ sub_len, true, cifs_sb->local_nls);
+ if (!(*target_path)) {
+ kfree(utf16_path);
+ return -ENOMEM;
+ }
+ convert_delimiter(*target_path, '/');
+ cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+ kfree(utf16_path);
+ return rc;
+}
+
+static void
+smb2_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ if (set_level2)
+ server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II,
+ 0, NULL);
+ else
+ server->ops->set_oplock_level(cinode, 0, 0, NULL);
+}
+
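+/*
+ * SMB2.02: map the granted oplock level (batch / exclusive / level II) to
+ * the corresponding cifs inode caching flags.
+ */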
+static void
+smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache)
+{
+ oplock &= 0xFF;
+ if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
+ return;
+ if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {
+ cinode->oplock = CIFS_CACHE_RHW_FLG;
+ cifs_dbg(FYI, "Batch Oplock granted on inode %p\n",
+ &cinode->vfs_inode);
+ } else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+ cinode->oplock = CIFS_CACHE_RW_FLG;
+ cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
+ &cinode->vfs_inode);
+ } else if (oplock == SMB2_OPLOCK_LEVEL_II) {
+ cinode->oplock = CIFS_CACHE_READ_FLG;
+ cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
+ &cinode->vfs_inode);
+ } else
+ cinode->oplock = 0;
+}
+
+static void
+smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache)
+{
+ char message[5] = {0};
+
+ oplock &= 0xFF;
+ if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
+ return;
+
+ cinode->oplock = 0;
+ if (oplock & SMB2_LEASE_READ_CACHING_HE) {
+ cinode->oplock |= CIFS_CACHE_READ_FLG;
+ strcat(message, "R");
+ }
+ if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) {
+ cinode->oplock |= CIFS_CACHE_HANDLE_FLG;
+ strcat(message, "H");
+ }
+ if (oplock & SMB2_LEASE_WRITE_CACHING_HE) {
+ cinode->oplock |= CIFS_CACHE_WRITE_FLG;
+ strcat(message, "W");
+ }
+ if (!cinode->oplock)
+ strcat(message, "None");
+ cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
+ &cinode->vfs_inode);
+}
+
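+/*
+ * SMB3: as for SMB2.1 leases, but also compare the new lease state and epoch
+ * against the previous ones to decide whether cached data must be purged.
+ */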
+static void
+smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache)
+{
+ unsigned int old_oplock = cinode->oplock;
+
+ smb21_set_oplock_level(cinode, oplock, epoch, purge_cache);
+
+ if (purge_cache) {
+ *purge_cache = false;
+ if (old_oplock == CIFS_CACHE_READ_FLG) {
+ if (cinode->oplock == CIFS_CACHE_READ_FLG &&
+ (epoch - cinode->epoch > 0))
+ *purge_cache = true;
+ else if (cinode->oplock == CIFS_CACHE_RH_FLG &&
+ (epoch - cinode->epoch > 1))
+ *purge_cache = true;
+ else if (cinode->oplock == CIFS_CACHE_RHW_FLG &&
+ (epoch - cinode->epoch > 1))
+ *purge_cache = true;
+ else if (cinode->oplock == 0 &&
+ (epoch - cinode->epoch > 0))
+ *purge_cache = true;
+ } else if (old_oplock == CIFS_CACHE_RH_FLG) {
+ if (cinode->oplock == CIFS_CACHE_RH_FLG &&
+ (epoch - cinode->epoch > 0))
+ *purge_cache = true;
+ else if (cinode->oplock == CIFS_CACHE_RHW_FLG &&
+ (epoch - cinode->epoch > 1))
+ *purge_cache = true;
+ }
+ cinode->epoch = epoch;
+ }
+}
+
+static bool
+smb2_is_read_op(__u32 oplock)
+{
+ return oplock == SMB2_OPLOCK_LEVEL_II;
+}
+
+static bool
+smb21_is_read_op(__u32 oplock)
+{
+ return (oplock & SMB2_LEASE_READ_CACHING_HE) &&
+ !(oplock & SMB2_LEASE_WRITE_CACHING_HE);
+}
+
+static __le32
+map_oplock_to_lease(u8 oplock)
+{
+ if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
+ return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING;
+ else if (oplock == SMB2_OPLOCK_LEVEL_II)
+ return SMB2_LEASE_READ_CACHING;
+ else if (oplock == SMB2_OPLOCK_LEVEL_BATCH)
+ return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING |
+ SMB2_LEASE_WRITE_CACHING;
+ return 0;
+}
+
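+/*
+ * Build the "RqLs" create context sent on open to request a lease: copy in
+ * the 16 byte lease key and the lease state mapped from the oplock level.
+ */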
+static char *
+smb2_create_lease_buf(u8 *lease_key, u8 oplock)
+{
+ struct create_lease *buf;
+
+ buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
+ buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
+ buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_lease, lcontext));
+ buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context));
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_lease, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */
+ buf->Name[0] = 'R';
+ buf->Name[1] = 'q';
+ buf->Name[2] = 'L';
+ buf->Name[3] = 's';
+ return (char *)buf;
+}
+
+static char *
+smb3_create_lease_buf(u8 *lease_key, u8 oplock)
+{
+ struct create_lease_v2 *buf;
+
+ buf = kzalloc(sizeof(struct create_lease_v2), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
+ buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
+ buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_lease_v2, lcontext));
+ buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2));
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_lease_v2, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */
+ buf->Name[0] = 'R';
+ buf->Name[1] = 'q';
+ buf->Name[2] = 'L';
+ buf->Name[3] = 's';
+ return (char *)buf;
+}
+
+static __u8
+smb2_parse_lease_buf(void *buf, unsigned int *epoch)
+{
+ struct create_lease *lc = (struct create_lease *)buf;
+
+ *epoch = 0; /* not used */
+ if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+ return SMB2_OPLOCK_LEVEL_NOCHANGE;
+ return le32_to_cpu(lc->lcontext.LeaseState);
+}
+
+static __u8
+smb3_parse_lease_buf(void *buf, unsigned int *epoch)
+{
+ struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
+
+ *epoch = le16_to_cpu(lc->lcontext.Epoch);
+ if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+ return SMB2_OPLOCK_LEVEL_NOCHANGE;
+ return le32_to_cpu(lc->lcontext.LeaseState);
+}
+
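+/*
+ * Dispatch tables wiring the protocol independent smb_version_operations
+ * hooks to the SMB2.0, SMB2.1 and SMB3.0 implementations above. The dialects
+ * differ mainly in oplock/lease handling, signing (SMB3 uses a derived
+ * signing key and a different signature routine) and SMB3 only calls such as
+ * validate_negotiate.
+ */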
+struct smb_version_operations smb20_operations = {
+ .compare_fids = smb2_compare_fids,
+ .setup_request = smb2_setup_request,
+ .setup_async_request = smb2_setup_async_request,
+ .check_receive = smb2_check_receive,
+ .add_credits = smb2_add_credits,
+ .set_credits = smb2_set_credits,
+ .get_credits_field = smb2_get_credits_field,
+ .get_credits = smb2_get_credits,
+ .get_next_mid = smb2_get_next_mid,
+ .read_data_offset = smb2_read_data_offset,
+ .read_data_length = smb2_read_data_length,
+ .map_error = map_smb2_to_linux_error,
+ .find_mid = smb2_find_mid,
+ .check_message = smb2_check_message,
+ .dump_detail = smb2_dump_detail,
+ .clear_stats = smb2_clear_stats,
+ .print_stats = smb2_print_stats,
+ .is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
+ .need_neg = smb2_need_neg,
+ .negotiate = smb2_negotiate,
+ .negotiate_wsize = smb2_negotiate_wsize,
+ .negotiate_rsize = smb2_negotiate_rsize,
+ .sess_setup = SMB2_sess_setup,
+ .logoff = SMB2_logoff,
+ .tree_connect = SMB2_tcon,
+ .tree_disconnect = SMB2_tdis,
+ .qfs_tcon = smb2_qfs_tcon,
+ .is_path_accessible = smb2_is_path_accessible,
+ .can_echo = smb2_can_echo,
+ .echo = SMB2_echo,
+ .query_path_info = smb2_query_path_info,
+ .get_srv_inum = smb2_get_srv_inum,
+ .query_file_info = smb2_query_file_info,
+ .set_path_size = smb2_set_path_size,
+ .set_file_size = smb2_set_file_size,
+ .set_file_info = smb2_set_file_info,
+ .set_compression = smb2_set_compression,
+ .mkdir = smb2_mkdir,
+ .mkdir_setinfo = smb2_mkdir_setinfo,
+ .rmdir = smb2_rmdir,
+ .unlink = smb2_unlink,
+ .rename = smb2_rename_path,
+ .create_hardlink = smb2_create_hardlink,
+ .query_symlink = smb2_query_symlink,
+ .open = smb2_open_file,
+ .set_fid = smb2_set_fid,
+ .close = smb2_close_file,
+ .flush = smb2_flush_file,
+ .async_readv = smb2_async_readv,
+ .async_writev = smb2_async_writev,
+ .sync_read = smb2_sync_read,
+ .sync_write = smb2_sync_write,
+ .query_dir_first = smb2_query_dir_first,
+ .query_dir_next = smb2_query_dir_next,
+ .close_dir = smb2_close_dir,
+ .calc_smb_size = smb2_calc_size,
+ .is_status_pending = smb2_is_status_pending,
+ .oplock_response = smb2_oplock_response,
+ .queryfs = smb2_queryfs,
+ .mand_lock = smb2_mand_lock,
+ .mand_unlock_range = smb2_unlock_range,
+ .push_mand_locks = smb2_push_mandatory_locks,
+ .get_lease_key = smb2_get_lease_key,
+ .set_lease_key = smb2_set_lease_key,
+ .new_lease_key = smb2_new_lease_key,
+ .calc_signature = smb2_calc_signature,
+ .is_read_op = smb2_is_read_op,
+ .set_oplock_level = smb2_set_oplock_level,
+ .create_lease_buf = smb2_create_lease_buf,
+ .parse_lease_buf = smb2_parse_lease_buf,
+ .clone_range = smb2_clone_range,
+};
+
+struct smb_version_operations smb21_operations = {
+ .compare_fids = smb2_compare_fids,
+ .setup_request = smb2_setup_request,
+ .setup_async_request = smb2_setup_async_request,
+ .check_receive = smb2_check_receive,
+ .add_credits = smb2_add_credits,
+ .set_credits = smb2_set_credits,
+ .get_credits_field = smb2_get_credits_field,
+ .get_credits = smb2_get_credits,
+ .get_next_mid = smb2_get_next_mid,
+ .read_data_offset = smb2_read_data_offset,
+ .read_data_length = smb2_read_data_length,
+ .map_error = map_smb2_to_linux_error,
+ .find_mid = smb2_find_mid,
+ .check_message = smb2_check_message,
+ .dump_detail = smb2_dump_detail,
+ .clear_stats = smb2_clear_stats,
+ .print_stats = smb2_print_stats,
+ .is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
+ .need_neg = smb2_need_neg,
+ .negotiate = smb2_negotiate,
+ .negotiate_wsize = smb2_negotiate_wsize,
+ .negotiate_rsize = smb2_negotiate_rsize,
+ .sess_setup = SMB2_sess_setup,
+ .logoff = SMB2_logoff,
+ .tree_connect = SMB2_tcon,
+ .tree_disconnect = SMB2_tdis,
+ .qfs_tcon = smb2_qfs_tcon,
+ .is_path_accessible = smb2_is_path_accessible,
+ .can_echo = smb2_can_echo,
+ .echo = SMB2_echo,
+ .query_path_info = smb2_query_path_info,
+ .get_srv_inum = smb2_get_srv_inum,
+ .query_file_info = smb2_query_file_info,
+ .set_path_size = smb2_set_path_size,
+ .set_file_size = smb2_set_file_size,
+ .set_file_info = smb2_set_file_info,
+ .set_compression = smb2_set_compression,
+ .mkdir = smb2_mkdir,
+ .mkdir_setinfo = smb2_mkdir_setinfo,
+ .rmdir = smb2_rmdir,
+ .unlink = smb2_unlink,
+ .rename = smb2_rename_path,
+ .create_hardlink = smb2_create_hardlink,
+ .query_symlink = smb2_query_symlink,
+ .open = smb2_open_file,
+ .set_fid = smb2_set_fid,
+ .close = smb2_close_file,
+ .flush = smb2_flush_file,
+ .async_readv = smb2_async_readv,
+ .async_writev = smb2_async_writev,
+ .sync_read = smb2_sync_read,
+ .sync_write = smb2_sync_write,
+ .query_dir_first = smb2_query_dir_first,
+ .query_dir_next = smb2_query_dir_next,
+ .close_dir = smb2_close_dir,
+ .calc_smb_size = smb2_calc_size,
+ .is_status_pending = smb2_is_status_pending,
+ .oplock_response = smb2_oplock_response,
+ .queryfs = smb2_queryfs,
+ .mand_lock = smb2_mand_lock,
+ .mand_unlock_range = smb2_unlock_range,
+ .push_mand_locks = smb2_push_mandatory_locks,
+ .get_lease_key = smb2_get_lease_key,
+ .set_lease_key = smb2_set_lease_key,
+ .new_lease_key = smb2_new_lease_key,
+ .calc_signature = smb2_calc_signature,
+ .is_read_op = smb21_is_read_op,
+ .set_oplock_level = smb21_set_oplock_level,
+ .create_lease_buf = smb2_create_lease_buf,
+ .parse_lease_buf = smb2_parse_lease_buf,
+ .clone_range = smb2_clone_range,
+};
+
+struct smb_version_operations smb30_operations = {
+ .compare_fids = smb2_compare_fids,
+ .setup_request = smb2_setup_request,
+ .setup_async_request = smb2_setup_async_request,
+ .check_receive = smb2_check_receive,
+ .add_credits = smb2_add_credits,
+ .set_credits = smb2_set_credits,
+ .get_credits_field = smb2_get_credits_field,
+ .get_credits = smb2_get_credits,
+ .get_next_mid = smb2_get_next_mid,
+ .read_data_offset = smb2_read_data_offset,
+ .read_data_length = smb2_read_data_length,
+ .map_error = map_smb2_to_linux_error,
+ .find_mid = smb2_find_mid,
+ .check_message = smb2_check_message,
+ .dump_detail = smb2_dump_detail,
+ .clear_stats = smb2_clear_stats,
+ .print_stats = smb2_print_stats,
+ .dump_share_caps = smb2_dump_share_caps,
+ .is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
+ .need_neg = smb2_need_neg,
+ .negotiate = smb2_negotiate,
+ .negotiate_wsize = smb2_negotiate_wsize,
+ .negotiate_rsize = smb2_negotiate_rsize,
+ .sess_setup = SMB2_sess_setup,
+ .logoff = SMB2_logoff,
+ .tree_connect = SMB2_tcon,
+ .tree_disconnect = SMB2_tdis,
+ .qfs_tcon = smb3_qfs_tcon,
+ .is_path_accessible = smb2_is_path_accessible,
+ .can_echo = smb2_can_echo,
+ .echo = SMB2_echo,
+ .query_path_info = smb2_query_path_info,
+ .get_srv_inum = smb2_get_srv_inum,
+ .query_file_info = smb2_query_file_info,
+ .set_path_size = smb2_set_path_size,
+ .set_file_size = smb2_set_file_size,
+ .set_file_info = smb2_set_file_info,
+ .set_compression = smb2_set_compression,
+ .mkdir = smb2_mkdir,
+ .mkdir_setinfo = smb2_mkdir_setinfo,
+ .rmdir = smb2_rmdir,
+ .unlink = smb2_unlink,
+ .rename = smb2_rename_path,
+ .create_hardlink = smb2_create_hardlink,
+ .query_symlink = smb2_query_symlink,
+ .open = smb2_open_file,
+ .set_fid = smb2_set_fid,
+ .close = smb2_close_file,
+ .flush = smb2_flush_file,
+ .async_readv = smb2_async_readv,
+ .async_writev = smb2_async_writev,
+ .sync_read = smb2_sync_read,
+ .sync_write = smb2_sync_write,
+ .query_dir_first = smb2_query_dir_first,
+ .query_dir_next = smb2_query_dir_next,
+ .close_dir = smb2_close_dir,
+ .calc_smb_size = smb2_calc_size,
+ .is_status_pending = smb2_is_status_pending,
+ .oplock_response = smb2_oplock_response,
+ .queryfs = smb2_queryfs,
+ .mand_lock = smb2_mand_lock,
+ .mand_unlock_range = smb2_unlock_range,
+ .push_mand_locks = smb2_push_mandatory_locks,
+ .get_lease_key = smb2_get_lease_key,
+ .set_lease_key = smb2_set_lease_key,
+ .new_lease_key = smb2_new_lease_key,
+ .generate_signingkey = generate_smb3signingkey,
+ .calc_signature = smb3_calc_signature,
+ .is_read_op = smb21_is_read_op,
+ .set_oplock_level = smb3_set_oplock_level,
+ .create_lease_buf = smb3_create_lease_buf,
+ .parse_lease_buf = smb3_parse_lease_buf,
+ .clone_range = smb2_clone_range,
+ .validate_negotiate = smb3_validate_negotiate,
+};
+
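+/*
+ * Per dialect constants: protocol id sent on negotiate, requested
+ * capabilities, lock flags, header and response sizes, signing flags and the
+ * size of the lease create context (v1 for SMB2.x, v2 for SMB3.x).
+ */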
+struct smb_version_values smb20_values = {
+ .version_string = SMB20_VERSION_STRING,
+ .protocol_id = SMB20_PROT_ID,
+ .req_capabilities = 0, /* MBZ */
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .create_lease_size = sizeof(struct create_lease),
+};
+
+struct smb_version_values smb21_values = {
+ .version_string = SMB21_VERSION_STRING,
+ .protocol_id = SMB21_PROT_ID,
+ .req_capabilities = 0, /* MBZ on negotiate req until SMB3 dialect */
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .create_lease_size = sizeof(struct create_lease),
+};
+
+struct smb_version_values smb30_values = {
+ .version_string = SMB30_VERSION_STRING,
+ .protocol_id = SMB30_PROT_ID,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .create_lease_size = sizeof(struct create_lease_v2),
+};
+
+struct smb_version_values smb302_values = {
+ .version_string = SMB302_VERSION_STRING,
+ .protocol_id = SMB302_PROT_ID,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .create_lease_size = sizeof(struct create_lease_v2),
+};
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
new file mode 100644
index 00000000000..b0b260dbb19
--- /dev/null
+++ b/fs/cifs/smb2pdu.c
@@ -0,0 +1,2616 @@
+/*
+ * fs/cifs/smb2pdu.c
+ *
+ * Copyright (C) International Business Machines Corp., 2009, 2013
+ * Etersoft, 2012
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ * Contains the routines for constructing the SMB2 PDUs themselves
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * SMB2 PDU handling routines here - except for leftovers (e.g. session setup).
+ * Note that there are handle based routines which must be treated slightly
+ * differently for reconnection purposes since we never want to reuse a stale
+ * file handle and only the caller knows the file info.
+ */
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/vfs.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/xattr.h>
+#include "smb2pdu.h"
+#include "cifsglob.h"
+#include "cifsacl.h"
+#include "cifsproto.h"
+#include "smb2proto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "ntlmssp.h"
+#include "smb2status.h"
+#include "smb2glob.h"
+#include "cifspdu.h"
+
+/*
+ * The following table defines the expected "StructureSize" of SMB2 requests
+ * in order by SMB2 command. This is similar to "wct" in SMB/CIFS requests.
+ *
+ * Note that commands are defined in smb2pdu.h in le16 but the array below is
+ * indexed by command in host byte order.
+ */
+static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
+ /* SMB2_NEGOTIATE */ 36,
+ /* SMB2_SESSION_SETUP */ 25,
+ /* SMB2_LOGOFF */ 4,
+ /* SMB2_TREE_CONNECT */ 9,
+ /* SMB2_TREE_DISCONNECT */ 4,
+ /* SMB2_CREATE */ 57,
+ /* SMB2_CLOSE */ 24,
+ /* SMB2_FLUSH */ 24,
+ /* SMB2_READ */ 49,
+ /* SMB2_WRITE */ 49,
+ /* SMB2_LOCK */ 48,
+ /* SMB2_IOCTL */ 57,
+ /* SMB2_CANCEL */ 4,
+ /* SMB2_ECHO */ 4,
+ /* SMB2_QUERY_DIRECTORY */ 33,
+ /* SMB2_CHANGE_NOTIFY */ 32,
+ /* SMB2_QUERY_INFO */ 41,
+ /* SMB2_SET_INFO */ 33,
+ /* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */
+};
+
+
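+/*
+ * Fill in the fixed SMB2 header for a request: the 0xFE 'S' 'M' 'B' protocol
+ * id, StructureSize (64), command, credit request/charge, process id, and
+ * tree/session ids from the tcon, plus StructureSize2 from the table above.
+ */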
+static void
+smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
+ const struct cifs_tcon *tcon)
+{
+ struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
+ char *temp = (char *)hdr;
+	/* look up the word count, i.e. StructureSize, from the table */
+ __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_cmd)];
+
+ /*
+	/*
+	 * Smaller than SMALL_BUFFER_SIZE but bigger than the fixed area of
+	 * the largest operation (Create).
+	 */
+ memset(temp, 0, 256);
+
+	/* Note this is the only network field converted to big endian */
+ hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr)
+ - 4 /* RFC 1001 length field itself not counted */);
+
+ hdr->ProtocolId[0] = 0xFE;
+ hdr->ProtocolId[1] = 'S';
+ hdr->ProtocolId[2] = 'M';
+ hdr->ProtocolId[3] = 'B';
+ hdr->StructureSize = cpu_to_le16(64);
+ hdr->Command = smb2_cmd;
+ hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */
+ hdr->ProcessId = cpu_to_le32((__u16)current->tgid);
+
+ if (!tcon)
+ goto out;
+
+ /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
+ /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
+ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
+ if ((tcon->ses) &&
+ (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+ hdr->CreditCharge = cpu_to_le16(1);
+ /* else CreditCharge MBZ */
+
+ hdr->TreeId = tcon->tid;
+ /* Uid is not converted */
+ if (tcon->ses)
+ hdr->SessionId = tcon->ses->Suid;
+
+	/*
+	 * If we were to set SMB2_FLAGS_DFS_OPERATIONS on open we would also
+	 * have to pass the path on the Open SMB prefixed by \\server\share.
+	 * Not sure when we would need to do the augmented path (if ever) and
+	 * setting this flag breaks the SMB2 open operation since it is
+	 * illegal to send an empty path name (without \\server\share prefix)
+	 * when the DFS flag is set in the SMB open header. We could
+	 * consider setting the flag on all operations other than open
+	 * but it is safer not to set it for now.
+	 */
+/* if (tcon->share_flags & SHI1005_FLAGS_DFS)
+ hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
+
+ if (tcon->ses && tcon->ses->server && tcon->ses->server->sign)
+ hdr->Flags |= SMB2_FLAGS_SIGNED;
+out:
+ pdu->StructureSize2 = cpu_to_le16(parmsize);
+ return;
+}
+
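+/*
+ * Before sending a command, check whether the session or tree connection
+ * needs to be re-established. Wait for the demultiplex thread to reconnect
+ * the socket, then redo negotiate, session setup and tree connect as needed.
+ * Handle based commands return -EAGAIN so the caller can reopen the file
+ * after reconnect.
+ */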
+static int
+smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
+{
+ int rc = 0;
+ struct nls_table *nls_codepage;
+ struct cifs_ses *ses;
+ struct TCP_Server_Info *server;
+
+	/*
+	 * SMB2 NegProt, SessSetup and Logoff do not have a tcon yet, so the
+	 * checks for tcp and smb session status are done differently for
+	 * those three - in the calling routine.
+	 */
+ if (tcon == NULL)
+ return rc;
+
+ if (smb2_command == SMB2_TREE_CONNECT)
+ return rc;
+
+ if (tcon->tidStatus == CifsExiting) {
+		/*
+		 * Only tree disconnect, open, and write (and ulogoff, which
+		 * does not have a tcon) are allowed as we start a forced
+		 * umount.
+		 */
+ if ((smb2_command != SMB2_WRITE) &&
+ (smb2_command != SMB2_CREATE) &&
+ (smb2_command != SMB2_TREE_DISCONNECT)) {
+ cifs_dbg(FYI, "can not send cmd %d while umounting\n",
+ smb2_command);
+ return -ENODEV;
+ }
+ }
+ if ((!tcon->ses) || (tcon->ses->status == CifsExiting) ||
+ (!tcon->ses->server))
+ return -EIO;
+
+ ses = tcon->ses;
+ server = ses->server;
+
+	/*
+	 * Give the demultiplex thread up to 10 seconds to reconnect; this
+	 * should be greater than the cifs socket timeout, which is 7 seconds.
+	 */
+ while (server->tcpStatus == CifsNeedReconnect) {
+ /*
+ * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE
+ * here since they are implicitly done when session drops.
+ */
+ switch (smb2_command) {
+ /*
+ * BB Should we keep oplock break and add flush to exceptions?
+ */
+ case SMB2_TREE_DISCONNECT:
+ case SMB2_CANCEL:
+ case SMB2_CLOSE:
+ case SMB2_OPLOCK_BREAK:
+ return -EAGAIN;
+ }
+
+ wait_event_interruptible_timeout(server->response_q,
+ (server->tcpStatus != CifsNeedReconnect), 10 * HZ);
+
+ /* are we still trying to reconnect? */
+ if (server->tcpStatus != CifsNeedReconnect)
+ break;
+
+		/*
+		 * On "soft" mounts we wait once. Hard mounts keep retrying
+		 * until the process is killed or the server comes back
+		 * on-line.
+		 */
+ if (!tcon->retry) {
+ cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
+ return -EHOSTDOWN;
+ }
+ }
+
+ if (!tcon->ses->need_reconnect && !tcon->need_reconnect)
+ return rc;
+
+ nls_codepage = load_nls_default();
+
+ /*
+ * need to prevent multiple threads trying to simultaneously reconnect
+ * the same SMB session
+ */
+ mutex_lock(&tcon->ses->session_mutex);
+ rc = cifs_negotiate_protocol(0, tcon->ses);
+ if (!rc && tcon->ses->need_reconnect)
+ rc = cifs_setup_session(0, tcon->ses, nls_codepage);
+
+ if (rc || !tcon->need_reconnect) {
+ mutex_unlock(&tcon->ses->session_mutex);
+ goto out;
+ }
+
+ cifs_mark_open_files_invalid(tcon);
+ rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage);
+ mutex_unlock(&tcon->ses->session_mutex);
+ cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
+ if (rc)
+ goto out;
+ atomic_inc(&tconInfoReconnectCount);
+ /*
+ * BB FIXME add code to check if wsize needs update due to negotiated
+ * smb buffer size shrinking.
+ */
+out:
+	/*
+	 * Check if this is a handle based operation so we know whether we can
+	 * continue or not without returning to the caller to reset the file
+	 * handle.
+	 */
+ /*
+ * BB Is flush done by server on drop of tcp session? Should we special
+ * case it and skip above?
+ */
+ switch (smb2_command) {
+ case SMB2_FLUSH:
+ case SMB2_READ:
+ case SMB2_WRITE:
+ case SMB2_LOCK:
+ case SMB2_IOCTL:
+ case SMB2_QUERY_DIRECTORY:
+ case SMB2_CHANGE_NOTIFY:
+ case SMB2_QUERY_INFO:
+ case SMB2_SET_INFO:
+ return -EAGAIN;
+ }
+ unload_nls(nls_codepage);
+ return rc;
+}
+
+/*
+ * Allocate and return a pointer to an SMB request hdr, and set basic
+ * SMB information in the SMB header. If the return code is zero, this
+ * function must have filled in the request_buf pointer.
+ */
+static int
+small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
+ void **request_buf)
+{
+ int rc = 0;
+
+ rc = smb2_reconnect(smb2_command, tcon);
+ if (rc)
+ return rc;
+
+ /* BB eventually switch this to SMB2 specific small buf size */
+ *request_buf = cifs_small_buf_get();
+ if (*request_buf == NULL) {
+ /* BB should we add a retry in here if not a writepage? */
+ return -ENOMEM;
+ }
+
+ smb2_hdr_assemble((struct smb2_hdr *) *request_buf, smb2_command, tcon);
+
+ if (tcon != NULL) {
+#ifdef CONFIG_CIFS_STATS2
+ uint16_t com_code = le16_to_cpu(smb2_command);
+ cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_sent[com_code]);
+#endif
+ cifs_stats_inc(&tcon->num_smbs_sent);
+ }
+
+ return rc;
+}
+
+static void
+free_rsp_buf(int resp_buftype, void *rsp)
+{
+ if (resp_buftype == CIFS_SMALL_BUFFER)
+ cifs_small_buf_release(rsp);
+ else if (resp_buftype == CIFS_LARGE_BUFFER)
+ cifs_buf_release(rsp);
+}
+
+
+/*
+ *
+ * SMB2 Worker functions follow:
+ *
+ * The general structure of the worker functions is:
+ * 1) Call smb2_init (assembles SMB2 header)
+ * 2) Initialize SMB2 command specific fields in fixed length area of SMB
+ * 3) Call smb_sendrcv2 (sends request on socket and waits for response)
+ * 4) Decode SMB2 command specific fields in the fixed length area
+ * 5) Decode variable length data area (if any for this SMB2 command type)
+ * 6) Call free smb buffer
+ * 7) return
+ *
+ */
+
+int
+SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
+{
+ struct smb2_negotiate_req *req;
+ struct smb2_negotiate_rsp *rsp;
+ struct kvec iov[1];
+ int rc = 0;
+ int resp_buftype;
+ struct TCP_Server_Info *server = ses->server;
+ int blob_offset, blob_length;
+ char *security_blob;
+ int flags = CIFS_NEG_OP;
+
+ cifs_dbg(FYI, "Negotiate protocol\n");
+
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
+ }
+
+ rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->hdr.SessionId = 0;
+
+ req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
+
+ req->DialectCount = cpu_to_le16(1); /* One vers= at a time for now */
+ inc_rfc1001_len(req, 2);
+
+ /* only one of SMB2 signing flags may be set in SMB2 request */
+ if (ses->sign)
+ req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+ else if (global_secflags & CIFSSEC_MAY_SIGN)
+ req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+ else
+ req->SecurityMode = 0;
+
+ req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
+
+ /* ClientGUID must be zero for SMB2.02 dialect */
+ if (ses->server->vals->protocol_id == SMB20_PROT_ID)
+ memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE);
+ else
+ memcpy(req->ClientGUID, server->client_guid,
+ SMB2_CLIENT_GUID_SIZE);
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+ rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags);
+
+ rsp = (struct smb2_negotiate_rsp *)iov[0].iov_base;
+ /*
+ * No tcon so can't do
+ * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
+ */
+ if (rc != 0)
+ goto neg_exit;
+
+ cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode);
+
+ /* BB we may eventually want to match the negotiated vs. requested
+ dialect, even though we are only requesting one at a time */
+ if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID))
+ cifs_dbg(FYI, "negotiated smb2.0 dialect\n");
+ else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID))
+ cifs_dbg(FYI, "negotiated smb2.1 dialect\n");
+ else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID))
+ cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
+ else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
+ cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
+ else {
+ cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
+ le16_to_cpu(rsp->DialectRevision));
+ rc = -EIO;
+ goto neg_exit;
+ }
+ server->dialect = le16_to_cpu(rsp->DialectRevision);
+
+ /* SMB2 only has an extended negflavor */
+ server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
+ /* set it to the maximum buffer size value we can send with 1 credit */
+ server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize),
+ SMB2_MAX_BUFFER_SIZE);
+ server->max_read = le32_to_cpu(rsp->MaxReadSize);
+ server->max_write = le32_to_cpu(rsp->MaxWriteSize);
+ /* BB Do we need to validate the SecurityMode? */
+ server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+ server->capabilities = le32_to_cpu(rsp->Capabilities);
+ /* Internal types */
+ server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES;
+
+ security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
+ &rsp->hdr);
+ /*
+ * See MS-SMB2 section 2.2.4: if no blob, client picks default which
+ * for us will be
+ * ses->sectype = RawNTLMSSP;
+	 * but for the time being this is our only auth choice so it doesn't
+	 * matter. We just found a server which sets the blob length to zero,
+	 * expecting raw NTLMSSP.
+ */
+ if (blob_length == 0)
+ cifs_dbg(FYI, "missing security blob on negprot\n");
+
+ rc = cifs_enable_signing(server, ses->sign);
+#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */
+ if (rc)
+ goto neg_exit;
+ if (blob_length)
+ rc = decode_neg_token_init(security_blob, blob_length,
+ &server->sec_type);
+ if (rc == 1)
+ rc = 0;
+ else if (rc == 0) {
+ rc = -EIO;
+ goto neg_exit;
+ }
+#endif
+
+neg_exit:
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+}
+
+int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ int rc = 0;
+ struct validate_negotiate_info_req vneg_inbuf;
+ struct validate_negotiate_info_rsp *pneg_rsp;
+ u32 rsplen;
+
+ cifs_dbg(FYI, "validate negotiate\n");
+
+ /*
+	 * The validation ioctl must be signed, so no point sending this if we
+	 * cannot sign it.  We could eventually change this to selectively
+	 * sign just this, the first and only signed request on a connection.
+	 * This is good enough for now since a user who wants better security
+	 * would also enable signing on the mount. Having validation of
+	 * negotiate info for signed connections helps reduce attack vectors.
+ */
+ if (tcon->ses->server->sign == false)
+ return 0; /* validation requires signing */
+
+ vneg_inbuf.Capabilities =
+ cpu_to_le32(tcon->ses->server->vals->req_capabilities);
+ memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid,
+ SMB2_CLIENT_GUID_SIZE);
+
+ if (tcon->ses->sign)
+ vneg_inbuf.SecurityMode =
+ cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+ else if (global_secflags & CIFSSEC_MAY_SIGN)
+ vneg_inbuf.SecurityMode =
+ cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+ else
+ vneg_inbuf.SecurityMode = 0;
+
+ vneg_inbuf.DialectCount = cpu_to_le16(1);
+ vneg_inbuf.Dialects[0] =
+ cpu_to_le16(tcon->ses->server->vals->protocol_id);
+
+ rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+ FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
+ (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req),
+ (char **)&pneg_rsp, &rsplen);
+
+ if (rc != 0) {
+ cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
+ return -EIO;
+ }
+
+ if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
+ cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
+ return -EIO;
+ }
+
+ /* check validate negotiate info response matches what we got earlier */
+ if (pneg_rsp->Dialect !=
+ cpu_to_le16(tcon->ses->server->vals->protocol_id))
+ goto vneg_out;
+
+ if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode))
+ goto vneg_out;
+
+ /* do not validate server guid because not saved at negprot time yet */
+
+ if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND |
+ SMB2_LARGE_FILES) != tcon->ses->server->capabilities)
+ goto vneg_out;
+
+ /* validate negotiate successful */
+ cifs_dbg(FYI, "validate negotiate info successful\n");
+ return 0;
+
+vneg_out:
+ cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
+ return -EIO;
+}
+
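+/*
+ * Raw NTLMSSP session setup: two round trips - send the NEGOTIATE blob,
+ * receive the server challenge with STATUS_MORE_PROCESSING_REQUIRED, then
+ * send the AUTHENTICATE blob. On success, generate the signing key (SMB3)
+ * and mark the session established.
+ */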
+int
+SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
+{
+ struct smb2_sess_setup_req *req;
+ struct smb2_sess_setup_rsp *rsp = NULL;
+ struct kvec iov[2];
+ int rc = 0;
+ int resp_buftype;
+ __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
+ struct TCP_Server_Info *server = ses->server;
+ u16 blob_length = 0;
+ char *security_blob;
+ char *ntlmssp_blob = NULL;
+ bool use_spnego = false; /* else use raw ntlmssp */
+
+ cifs_dbg(FYI, "Session Setup\n");
+
+ if (!server) {
+ WARN(1, "%s: server is NULL!\n", __func__);
+ return -EIO;
+ }
+
+ /*
+ * If we are here due to reconnect, free per-smb session key
+ * in case signing was required.
+ */
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+
+ /*
+	 * If memory allocation is successful, the caller of this function
+	 * frees it.
+ */
+ ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+ if (!ses->ntlmssp)
+ return -ENOMEM;
+ ses->ntlmssp->sesskey_per_smbsess = true;
+
+ /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
+ ses->sectype = RawNTLMSSP;
+
+ssetup_ntlmssp_authenticate:
+ if (phase == NtLmChallenge)
+ phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
+
+ rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->hdr.SessionId = 0; /* First session, not a reauthenticate */
+ req->VcNumber = 0; /* MBZ */
+ /* to enable echos and oplocks */
+ req->hdr.CreditRequest = cpu_to_le16(3);
+
+ /* only one of SMB2 signing flags may be set in SMB2 request */
+ if (server->sign)
+ req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED;
+ else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */
+ req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED;
+ else
+ req->SecurityMode = 0;
+
+ req->Capabilities = 0;
+ req->Channel = 0; /* MBZ */
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field and 1 for pad */
+ iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+ if (phase == NtLmNegotiate) {
+ ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE),
+ GFP_KERNEL);
+ if (ntlmssp_blob == NULL) {
+ rc = -ENOMEM;
+ goto ssetup_exit;
+ }
+ build_ntlmssp_negotiate_blob(ntlmssp_blob, ses);
+ if (use_spnego) {
+ /* blob_length = build_spnego_ntlmssp_blob(
+ &security_blob,
+ sizeof(struct _NEGOTIATE_MESSAGE),
+ ntlmssp_blob); */
+ /* BB eventually need to add this */
+ cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
+ rc = -EOPNOTSUPP;
+ kfree(ntlmssp_blob);
+ goto ssetup_exit;
+ } else {
+ blob_length = sizeof(struct _NEGOTIATE_MESSAGE);
+ /* with raw NTLMSSP we don't encapsulate in SPNEGO */
+ security_blob = ntlmssp_blob;
+ }
+ } else if (phase == NtLmAuthenticate) {
+ req->hdr.SessionId = ses->Suid;
+ ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500,
+ GFP_KERNEL);
+ if (ntlmssp_blob == NULL) {
+ rc = -ENOMEM;
+ goto ssetup_exit;
+ }
+ rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses,
+ nls_cp);
+ if (rc) {
+ cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n",
+ rc);
+ goto ssetup_exit; /* BB double check error handling */
+ }
+ if (use_spnego) {
+ /* blob_length = build_spnego_ntlmssp_blob(
+ &security_blob,
+ blob_length,
+ ntlmssp_blob); */
+ cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
+ rc = -EOPNOTSUPP;
+ kfree(ntlmssp_blob);
+ goto ssetup_exit;
+ } else {
+ security_blob = ntlmssp_blob;
+ }
+ } else {
+ cifs_dbg(VFS, "illegal ntlmssp phase\n");
+ rc = -EIO;
+ goto ssetup_exit;
+ }
+
+ /* Testing shows that buffer offset must be at location of Buffer[0] */
+ req->SecurityBufferOffset =
+ cpu_to_le16(sizeof(struct smb2_sess_setup_req) -
+ 1 /* pad */ - 4 /* rfc1001 len */);
+ req->SecurityBufferLength = cpu_to_le16(blob_length);
+ iov[1].iov_base = security_blob;
+ iov[1].iov_len = blob_length;
+
+ inc_rfc1001_len(req, blob_length - 1 /* pad */);
+
+ /* BB add code to build os and lm fields */
+
+ rc = SendReceive2(xid, ses, iov, 2, &resp_buftype,
+ CIFS_LOG_ERROR | CIFS_NEG_OP);
+
+ kfree(security_blob);
+ rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
+ if (resp_buftype != CIFS_NO_BUFFER &&
+ rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) {
+ if (phase != NtLmNegotiate) {
+ cifs_dbg(VFS, "Unexpected more processing error\n");
+ goto ssetup_exit;
+ }
+ if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 !=
+ le16_to_cpu(rsp->SecurityBufferOffset)) {
+ cifs_dbg(VFS, "Invalid security buffer offset %d\n",
+ le16_to_cpu(rsp->SecurityBufferOffset));
+ rc = -EIO;
+ goto ssetup_exit;
+ }
+
+		/* NTLMSSP Negotiate sent; now processing challenge (response) */
+ phase = NtLmChallenge; /* process ntlmssp challenge */
+ rc = 0; /* MORE_PROCESSING is not an error here but expected */
+ ses->Suid = rsp->hdr.SessionId;
+ rc = decode_ntlmssp_challenge(rsp->Buffer,
+ le16_to_cpu(rsp->SecurityBufferLength), ses);
+ }
+
+ /*
+ * BB eventually add code for SPNEGO decoding of NtlmChallenge blob,
+ * but at least the raw NTLMSSP case works.
+ */
+ /*
+ * No tcon so can't do
+ * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
+ */
+ if (rc != 0)
+ goto ssetup_exit;
+
+ ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+ cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
+ssetup_exit:
+ free_rsp_buf(resp_buftype, rsp);
+
+ /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
+ if ((phase == NtLmChallenge) && (rc == 0))
+ goto ssetup_ntlmssp_authenticate;
+
+ if (!rc) {
+ mutex_lock(&server->srv_mutex);
+ if (server->sign && server->ops->generate_signingkey) {
+ rc = server->ops->generate_signingkey(ses);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ if (rc) {
+ cifs_dbg(FYI,
+ "SMB3 session key generation failed\n");
+ mutex_unlock(&server->srv_mutex);
+ goto keygen_exit;
+ }
+ }
+ if (!server->session_estab) {
+ server->sequence_number = 0x2;
+ server->session_estab = true;
+ }
+ mutex_unlock(&server->srv_mutex);
+
+ cifs_dbg(FYI, "SMB2/3 session established successfully\n");
+ spin_lock(&GlobalMid_Lock);
+ ses->status = CifsGood;
+ ses->need_reconnect = false;
+ spin_unlock(&GlobalMid_Lock);
+ }
+
+keygen_exit:
+ if (!server->sign) {
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ }
+ kfree(ses->ntlmssp);
+
+ return rc;
+}
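+
+/*
+ * Note: the raw NTLMSSP session setup above is a two-leg exchange: the
+ * NEGOTIATE blob is sent first, the server replies with
+ * STATUS_MORE_PROCESSING_REQUIRED and a challenge blob, and the AUTHENTICATE
+ * blob is then sent on the SessionId returned with the challenge.
+ */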
+
+int
+SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
+{
+ struct smb2_logoff_req *req; /* response is also trivial struct */
+ int rc = 0;
+ struct TCP_Server_Info *server;
+
+ cifs_dbg(FYI, "disconnect session %p\n", ses);
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ /* no need to send SMB logoff if uid already closed due to reconnect */
+ if (ses->need_reconnect)
+ goto smb2_session_already_dead;
+
+ rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req);
+ if (rc)
+ return rc;
+
+	/* since there is no tcon, smb2_init cannot set this, so do it here */
+ req->hdr.SessionId = ses->Suid;
+ if (server->sign)
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
+
+ rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
+ /*
+ * No tcon so can't do
+ * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
+ */
+
+smb2_session_already_dead:
+ return rc;
+}
+
+static inline void cifs_stats_fail_inc(struct cifs_tcon *tcon, uint16_t code)
+{
+ cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_failed[code]);
+}
+
+#define MAX_SHARENAME_LENGTH (255 /* server */ + 80 /* share */ + 1 /* NULL */)
+
+/* These values are similar to what Windows uses */
+static inline void init_copy_chunk_defaults(struct cifs_tcon *tcon)
+{
+	tcon->max_chunks = 256;
+	tcon->max_bytes_chunk = 1048576;	/* 1 MiB */
+	tcon->max_bytes_copy = 16777216;	/* 16 MiB */
+}
+
+int
+SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
+ struct cifs_tcon *tcon, const struct nls_table *cp)
+{
+ struct smb2_tree_connect_req *req;
+ struct smb2_tree_connect_rsp *rsp = NULL;
+ struct kvec iov[2];
+ int rc = 0;
+ int resp_buftype;
+ int unc_path_len;
+ struct TCP_Server_Info *server;
+ __le16 *unc_path = NULL;
+
+ cifs_dbg(FYI, "TCON\n");
+
+ if ((ses->server) && tree)
+ server = ses->server;
+ else
+ return -EIO;
+
+ if (tcon && tcon->bad_network_name)
+ return -ENOENT;
+
+ unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL);
+ if (unc_path == NULL)
+ return -ENOMEM;
+
+ unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp) + 1;
+ unc_path_len *= 2;
+ if (unc_path_len < 2) {
+ kfree(unc_path);
+ return -EINVAL;
+ }
+
+ rc = small_smb2_init(SMB2_TREE_CONNECT, tcon, (void **) &req);
+ if (rc) {
+ kfree(unc_path);
+ return rc;
+ }
+
+ if (tcon == NULL) {
+		/* since there is no tcon, smb2_init cannot set this, so do it here */
+ req->hdr.SessionId = ses->Suid;
+ /* if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED)
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED; */
+ }
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field and 1 for pad */
+ iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+
+ /* Testing shows that buffer offset must be at location of Buffer[0] */
+ req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req)
+ - 1 /* pad */ - 4 /* do not count rfc1001 len field */);
+ req->PathLength = cpu_to_le16(unc_path_len - 2);
+ iov[1].iov_base = unc_path;
+ iov[1].iov_len = unc_path_len;
+
+ inc_rfc1001_len(req, unc_path_len - 1 /* pad */);
+
+ rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0);
+ rsp = (struct smb2_tree_connect_rsp *)iov[0].iov_base;
+
+ if (rc != 0) {
+ if (tcon) {
+ cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE);
+ tcon->need_reconnect = true;
+ }
+ goto tcon_error_exit;
+ }
+
+ if (tcon == NULL) {
+ ses->ipc_tid = rsp->hdr.TreeId;
+ goto tcon_exit;
+ }
+
+ if (rsp->ShareType & SMB2_SHARE_TYPE_DISK)
+ cifs_dbg(FYI, "connection to disk share\n");
+ else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) {
+ tcon->ipc = true;
+ cifs_dbg(FYI, "connection to pipe share\n");
+ } else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) {
+ tcon->print = true;
+ cifs_dbg(FYI, "connection to printer\n");
+ } else {
+ cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType);
+ rc = -EOPNOTSUPP;
+ goto tcon_error_exit;
+ }
+
+ tcon->share_flags = le32_to_cpu(rsp->ShareFlags);
+ tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
+ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
+ tcon->tidStatus = CifsGood;
+ tcon->need_reconnect = false;
+ tcon->tid = rsp->hdr.TreeId;
+ strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
+
+ if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
+ ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
+ cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
+ init_copy_chunk_defaults(tcon);
+ if (tcon->ses->server->ops->validate_negotiate)
+ rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
+tcon_exit:
+ free_rsp_buf(resp_buftype, rsp);
+ kfree(unc_path);
+ return rc;
+
+tcon_error_exit:
+ if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) {
+ cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
+ tcon->bad_network_name = true;
+ }
+ goto tcon_exit;
+}
+
+int
+SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ struct smb2_tree_disconnect_req *req; /* response is trivial */
+ int rc = 0;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses = tcon->ses;
+
+ cifs_dbg(FYI, "Tree Disconnect\n");
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
+ return 0;
+
+ rc = small_smb2_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ rc = SendReceiveNoRsp(xid, ses, (char *)&req->hdr, 0);
+ if (rc)
+ cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE);
+
+ return rc;
+}
+
+static struct create_durable *
+create_durable_buf(void)
+{
+ struct create_durable *buf;
+
+ buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable, Data));
+ buf->ccontext.DataLength = cpu_to_le32(16);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DHnQ" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = 'n';
+ buf->Name[3] = 'Q';
+ return buf;
+}
+
+static struct create_durable *
+create_reconnect_durable_buf(struct cifs_fid *fid)
+{
+ struct create_durable *buf;
+
+ buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable, Data));
+ buf->ccontext.DataLength = cpu_to_le32(16);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ buf->Data.Fid.PersistentFileId = fid->persistent_fid;
+ buf->Data.Fid.VolatileFileId = fid->volatile_fid;
+ /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT is "DHnC" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = 'n';
+ buf->Name[3] = 'C';
+ return buf;
+}
+
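+/*
+ * Scan the create contexts returned in an SMB2 CREATE response for the lease
+ * ("RqLs") context and hand it to the dialect specific parse_lease_buf
+ * handler to extract the granted lease state.
+ */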
+static __u8
+parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
+ unsigned int *epoch)
+{
+ char *data_offset;
+ struct create_context *cc;
+ unsigned int next = 0;
+ char *name;
+
+ data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
+ cc = (struct create_context *)data_offset;
+ do {
+ cc = (struct create_context *)((char *)cc + next);
+ name = le16_to_cpu(cc->NameOffset) + (char *)cc;
+ if (le16_to_cpu(cc->NameLength) != 4 ||
+ strncmp(name, "RqLs", 4)) {
+ next = le32_to_cpu(cc->Next);
+ continue;
+ }
+ return server->ops->parse_lease_buf(cc, epoch);
+ } while (next != 0);
+
+ return 0;
+}
+
+static int
+add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
+ unsigned int *num_iovec, __u8 *oplock)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ iov[num].iov_base = server->ops->create_lease_buf(oplock+1, *oplock);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = server->vals->create_lease_size;
+ req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset = cpu_to_le32(
+ sizeof(struct smb2_create_req) - 4 +
+ iov[num - 1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength,
+ server->vals->create_lease_size);
+ inc_rfc1001_len(&req->hdr, server->vals->create_lease_size);
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ if (oparms->reconnect) {
+ iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
+ /* indicate that we don't need to relock the file */
+ oparms->reconnect = false;
+ } else
+ iov[num].iov_base = create_durable_buf();
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable));
+ inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+int
+SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
+ __u8 *oplock, struct smb2_file_all_info *buf,
+ struct smb2_err_rsp **err_buf)
+{
+ struct smb2_create_req *req;
+ struct smb2_create_rsp *rsp;
+ struct TCP_Server_Info *server;
+ struct cifs_tcon *tcon = oparms->tcon;
+ struct cifs_ses *ses = tcon->ses;
+ struct kvec iov[4];
+ int resp_buftype;
+ int uni_path_len;
+ __le16 *copy_path = NULL;
+ int copy_size;
+ int rc = 0;
+ unsigned int num_iovecs = 2;
+ __u32 file_attributes = 0;
+ char *dhc_buf = NULL, *lc_buf = NULL;
+
+ cifs_dbg(FYI, "create/open\n");
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ rc = small_smb2_init(SMB2_CREATE, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ if (oparms->create_options & CREATE_OPTION_READONLY)
+ file_attributes |= ATTR_READONLY;
+
+ req->ImpersonationLevel = IL_IMPERSONATION;
+ req->DesiredAccess = cpu_to_le32(oparms->desired_access);
+ /* File attributes ignored on open (used in create though) */
+ req->FileAttributes = cpu_to_le32(file_attributes);
+ req->ShareAccess = FILE_SHARE_ALL_LE;
+ req->CreateDisposition = cpu_to_le32(oparms->disposition);
+ req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
+ uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
+ /* do not count rfc1001 len field */
+ req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4);
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	/* MUST set path len (NameLength) to 0 when opening the root of a share */
+ req->NameLength = cpu_to_le16(uni_path_len - 2);
+ /* -1 since last byte is buf[0] which is sent below (path) */
+ iov[0].iov_len--;
+ if (uni_path_len % 8 != 0) {
+ copy_size = uni_path_len / 8 * 8;
+ if (copy_size < uni_path_len)
+ copy_size += 8;
+
+ copy_path = kzalloc(copy_size, GFP_KERNEL);
+ if (!copy_path)
+ return -ENOMEM;
+ memcpy((char *)copy_path, (const char *)path,
+ uni_path_len);
+ uni_path_len = copy_size;
+ path = copy_path;
+ }
+
+ iov[1].iov_len = uni_path_len;
+ iov[1].iov_base = path;
+ /* -1 since last byte is buf[0] which was counted in smb2_buf_len */
+ inc_rfc1001_len(req, uni_path_len - 1);
+
+ if (!server->oplocks)
+ *oplock = SMB2_OPLOCK_LEVEL_NONE;
+
+ if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) ||
+ *oplock == SMB2_OPLOCK_LEVEL_NONE)
+ req->RequestedOplockLevel = *oplock;
+ else {
+ rc = add_lease_context(server, iov, &num_iovecs, oplock);
+ if (rc) {
+ cifs_small_buf_release(req);
+ kfree(copy_path);
+ return rc;
+ }
+ lc_buf = iov[num_iovecs-1].iov_base;
+ }
+
+ if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
+ /* need to set Next field of lease context if we request it */
+ if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
+ struct create_context *ccontext =
+ (struct create_context *)iov[num_iovecs-1].iov_base;
+ ccontext->Next =
+ cpu_to_le32(server->vals->create_lease_size);
+ }
+ rc = add_durable_context(iov, &num_iovecs, oparms);
+ if (rc) {
+ cifs_small_buf_release(req);
+ kfree(copy_path);
+ kfree(lc_buf);
+ return rc;
+ }
+ dhc_buf = iov[num_iovecs-1].iov_base;
+ }
+
+ rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
+ rsp = (struct smb2_create_rsp *)iov[0].iov_base;
+
+ if (rc != 0) {
+ cifs_stats_fail_inc(tcon, SMB2_CREATE_HE);
+ if (err_buf)
+ *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4,
+ GFP_KERNEL);
+ goto creat_exit;
+ }
+
+ oparms->fid->persistent_fid = rsp->PersistentFileId;
+ oparms->fid->volatile_fid = rsp->VolatileFileId;
+
+ if (buf) {
+ memcpy(buf, &rsp->CreationTime, 32);
+ buf->AllocationSize = rsp->AllocationSize;
+ buf->EndOfFile = rsp->EndofFile;
+ buf->Attributes = rsp->FileAttributes;
+ buf->NumberOfLinks = cpu_to_le32(1);
+ buf->DeletePending = 0;
+ }
+
+ if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
+ *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch);
+ else
+ *oplock = rsp->OplockLevel;
+creat_exit:
+ kfree(copy_path);
+ kfree(lc_buf);
+ kfree(dhc_buf);
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+}
+
+/*
+ * SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+ u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data,
+ u32 indatalen, char **out_data, u32 *plen /* returned data len */)
+{
+ struct smb2_ioctl_req *req;
+ struct smb2_ioctl_rsp *rsp;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses = tcon->ses;
+ struct kvec iov[2];
+ int resp_buftype;
+ int num_iovecs;
+ int rc = 0;
+
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ *out_data = NULL;
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->CtlCode = cpu_to_le32(opcode);
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
+
+ if (indatalen) {
+ req->InputCount = cpu_to_le32(indatalen);
+ /* do not set InputOffset if no input data */
+ req->InputOffset =
+ cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
+ iov[1].iov_base = in_data;
+ iov[1].iov_len = indatalen;
+ num_iovecs = 2;
+ } else
+ num_iovecs = 1;
+
+ req->OutputOffset = 0;
+ req->OutputCount = 0; /* MBZ */
+
+ /*
+ * Could increase MaxOutputResponse, but that would require more
+ * than one credit. Windows typically sets this smaller, but for some
+	 * ioctls it may be useful to allow the server to send more. There is
+	 * no point limiting what the server can send as long as it fits in
+	 * one credit.
+ */
+ req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+
+ if (is_fsctl)
+ req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
+ else
+ req->Flags = 0;
+
+ iov[0].iov_base = (char *)req;
+
+ /*
+	 * If there is no input data, the size of the ioctl struct in the
+	 * protocol spec still includes a 1 byte data buffer, but if input
+	 * data is passed to the ioctl we do not want to double count it,
+	 * so we do not send the dummy one byte of data in iovec[0] when
+	 * sending input data (in iovec[1]). We also must add 4 bytes to the
+	 * first iovec to allow for the rfc1002 length field.
+ */
+
+ if (indatalen) {
+ iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+ inc_rfc1001_len(req, indatalen - 1);
+ } else
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+ rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
+ rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
+
+ if ((rc != 0) && (rc != -EINVAL)) {
+ if (tcon)
+ cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+ goto ioctl_exit;
+ } else if (rc == -EINVAL) {
+ if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) &&
+ (opcode != FSCTL_SRV_COPYCHUNK)) {
+ if (tcon)
+ cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+ goto ioctl_exit;
+ }
+ }
+
+ /* check if caller wants to look at return data or just return rc */
+ if ((plen == NULL) || (out_data == NULL))
+ goto ioctl_exit;
+
+ *plen = le32_to_cpu(rsp->OutputCount);
+
+ /* We check for obvious errors in the output buffer length and offset */
+ if (*plen == 0)
+ goto ioctl_exit; /* server returned no data */
+ else if (*plen > 0xFF00) {
+ cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
+ *plen = 0;
+ rc = -EIO;
+ goto ioctl_exit;
+ }
+
+ if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+ cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
+ le32_to_cpu(rsp->OutputOffset));
+ *plen = 0;
+ rc = -EIO;
+ goto ioctl_exit;
+ }
+
+ *out_data = kmalloc(*plen, GFP_KERNEL);
+ if (*out_data == NULL) {
+ rc = -ENOMEM;
+ goto ioctl_exit;
+ }
+
+ memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+ *plen);
+ioctl_exit:
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+}
+
+/*
+ * Individual callers to ioctl worker function follow
+ */
+
+int
+SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid)
+{
+ int rc;
+ struct compress_ioctl fsctl_input;
+ char *ret_data = NULL;
+
+ fsctl_input.CompressionState =
+ __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+ rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
+ FSCTL_SET_COMPRESSION, true /* is_fsctl */,
+ (char *)&fsctl_input /* data input */,
+ 2 /* in data len */, &ret_data /* out data */, NULL);
+
+ cifs_dbg(FYI, "set compression rc %d\n", rc);
+
+ return rc;
+}
+
+int
+SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid)
+{
+ struct smb2_close_req *req;
+ struct smb2_close_rsp *rsp;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses = tcon->ses;
+ struct kvec iov[1];
+ int resp_buftype;
+ int rc = 0;
+
+ cifs_dbg(FYI, "Close\n");
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ rc = small_smb2_init(SMB2_CLOSE, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+ rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
+ rsp = (struct smb2_close_rsp *)iov[0].iov_base;
+
+ if (rc != 0) {
+ if (tcon)
+ cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
+ goto close_exit;
+ }
+
+ /* BB FIXME - decode close response, update inode for caching */
+
+close_exit:
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+}
+
+static int
+validate_buf(unsigned int offset, unsigned int buffer_length,
+ struct smb2_hdr *hdr, unsigned int min_buf_size)
+{
+ unsigned int smb_len = be32_to_cpu(hdr->smb2_buf_length);
+ char *end_of_smb = smb_len + 4 /* RFC1001 length field */ + (char *)hdr;
+ char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr;
+ char *end_of_buf = begin_of_buf + buffer_length;
+
+ if (buffer_length < min_buf_size) {
+ cifs_dbg(VFS, "buffer length %d smaller than minimum size %d\n",
+ buffer_length, min_buf_size);
+ return -EINVAL;
+ }
+
+ /* check if beyond RFC1001 maximum length */
+ if ((smb_len > 0x7FFFFF) || (buffer_length > 0x7FFFFF)) {
+ cifs_dbg(VFS, "buffer length %d or smb length %d too large\n",
+ buffer_length, smb_len);
+ return -EINVAL;
+ }
+
+ if ((begin_of_buf > end_of_smb) || (end_of_buf > end_of_smb)) {
+ cifs_dbg(VFS, "illegal server response, bad offset to data\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * If SMB buffer fields are valid, copy into temporary buffer to hold result.
+ * Caller must free buffer.
+ */
+static int
+validate_and_copy_buf(unsigned int offset, unsigned int buffer_length,
+ struct smb2_hdr *hdr, unsigned int minbufsize,
+ char *data)
+{
+ char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr;
+ int rc;
+
+ if (!data)
+ return -EINVAL;
+
+ rc = validate_buf(offset, buffer_length, hdr, minbufsize);
+ if (rc)
+ return rc;
+
+ memcpy(data, begin_of_buf, buffer_length);
+
+ return 0;
+}
+
+static int
+query_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, u8 info_class,
+ size_t output_len, size_t min_len, void *data)
+{
+ struct smb2_query_info_req *req;
+ struct smb2_query_info_rsp *rsp = NULL;
+ struct kvec iov[2];
+ int rc = 0;
+ int resp_buftype;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses = tcon->ses;
+
+ cifs_dbg(FYI, "Query Info\n");
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->InfoType = SMB2_O_INFO_FILE;
+ req->FileInfoClass = info_class;
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
+ /* 4 for rfc1002 length field and 1 for Buffer */
+ req->InputBufferOffset =
+ cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
+ req->OutputBufferLength = cpu_to_le32(output_len);
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+ rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
+ rsp = (struct smb2_query_info_rsp *)iov[0].iov_base;
+
+ if (rc) {
+ cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+ goto qinf_exit;
+ }
+
+ rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset),
+ le32_to_cpu(rsp->OutputBufferLength),
+ &rsp->hdr, min_len, data);
+
+qinf_exit:
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+}
+
+int
+SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ struct smb2_file_all_info *data)
+{
+ return query_info(xid, tcon, persistent_fid, volatile_fid,
+ FILE_ALL_INFORMATION,
+ sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ sizeof(struct smb2_file_all_info), data);
+}
+
+int
+SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, __le64 *uniqueid)
+{
+ return query_info(xid, tcon, persistent_fid, volatile_fid,
+ FILE_INTERNAL_INFORMATION,
+ sizeof(struct smb2_file_internal_info),
+ sizeof(struct smb2_file_internal_info), uniqueid);
+}
+
+/*
+ * This is a no-op for now. We're not really interested in the reply, but
+ * rather in the fact that the server sent one and that server->lstrp
+ * gets updated.
+ *
+ * FIXME: maybe we should consider checking that the reply matches request?
+ */
+static void
+smb2_echo_callback(struct mid_q_entry *mid)
+{
+ struct TCP_Server_Info *server = mid->callback_data;
+ struct smb2_echo_rsp *smb2 = (struct smb2_echo_rsp *)mid->resp_buf;
+ unsigned int credits_received = 1;
+
+ if (mid->mid_state == MID_RESPONSE_RECEIVED)
+ credits_received = le16_to_cpu(smb2->hdr.CreditRequest);
+
+ DeleteMidQEntry(mid);
+ add_credits(server, credits_received, CIFS_ECHO_OP);
+}
+
+int
+SMB2_echo(struct TCP_Server_Info *server)
+{
+ struct smb2_echo_req *req;
+ int rc = 0;
+ struct kvec iov;
+ struct smb_rqst rqst = { .rq_iov = &iov,
+ .rq_nvec = 1 };
+
+ cifs_dbg(FYI, "In echo request\n");
+
+ rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req);
+ if (rc)
+ return rc;
+
+ req->hdr.CreditRequest = cpu_to_le16(1);
+
+ iov.iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov.iov_len = get_rfc1002_length(req) + 4;
+
+ rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server,
+ CIFS_ECHO_OP);
+ if (rc)
+ cifs_dbg(FYI, "Echo request failed: %d\n", rc);
+
+ cifs_small_buf_release(req);
+ return rc;
+}
+
+int
+SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+ u64 volatile_fid)
+{
+ struct smb2_flush_req *req;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses = tcon->ses;
+ struct kvec iov[1];
+ int resp_buftype;
+ int rc = 0;
+
+ cifs_dbg(FYI, "Flush\n");
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ rc = small_smb2_init(SMB2_FLUSH, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+ rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
+
+ if ((rc != 0) && tcon)
+ cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
+
+ free_rsp_buf(resp_buftype, iov[0].iov_base);
+ return rc;
+}
+
+/*
+ * To form a chain of read requests, each request in the chain should have
+ * CHAINED_REQUEST set in its request_type; the last request in the chain
+ * should also set END_OF_CHAIN so that its NextCommand field is left zero.
+ */
+static int
+smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms,
+ unsigned int remaining_bytes, int request_type)
+{
+ int rc = -EACCES;
+ struct smb2_read_req *req = NULL;
+
+ rc = small_smb2_init(SMB2_READ, io_parms->tcon, (void **) &req);
+ if (rc)
+ return rc;
+ if (io_parms->tcon->ses->server == NULL)
+ return -ECONNABORTED;
+
+ req->hdr.ProcessId = cpu_to_le32(io_parms->pid);
+
+ req->PersistentFileId = io_parms->persistent_fid;
+ req->VolatileFileId = io_parms->volatile_fid;
+ req->ReadChannelInfoOffset = 0; /* reserved */
+ req->ReadChannelInfoLength = 0; /* reserved */
+ req->Channel = 0; /* reserved */
+ req->MinimumCount = 0;
+ req->Length = cpu_to_le32(io_parms->length);
+ req->Offset = cpu_to_le64(io_parms->offset);
+
+ if (request_type & CHAINED_REQUEST) {
+ if (!(request_type & END_OF_CHAIN)) {
+ /* 4 for rfc1002 length field */
+ req->hdr.NextCommand =
+ cpu_to_le32(get_rfc1002_length(req) + 4);
+ } else /* END_OF_CHAIN */
+ req->hdr.NextCommand = 0;
+ if (request_type & RELATED_REQUEST) {
+ req->hdr.Flags |= SMB2_FLAGS_RELATED_OPERATIONS;
+ /*
+ * Related requests use info from previous read request
+ * in chain.
+ */
+ req->hdr.SessionId = 0xFFFFFFFF;
+ req->hdr.TreeId = 0xFFFFFFFF;
+ req->PersistentFileId = 0xFFFFFFFF;
+ req->VolatileFileId = 0xFFFFFFFF;
+ }
+ }
+ if (remaining_bytes > io_parms->length)
+ req->RemainingBytes = cpu_to_le32(remaining_bytes);
+ else
+ req->RemainingBytes = 0;
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+ return rc;
+}
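+
+/*
+ * For example, a caller building a compound read chain could pass
+ * (CHAINED_REQUEST | RELATED_REQUEST) as the request_type of intermediate
+ * requests and (CHAINED_REQUEST | RELATED_REQUEST | END_OF_CHAIN) for the
+ * final one; smb2_async_readv() below passes 0 for a plain unchained read.
+ */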
+
+static void
+smb2_readv_callback(struct mid_q_entry *mid)
+{
+ struct cifs_readdata *rdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ struct smb2_hdr *buf = (struct smb2_hdr *)rdata->iov.iov_base;
+ unsigned int credits_received = 1;
+ struct smb_rqst rqst = { .rq_iov = &rdata->iov,
+ .rq_nvec = 1,
+ .rq_pages = rdata->pages,
+ .rq_npages = rdata->nr_pages,
+ .rq_pagesz = rdata->pagesz,
+ .rq_tailsz = rdata->tailsz };
+
+ cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+ __func__, mid->mid, mid->mid_state, rdata->result,
+ rdata->bytes);
+
+ switch (mid->mid_state) {
+ case MID_RESPONSE_RECEIVED:
+ credits_received = le16_to_cpu(buf->CreditRequest);
+ /* result already set, check signature */
+ if (server->sign) {
+ int rc;
+
+ rc = smb2_verify_signature(&rqst, server);
+ if (rc)
+ cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+ rc);
+ }
+ /* FIXME: should this be counted toward the initiating task? */
+ task_io_account_read(rdata->bytes);
+ cifs_stats_bytes_read(tcon, rdata->bytes);
+ break;
+ case MID_REQUEST_SUBMITTED:
+ case MID_RETRY_NEEDED:
+ rdata->result = -EAGAIN;
+ break;
+ default:
+ if (rdata->result != -ENODATA)
+ rdata->result = -EIO;
+ }
+
+ if (rdata->result)
+ cifs_stats_fail_inc(tcon, SMB2_READ_HE);
+
+ queue_work(cifsiod_wq, &rdata->work);
+ DeleteMidQEntry(mid);
+ add_credits(server, credits_received, 0);
+}
+
+/* smb2_async_readv - send an async read, and set up mid to handle result */
+int
+smb2_async_readv(struct cifs_readdata *rdata)
+{
+ int rc;
+ struct smb2_hdr *buf;
+ struct cifs_io_parms io_parms;
+ struct smb_rqst rqst = { .rq_iov = &rdata->iov,
+ .rq_nvec = 1 };
+
+ cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
+ __func__, rdata->offset, rdata->bytes);
+
+ io_parms.tcon = tlink_tcon(rdata->cfile->tlink);
+ io_parms.offset = rdata->offset;
+ io_parms.length = rdata->bytes;
+ io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
+ io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
+ io_parms.pid = rdata->pid;
+ rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0);
+ if (rc)
+ return rc;
+
+ buf = (struct smb2_hdr *)rdata->iov.iov_base;
+ /* 4 for rfc1002 length field */
+ rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
+
+ kref_get(&rdata->refcount);
+ rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
+ cifs_readv_receive, smb2_readv_callback,
+ rdata, 0);
+ if (rc) {
+ kref_put(&rdata->refcount, cifs_readdata_release);
+ cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
+ }
+
+ cifs_small_buf_release(buf);
+ return rc;
+}
+
+int
+SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, char **buf, int *buf_type)
+{
+ int resp_buftype, rc = -EACCES;
+ struct smb2_read_rsp *rsp = NULL;
+ struct kvec iov[1];
+
+ *nbytes = 0;
+ rc = smb2_new_read_req(iov, io_parms, 0, 0);
+ if (rc)
+ return rc;
+
+ rc = SendReceive2(xid, io_parms->tcon->ses, iov, 1,
+ &resp_buftype, CIFS_LOG_ERROR);
+
+ rsp = (struct smb2_read_rsp *)iov[0].iov_base;
+
+ if (rsp->hdr.Status == STATUS_END_OF_FILE) {
+ free_rsp_buf(resp_buftype, iov[0].iov_base);
+ return 0;
+ }
+
+ if (rc) {
+ cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
+ cifs_dbg(VFS, "Send error in read = %d\n", rc);
+ } else {
+ *nbytes = le32_to_cpu(rsp->DataLength);
+ if ((*nbytes > CIFS_MAX_MSGSIZE) ||
+ (*nbytes > io_parms->length)) {
+ cifs_dbg(FYI, "bad length %d for count %d\n",
+ *nbytes, io_parms->length);
+ rc = -EIO;
+ *nbytes = 0;
+ }
+ }
+
+ if (*buf) {
+ memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset,
+ *nbytes);
+ free_rsp_buf(resp_buftype, iov[0].iov_base);
+ } else if (resp_buftype != CIFS_NO_BUFFER) {
+ *buf = iov[0].iov_base;
+ if (resp_buftype == CIFS_SMALL_BUFFER)
+ *buf_type = CIFS_SMALL_BUFFER;
+ else if (resp_buftype == CIFS_LARGE_BUFFER)
+ *buf_type = CIFS_LARGE_BUFFER;
+ }
+ return rc;
+}
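+
+/*
+ * Note: if SMB2_read() above is called with *buf == NULL and the server
+ * returns data, the response buffer is handed back to the caller via *buf
+ * and *buf_type (small vs. large buffer) and must be freed by the caller.
+ */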
+
+/*
+ * Check the mid_state and signature on received buffer (if any), and queue the
+ * workqueue completion task.
+ */
+static void
+smb2_writev_callback(struct mid_q_entry *mid)
+{
+ struct cifs_writedata *wdata = mid->callback_data;
+ struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ unsigned int written;
+ struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
+ unsigned int credits_received = 1;
+
+ switch (mid->mid_state) {
+ case MID_RESPONSE_RECEIVED:
+ credits_received = le16_to_cpu(rsp->hdr.CreditRequest);
+ wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
+ if (wdata->result != 0)
+ break;
+
+ written = le32_to_cpu(rsp->DataLength);
+ /*
+ * Mask off high 16 bits when bytes written as returned
+ * by the server is greater than bytes requested by the
+ * client. OS/2 servers are known to set incorrect
+ * CountHigh values.
+ */
+ if (written > wdata->bytes)
+ written &= 0xFFFF;
+
+ if (written < wdata->bytes)
+ wdata->result = -ENOSPC;
+ else
+ wdata->bytes = written;
+ break;
+ case MID_REQUEST_SUBMITTED:
+ case MID_RETRY_NEEDED:
+ wdata->result = -EAGAIN;
+ break;
+ default:
+ wdata->result = -EIO;
+ break;
+ }
+
+ if (wdata->result)
+ cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
+
+ queue_work(cifsiod_wq, &wdata->work);
+ DeleteMidQEntry(mid);
+ add_credits(tcon->ses->server, credits_received, 0);
+}
+
+/* smb2_async_writev - send an async write, and set up mid to handle result */
+int
+smb2_async_writev(struct cifs_writedata *wdata,
+ void (*release)(struct kref *kref))
+{
+ int rc = -EACCES;
+ struct smb2_write_req *req = NULL;
+ struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct kvec iov;
+ struct smb_rqst rqst;
+
+ rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
+ if (rc)
+ goto async_writev_out;
+
+ req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
+
+ req->PersistentFileId = wdata->cfile->fid.persistent_fid;
+ req->VolatileFileId = wdata->cfile->fid.volatile_fid;
+ req->WriteChannelInfoOffset = 0;
+ req->WriteChannelInfoLength = 0;
+ req->Channel = 0;
+ req->Offset = cpu_to_le64(wdata->offset);
+ /* 4 for rfc1002 length field */
+ req->DataOffset = cpu_to_le16(
+ offsetof(struct smb2_write_req, Buffer) - 4);
+ req->RemainingBytes = 0;
+
+ /* 4 for rfc1002 length field and 1 for Buffer */
+ iov.iov_len = get_rfc1002_length(req) + 4 - 1;
+ iov.iov_base = req;
+
+ rqst.rq_iov = &iov;
+ rqst.rq_nvec = 1;
+ rqst.rq_pages = wdata->pages;
+ rqst.rq_npages = wdata->nr_pages;
+ rqst.rq_pagesz = wdata->pagesz;
+ rqst.rq_tailsz = wdata->tailsz;
+
+ cifs_dbg(FYI, "async write at %llu %u bytes\n",
+ wdata->offset, wdata->bytes);
+
+ req->Length = cpu_to_le32(wdata->bytes);
+
+ inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
+
+ kref_get(&wdata->refcount);
+ rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
+ smb2_writev_callback, wdata, 0);
+
+ if (rc) {
+ kref_put(&wdata->refcount, release);
+ cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
+ }
+
+async_writev_out:
+ cifs_small_buf_release(req);
+ return rc;
+}
+
+/*
+ * SMB2_write is passed an iov pointer to a kvec array with n_vec as its
+ * length. n_vec must be at least 1 and gives the number of elements with
+ * data to write, beginning at position 1 in the iov array. The total data
+ * length is specified by io_parms->length.
+ */
+int
+SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, struct kvec *iov, int n_vec)
+{
+ int rc = 0;
+ struct smb2_write_req *req = NULL;
+ struct smb2_write_rsp *rsp = NULL;
+ int resp_buftype;
+ *nbytes = 0;
+
+ if (n_vec < 1)
+ return rc;
+
+ rc = small_smb2_init(SMB2_WRITE, io_parms->tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ if (io_parms->tcon->ses->server == NULL)
+ return -ECONNABORTED;
+
+ req->hdr.ProcessId = cpu_to_le32(io_parms->pid);
+
+ req->PersistentFileId = io_parms->persistent_fid;
+ req->VolatileFileId = io_parms->volatile_fid;
+ req->WriteChannelInfoOffset = 0;
+ req->WriteChannelInfoLength = 0;
+ req->Channel = 0;
+ req->Length = cpu_to_le32(io_parms->length);
+ req->Offset = cpu_to_le64(io_parms->offset);
+ /* 4 for rfc1002 length field */
+ req->DataOffset = cpu_to_le16(
+ offsetof(struct smb2_write_req, Buffer) - 4);
+ req->RemainingBytes = 0;
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field and 1 for Buffer */
+ iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+
+ /* length of entire message including data to be written */
+ inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */);
+
+ rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1,
+ &resp_buftype, 0);
+ rsp = (struct smb2_write_rsp *)iov[0].iov_base;
+
+ if (rc) {
+ cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE);
+ cifs_dbg(VFS, "Send error in write = %d\n", rc);
+ } else
+ *nbytes = le32_to_cpu(rsp->DataLength);
+
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+}
+
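+/*
+ * Count the directory entries returned in a query directory response buffer
+ * by walking the NextEntryOffset chain, remembering a pointer to the last
+ * entry in *lastentry.
+ */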
+static unsigned int
+num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
+{
+ int len;
+ unsigned int entrycount = 0;
+ unsigned int next_offset = 0;
+ FILE_DIRECTORY_INFO *entryptr;
+
+ if (bufstart == NULL)
+ return 0;
+
+ entryptr = (FILE_DIRECTORY_INFO *)bufstart;
+
+ while (1) {
+ entryptr = (FILE_DIRECTORY_INFO *)
+ ((char *)entryptr + next_offset);
+
+ if ((char *)entryptr + size > end_of_buf) {
+ cifs_dbg(VFS, "malformed search entry would overflow\n");
+ break;
+ }
+
+ len = le32_to_cpu(entryptr->FileNameLength);
+ if ((char *)entryptr + len + size > end_of_buf) {
+ cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n",
+ end_of_buf);
+ break;
+ }
+
+ *lastentry = (char *)entryptr;
+ entrycount++;
+
+ next_offset = le32_to_cpu(entryptr->NextEntryOffset);
+ if (!next_offset)
+ break;
+ }
+
+ return entrycount;
+}
+
+/*
+ * Readdir/FindFirst
+ */
+int
+SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, int index,
+ struct cifs_search_info *srch_inf)
+{
+ struct smb2_query_directory_req *req;
+ struct smb2_query_directory_rsp *rsp = NULL;
+ struct kvec iov[2];
+ int rc = 0;
+ int len;
+ int resp_buftype;
+ unsigned char *bufptr;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses = tcon->ses;
+ __le16 asteriks = cpu_to_le16('*');
+ char *end_of_smb;
+ unsigned int output_size = CIFSMaxBufSize;
+ size_t info_buf_size;
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ rc = small_smb2_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ switch (srch_inf->info_level) {
+ case SMB_FIND_FILE_DIRECTORY_INFO:
+ req->FileInformationClass = FILE_DIRECTORY_INFORMATION;
+ info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1;
+ break;
+ case SMB_FIND_FILE_ID_FULL_DIR_INFO:
+ req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION;
+ info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
+ break;
+ default:
+ cifs_dbg(VFS, "info level %u isn't supported\n",
+ srch_inf->info_level);
+ rc = -EINVAL;
+ goto qdir_exit;
+ }
+
+ req->FileIndex = cpu_to_le32(index);
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
+
+ len = 0x2;
+ bufptr = req->Buffer;
+ memcpy(bufptr, &asteriks, len);
+
+ req->FileNameOffset =
+ cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1 - 4);
+ req->FileNameLength = cpu_to_le16(len);
+ /*
+ * BB could be 30 bytes or so longer if we used SMB2 specific
+ * buffer lengths, but this is safe and close enough.
+ */
+ output_size = min_t(unsigned int, output_size, server->maxBuf);
+ output_size = min_t(unsigned int, output_size, 2 << 15);
+ req->OutputBufferLength = cpu_to_le32(output_size);
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for RFC1001 length and 1 for Buffer */
+ iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+
+ iov[1].iov_base = (char *)(req->Buffer);
+ iov[1].iov_len = len;
+
+ inc_rfc1001_len(req, len - 1 /* Buffer */);
+
+ rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0);
+ rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
+
+ if (rc) {
+ cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
+ goto qdir_exit;
+ }
+
+ rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset),
+ le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr,
+ info_buf_size);
+ if (rc)
+ goto qdir_exit;
+
+ srch_inf->unicode = true;
+
+ if (srch_inf->ntwrk_buf_start) {
+ if (srch_inf->smallBuf)
+ cifs_small_buf_release(srch_inf->ntwrk_buf_start);
+ else
+ cifs_buf_release(srch_inf->ntwrk_buf_start);
+ }
+ srch_inf->ntwrk_buf_start = (char *)rsp;
+ srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ +
+ (char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset);
+ /* 4 for rfc1002 length field */
+ end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr;
+ srch_inf->entries_in_buffer =
+ num_entries(srch_inf->srch_entries_start, end_of_smb,
+ &srch_inf->last_entry, info_buf_size);
+ srch_inf->index_of_last_entry += srch_inf->entries_in_buffer;
+ cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n",
+ srch_inf->entries_in_buffer, srch_inf->index_of_last_entry,
+ srch_inf->srch_entries_start, srch_inf->last_entry);
+ if (resp_buftype == CIFS_LARGE_BUFFER)
+ srch_inf->smallBuf = false;
+ else if (resp_buftype == CIFS_SMALL_BUFFER)
+ srch_inf->smallBuf = true;
+ else
+ cifs_dbg(VFS, "illegal search buffer type\n");
+
+ if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
+ srch_inf->endOfSearch = 1;
+ else
+ srch_inf->endOfSearch = 0;
+
+ return rc;
+
+qdir_exit:
+ free_rsp_buf(resp_buftype, rsp);
+ return rc;
+}
+
+static int
+send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, u32 pid, int info_class,
+ unsigned int num, void **data, unsigned int *size)
+{
+ struct smb2_set_info_req *req;
+ struct smb2_set_info_rsp *rsp = NULL;
+ struct kvec *iov;
+ int rc = 0;
+ int resp_buftype;
+ unsigned int i;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses = tcon->ses;
+
+ if (ses && (ses->server))
+ server = ses->server;
+ else
+ return -EIO;
+
+ if (!num)
+ return -EINVAL;
+
+ iov = kmalloc(sizeof(struct kvec) * num, GFP_KERNEL);
+ if (!iov)
+ return -ENOMEM;
+
+ rc = small_smb2_init(SMB2_SET_INFO, tcon, (void **) &req);
+ if (rc) {
+ kfree(iov);
+ return rc;
+ }
+
+ req->hdr.ProcessId = cpu_to_le32(pid);
+
+ req->InfoType = SMB2_O_INFO_FILE;
+ req->FileInfoClass = info_class;
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
+
+ /* 4 for RFC1001 length and 1 for Buffer */
+ req->BufferOffset =
+ cpu_to_le16(sizeof(struct smb2_set_info_req) - 1 - 4);
+ req->BufferLength = cpu_to_le32(*size);
+
+ inc_rfc1001_len(req, *size - 1 /* Buffer */);
+
+ memcpy(req->Buffer, *data, *size);
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for RFC1001 length */
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+ for (i = 1; i < num; i++) {
+ inc_rfc1001_len(req, size[i]);
+ le32_add_cpu(&req->BufferLength, size[i]);
+ iov[i].iov_base = (char *)data[i];
+ iov[i].iov_len = size[i];
+ }
+
+ rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0);
+ rsp = (struct smb2_set_info_rsp *)iov[0].iov_base;
+
+ if (rc != 0)
+ cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
+
+ free_rsp_buf(resp_buftype, rsp);
+ kfree(iov);
+ return rc;
+}
+
+int
+SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
+{
+ struct smb2_file_rename_info info;
+ void **data;
+ unsigned int size[2];
+ int rc;
+ int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
+
+ data = kmalloc(sizeof(void *) * 2, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ info.ReplaceIfExists = 1; /* 1 = replace existing target with new */
+ /* 0 = fail if target already exists */
+ info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */
+ info.FileNameLength = cpu_to_le32(len);
+
+ data[0] = &info;
+ size[0] = sizeof(struct smb2_file_rename_info);
+
+ data[1] = target_file;
+ size[1] = len + 2 /* null */;
+
+ rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ current->tgid, FILE_RENAME_INFORMATION, 2, data,
+ size);
+ kfree(data);
+ return rc;
+}
+
+int
+SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
+{
+ struct smb2_file_link_info info;
+ void **data;
+ unsigned int size[2];
+ int rc;
+ int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
+
+ data = kmalloc(sizeof(void *) * 2, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ info.ReplaceIfExists = 0; /* 1 = replace existing link with new */
+ /* 0 = fail if link already exists */
+ info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */
+ info.FileNameLength = cpu_to_le32(len);
+
+ data[0] = &info;
+ size[0] = sizeof(struct smb2_file_link_info);
+
+ data[1] = target_file;
+ size[1] = len + 2 /* null */;
+
+ rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ current->tgid, FILE_LINK_INFORMATION, 2, data, size);
+ kfree(data);
+ return rc;
+}
+
+int
+SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+ u64 volatile_fid, u32 pid, __le64 *eof)
+{
+ struct smb2_file_eof_info info;
+ void *data;
+ unsigned int size;
+
+ info.EndOfFile = *eof;
+
+ data = &info;
+ size = sizeof(struct smb2_file_eof_info);
+
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid,
+ FILE_END_OF_FILE_INFORMATION, 1, &data, &size);
+}
+
+int
+SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf)
+{
+	unsigned int size = sizeof(FILE_BASIC_INFO);
+
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ current->tgid, FILE_BASIC_INFORMATION, 1,
+ (void **)&buf, &size);
+}
+
+int
+SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
+ const u64 persistent_fid, const u64 volatile_fid,
+ __u8 oplock_level)
+{
+ int rc;
+ struct smb2_oplock_break *req = NULL;
+
+ cifs_dbg(FYI, "SMB2_oplock_break\n");
+ rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
+
+ if (rc)
+ return rc;
+
+ req->VolatileFid = volatile_fid;
+ req->PersistentFid = persistent_fid;
+ req->OplockLevel = oplock_level;
+ req->hdr.CreditRequest = cpu_to_le16(1);
+
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP);
+ /* SMB2 buffer freed by function above */
+
+ if (rc) {
+ cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
+ cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc);
+ }
+
+ return rc;
+}
+
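+/*
+ * Convert an SMB2 FS_FULL_SIZE_INFORMATION reply into kstatfs fields. The
+ * block size is BytesPerSector * SectorsPerAllocationUnit (for example,
+ * 512 byte sectors with 8 sectors per allocation unit gives f_bsize 4096).
+ */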
+static void
+copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf,
+ struct kstatfs *kst)
+{
+ kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) *
+ le32_to_cpu(pfs_inf->SectorsPerAllocationUnit);
+ kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits);
+ kst->f_bfree = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits);
+ kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits);
+ return;
+}
+
+static int
+build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
+ int outbuf_len, u64 persistent_fid, u64 volatile_fid)
+{
+ int rc;
+ struct smb2_query_info_req *req;
+
+ cifs_dbg(FYI, "Query FSInfo level %d\n", level);
+
+ if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
+ return -EIO;
+
+ rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->InfoType = SMB2_O_INFO_FILESYSTEM;
+ req->FileInfoClass = level;
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
+ /* 4 for rfc1002 length field and 1 for pad */
+ req->InputBufferOffset =
+ cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4);
+ req->OutputBufferLength = cpu_to_le32(
+ outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4);
+
+ iov->iov_base = (char *)req;
+ /* 4 for rfc1002 length field */
+ iov->iov_len = get_rfc1002_length(req) + 4;
+ return 0;
+}
+
+int
+SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata)
+{
+ struct smb2_query_info_rsp *rsp = NULL;
+ struct kvec iov;
+ int rc = 0;
+ int resp_buftype;
+ struct cifs_ses *ses = tcon->ses;
+ struct smb2_fs_full_size_info *info = NULL;
+
+ rc = build_qfs_info_req(&iov, tcon, FS_FULL_SIZE_INFORMATION,
+ sizeof(struct smb2_fs_full_size_info),
+ persistent_fid, volatile_fid);
+ if (rc)
+ return rc;
+
+ rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
+ if (rc) {
+ cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+ goto qfsinf_exit;
+ }
+ rsp = (struct smb2_query_info_rsp *)iov.iov_base;
+
+ info = (struct smb2_fs_full_size_info *)(4 /* RFC1001 len */ +
+ le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr);
+ rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset),
+ le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr,
+ sizeof(struct smb2_fs_full_size_info));
+ if (!rc)
+ copy_fs_info_to_kstatfs(info, fsdata);
+
+qfsinf_exit:
+ free_rsp_buf(resp_buftype, iov.iov_base);
+ return rc;
+}
+
+int
+SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, int level)
+{
+ struct smb2_query_info_rsp *rsp = NULL;
+ struct kvec iov;
+ int rc = 0;
+ int resp_buftype, max_len, min_len;
+ struct cifs_ses *ses = tcon->ses;
+ unsigned int rsp_len, offset;
+
+ if (level == FS_DEVICE_INFORMATION) {
+ max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+ min_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+ } else if (level == FS_ATTRIBUTE_INFORMATION) {
+ max_len = sizeof(FILE_SYSTEM_ATTRIBUTE_INFO);
+ min_len = MIN_FS_ATTR_INFO_SIZE;
+ } else if (level == FS_SECTOR_SIZE_INFORMATION) {
+ max_len = sizeof(struct smb3_fs_ss_info);
+ min_len = sizeof(struct smb3_fs_ss_info);
+ } else {
+ cifs_dbg(FYI, "Invalid qfsinfo level %d\n", level);
+ return -EINVAL;
+ }
+
+ rc = build_qfs_info_req(&iov, tcon, level, max_len,
+ persistent_fid, volatile_fid);
+ if (rc)
+ return rc;
+
+ rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
+ if (rc) {
+ cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+ goto qfsattr_exit;
+ }
+ rsp = (struct smb2_query_info_rsp *)iov.iov_base;
+
+ rsp_len = le32_to_cpu(rsp->OutputBufferLength);
+ offset = le16_to_cpu(rsp->OutputBufferOffset);
+ rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len);
+ if (rc)
+ goto qfsattr_exit;
+
+ if (level == FS_ATTRIBUTE_INFORMATION)
+ memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset
+ + (char *)&rsp->hdr, min_t(unsigned int,
+ rsp_len, max_len));
+ else if (level == FS_DEVICE_INFORMATION)
+ memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset
+ + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
+ else if (level == FS_SECTOR_SIZE_INFORMATION) {
+ struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *)
+ (4 /* RFC1001 len */ + offset + (char *)&rsp->hdr);
+ tcon->ss_flags = le32_to_cpu(ss_info->Flags);
+ tcon->perf_sector_size =
+ le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf);
+ }
+
+qfsattr_exit:
+ free_rsp_buf(resp_buftype, iov.iov_base);
+ return rc;
+}
+
+int
+smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid,
+ const __u32 num_lock, struct smb2_lock_element *buf)
+{
+ int rc = 0;
+ struct smb2_lock_req *req = NULL;
+ struct kvec iov[2];
+ int resp_buf_type;
+ unsigned int count;
+
+ cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
+
+ rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req);
+ if (rc)
+ return rc;
+
+ req->hdr.ProcessId = cpu_to_le32(pid);
+ req->LockCount = cpu_to_le16(num_lock);
+
+ req->PersistentFileId = persist_fid;
+ req->VolatileFileId = volatile_fid;
+
+ count = num_lock * sizeof(struct smb2_lock_element);
+ inc_rfc1001_len(req, count - sizeof(struct smb2_lock_element));
+
+ iov[0].iov_base = (char *)req;
+ /* 4 for rfc1002 length field and count for all locks */
+ iov[0].iov_len = get_rfc1002_length(req) + 4 - count;
+ iov[1].iov_base = (char *)buf;
+ iov[1].iov_len = count;
+
+ cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
+ rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
+ if (rc) {
+ cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc);
+ cifs_stats_fail_inc(tcon, SMB2_LOCK_HE);
+ }
+
+ return rc;
+}
+
+int
+SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid,
+ const __u64 length, const __u64 offset, const __u32 lock_flags,
+ const bool wait)
+{
+ struct smb2_lock_element lock;
+
+ lock.Offset = cpu_to_le64(offset);
+ lock.Length = cpu_to_le64(length);
+ lock.Flags = cpu_to_le32(lock_flags);
+ if (!wait && lock_flags != SMB2_LOCKFLAG_UNLOCK)
+ lock.Flags |= cpu_to_le32(SMB2_LOCKFLAG_FAIL_IMMEDIATELY);
+
+ return smb2_lockv(xid, tcon, persist_fid, volatile_fid, pid, 1, &lock);
+}
+
+int
+SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
+ __u8 *lease_key, const __le32 lease_state)
+{
+ int rc;
+ struct smb2_lease_ack *req = NULL;
+
+ cifs_dbg(FYI, "SMB2_lease_break\n");
+ rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
+
+ if (rc)
+ return rc;
+
+ req->hdr.CreditRequest = cpu_to_le16(1);
+ req->StructureSize = cpu_to_le16(36);
+ inc_rfc1001_len(req, 12);
+
+ memcpy(req->LeaseKey, lease_key, 16);
+ req->LeaseState = lease_state;
+
+ rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP);
+ /* SMB2 buffer freed by function above */
+
+ if (rc) {
+ cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
+ cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
+ }
+
+ return rc;
+}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
new file mode 100644
index 00000000000..69f3595d395
--- /dev/null
+++ b/fs/cifs/smb2pdu.h
@@ -0,0 +1,1053 @@
+/*
+ * fs/cifs/smb2pdu.h
+ *
+ * Copyright (c) International Business Machines Corp., 2009, 2013
+ * Etersoft, 2012
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _SMB2PDU_H
+#define _SMB2PDU_H
+
+#include <net/sock.h>
+
+/*
+ * Note that, because we try to use names similar to those in the protocol
+ * specifications, there are many mixed case field names in the structures
+ * below. Although this does not match typical Linux kernel style, it is
+ * necessary to be able to match against the protocol specification.
+ *
+ * SMB2 commands
+ * Some commands have minimal (wct=0, bcc=0) or uninteresting responses
+ * (i.e. no useful data other than the SMB error code itself) and are marked
+ * as such. Knowing this helps avoid response buffer allocations and copies
+ * in some cases.
+ */
+
+/* List of commands in host endian */
+#define SMB2_NEGOTIATE_HE 0x0000
+#define SMB2_SESSION_SETUP_HE 0x0001
+#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */
+#define SMB2_TREE_CONNECT_HE 0x0003
+#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */
+#define SMB2_CREATE_HE 0x0005
+#define SMB2_CLOSE_HE 0x0006
+#define SMB2_FLUSH_HE 0x0007 /* trivial resp */
+#define SMB2_READ_HE 0x0008
+#define SMB2_WRITE_HE 0x0009
+#define SMB2_LOCK_HE 0x000A
+#define SMB2_IOCTL_HE 0x000B
+#define SMB2_CANCEL_HE 0x000C
+#define SMB2_ECHO_HE 0x000D
+#define SMB2_QUERY_DIRECTORY_HE 0x000E
+#define SMB2_CHANGE_NOTIFY_HE 0x000F
+#define SMB2_QUERY_INFO_HE 0x0010
+#define SMB2_SET_INFO_HE 0x0011
+#define SMB2_OPLOCK_BREAK_HE 0x0012
+
+/* The same list in little endian */
+#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
+#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE)
+#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE)
+#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE)
+#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
+#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE)
+#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE)
+#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE)
+#define SMB2_READ cpu_to_le16(SMB2_READ_HE)
+#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE)
+#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE)
+#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE)
+#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE)
+#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE)
+#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
+#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
+#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE)
+#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
+#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
+
+#define NUMBER_OF_SMB2_COMMANDS 0x0013
+
+/* BB FIXME - analyze following length BB */
+#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
+
+#define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe)
+
+/*
+ * SMB2 Header Definition
+ *
+ * "MBZ" : Must be Zero
+ * "BB" : BugBug, Something to check/review/analyze later
+ * "PDU" : "Protocol Data Unit" (ie a network "frame")
+ *
+ */
+
+#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64)
+
+struct smb2_hdr {
+ __be32 smb2_buf_length; /* big endian on wire */
+ /* length is only two or three bytes - with
+ one or two byte type preceding it that MBZ */
+ __u8 ProtocolId[4]; /* 0xFE 'S' 'M' 'B' */
+ __le16 StructureSize; /* 64 */
+ __le16 CreditCharge; /* MBZ */
+ __le32 Status; /* Error from server */
+ __le16 Command;
+ __le16 CreditRequest; /* CreditResponse */
+ __le32 Flags;
+ __le32 NextCommand;
+ __u64 MessageId; /* opaque - so can stay little endian */
+ __le32 ProcessId;
+ __u32 TreeId; /* opaque - so do not make little endian */
+ __u64 SessionId; /* opaque - so do not make little endian */
+ __u8 Signature[16];
+} __packed;
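+
+/*
+ * Editor's illustrative sketch (hypothetical helper, not part of the original
+ * definitions): smb2_buf_length is big endian on the wire and counts the
+ * bytes that follow the 4 byte length field, so the size of the whole frame
+ * can be recovered as below.
+ */
+static inline __u32 smb2_total_frame_len_sketch(const struct smb2_hdr *hdr)
+{
+ /* 4 byte transport length field plus the length it encodes */
+ return 4 + be32_to_cpu(hdr->smb2_buf_length);
+}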
+
+struct smb2_pdu {
+ struct smb2_hdr hdr;
+ __le16 StructureSize2; /* size of wct area (varies, request specific) */
+} __packed;
+
+struct smb2_transform_hdr {
+ __be32 smb2_buf_length; /* big endian on wire */
+ /* the length itself is only two or three bytes - it is
+ preceded by a one or two byte type field that MBZ */
+ __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */
+ __u8 Signature[16];
+ __u8 Nonce[11];
+ __u8 Reserved[5];
+ __le32 OriginalMessageSize;
+ __u16 Reserved1;
+ __le16 EncryptionAlgorithm;
+ __u64 SessionId;
+} __packed;
+
+/* Encryption Algorithms */
+#define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001)
+
+/*
+ * SMB2 flag definitions
+ */
+#define SMB2_FLAGS_SERVER_TO_REDIR __constant_cpu_to_le32(0x00000001)
+#define SMB2_FLAGS_ASYNC_COMMAND __constant_cpu_to_le32(0x00000002)
+#define SMB2_FLAGS_RELATED_OPERATIONS __constant_cpu_to_le32(0x00000004)
+#define SMB2_FLAGS_SIGNED __constant_cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_DFS_OPERATIONS __constant_cpu_to_le32(0x10000000)
+
+/*
+ * Definitions for SMB2 Protocol Data Units (network frames)
+ *
+ * See MS-SMB2.PDF specification for protocol details.
+ * The naming convention is the lower case version of the SMB2
+ * command code name for the struct. Note that structures must be packed.
+ *
+ */
+
+#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9)
+
+struct smb2_err_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __le16 Reserved; /* MBZ */
+ __le32 ByteCount; /* even if zero, at least one byte follows */
+ __u8 ErrorData[1]; /* variable length */
+} __packed;
+
+struct smb2_symlink_err_rsp {
+ __le32 SymLinkLength;
+ __le32 SymLinkErrorTag;
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __le16 UnparsedPathLength;
+ __le16 SubstituteNameOffset;
+ __le16 SubstituteNameLength;
+ __le16 PrintNameOffset;
+ __le16 PrintNameLength;
+ __le32 Flags;
+ __u8 PathBuffer[0];
+} __packed;
+
+#define SMB2_CLIENT_GUID_SIZE 16
+
+struct smb2_negotiate_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 36 */
+ __le16 DialectCount;
+ __le16 SecurityMode;
+ __le16 Reserved; /* MBZ */
+ __le32 Capabilities;
+ __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
+ __le64 ClientStartTime; /* MBZ */
+ __le16 Dialects[1]; /* One dialect (vers=) at a time for now */
+} __packed;
+
+/* Dialects */
+#define SMB20_PROT_ID 0x0202
+#define SMB21_PROT_ID 0x0210
+#define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
+#define BAD_PROT_ID 0xFFFF
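+
+/*
+ * Editor's illustrative sketch (hypothetical helper): with the single entry
+ * Dialects[1] array in smb2_negotiate_req above, a request offering one
+ * dialect could be prepared roughly like this.
+ */
+static inline void smb2_neg_req_set_dialect_sketch(struct smb2_negotiate_req *req,
+ __u16 dialect)
+{
+ req->DialectCount = cpu_to_le16(1);
+ req->Dialects[0] = cpu_to_le16(dialect);
+}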
+
+/* SecurityMode flags */
+#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
+/* Capabilities flags */
+#define SMB2_GLOBAL_CAP_DFS 0x00000001
+#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_LARGE_MTU 0x00000004 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
+/* Internal types */
+#define SMB2_NT_FIND 0x00100000
+#define SMB2_LARGE_FILES 0x00200000
+
+struct smb2_negotiate_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 65 */
+ __le16 SecurityMode;
+ __le16 DialectRevision;
+ __le16 Reserved; /* MBZ */
+ __u8 ServerGUID[16];
+ __le32 Capabilities;
+ __le32 MaxTransactSize;
+ __le32 MaxReadSize;
+ __le32 MaxWriteSize;
+ __le64 SystemTime; /* MBZ */
+ __le64 ServerStartTime;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le32 Reserved2; /* may be any value, ignore */
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+struct smb2_sess_setup_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 25 */
+ __u8 VcNumber;
+ __u8 SecurityMode;
+ __le32 Capabilities;
+ __le32 Channel;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le64 PreviousSessionId;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+/* Currently defined SessionFlags */
+#define SMB2_SESSION_FLAG_IS_GUEST 0x0001
+#define SMB2_SESSION_FLAG_IS_NULL 0x0002
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
+struct smb2_sess_setup_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 SessionFlags;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+struct smb2_logoff_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_logoff_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_tree_connect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 Reserved;
+ __le16 PathOffset;
+ __le16 PathLength;
+ __u8 Buffer[1]; /* variable length */
+} __packed;
+
+struct smb2_tree_connect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 16 */
+ __u8 ShareType; /* see below */
+ __u8 Reserved;
+ __le32 ShareFlags; /* see below */
+ __le32 Capabilities; /* see below */
+ __le32 MaximalAccess;
+} __packed;
+
+/* Possible ShareType values */
+#define SMB2_SHARE_TYPE_DISK 0x01
+#define SMB2_SHARE_TYPE_PIPE 0x02
+#define SMB2_SHARE_TYPE_PRINT 0x03
+
+/*
+ * Possible ShareFlags - exactly one of the first 4 caching flags must be set
+ * (any of the remaining, SHI1005, flags may be set individually or in
+ * combination).
+ */
+#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000
+#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010
+#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020
+#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
+#define SHI1005_FLAGS_DFS 0x00000001
+#define SHI1005_FLAGS_DFS_ROOT 0x00000002
+#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
+#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
+#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
+#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
+#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
+#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
+#define SHI1005_FLAGS_ALL 0x0000FF33
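+
+/*
+ * Editor's illustrative sketch (hypothetical helper): exactly one of the four
+ * caching values above is carried in bits 0x30 of ShareFlags, so the caching
+ * policy granted by the server can be isolated with a simple mask.
+ */
+static inline __u32 smb2_share_caching_mode_sketch(const struct smb2_tree_connect_rsp *rsp)
+{
+ return le32_to_cpu(rsp->ShareFlags) & 0x00000030;
+}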
+
+/* Possible share capabilities */
+#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
+
+struct smb2_tree_disconnect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_tree_disconnect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+/* File Attributes */
+#define FILE_ATTRIBUTE_READONLY 0x00000001
+#define FILE_ATTRIBUTE_HIDDEN 0x00000002
+#define FILE_ATTRIBUTE_SYSTEM 0x00000004
+#define FILE_ATTRIBUTE_DIRECTORY 0x00000010
+#define FILE_ATTRIBUTE_ARCHIVE 0x00000020
+#define FILE_ATTRIBUTE_NORMAL 0x00000080
+#define FILE_ATTRIBUTE_TEMPORARY 0x00000100
+#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200
+#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400
+#define FILE_ATTRIBUTE_COMPRESSED 0x00000800
+#define FILE_ATTRIBUTE_OFFLINE 0x00001000
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000
+#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000
+
+/* Oplock levels */
+#define SMB2_OPLOCK_LEVEL_NONE 0x00
+#define SMB2_OPLOCK_LEVEL_II 0x01
+#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
+#define SMB2_OPLOCK_LEVEL_BATCH 0x09
+#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
+/* Non-spec internal type */
+#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
+
+/* Desired Access Flags */
+#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
+#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002)
+#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004)
+#define FILE_READ_EA_LE cpu_to_le32(0x00000008)
+#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010)
+#define FILE_EXECUTE_LE cpu_to_le32(0x00000020)
+#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080)
+#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100)
+#define FILE_DELETE_LE cpu_to_le32(0x00010000)
+#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000)
+#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000)
+#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000)
+#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000)
+#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
+#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000)
+#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000)
+#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000)
+#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000)
+#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000)
+
+/* ShareAccess Flags */
+#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001)
+#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002)
+#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004)
+#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007)
+
+/* CreateDisposition Flags */
+#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000)
+#define FILE_OPEN_LE cpu_to_le32(0x00000001)
+#define FILE_CREATE_LE cpu_to_le32(0x00000002)
+#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003)
+#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004)
+#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005)
+
+/* CreateOptions Flags */
+#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001)
+/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */
+#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
+#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
+#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008)
+#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010)
+#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE cpu_to_le32(0x00000020)
+#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
+#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
+#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
+#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
+#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
+#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
+#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
+#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
+#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000)
+#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
+#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
+#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000)
+
+#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
+ | FILE_READ_ATTRIBUTES_LE)
+#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
+ | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
+#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
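+
+/*
+ * Editor's illustrative sketch (hypothetical helper): DesiredAccess for an
+ * ordinary read/write open can be composed from the rights groups above;
+ * FILE_DELETE_LE would be OR'ed in when delete-on-close semantics are wanted.
+ */
+static inline __le32 smb2_rw_desired_access_sketch(void)
+{
+ return FILE_READ_RIGHTS_LE | FILE_WRITE_RIGHTS_LE;
+}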
+
+/* Impersonation Levels */
+#define IL_ANONYMOUS cpu_to_le32(0x00000000)
+#define IL_IDENTIFICATION cpu_to_le32(0x00000001)
+#define IL_IMPERSONATION cpu_to_le32(0x00000002)
+#define IL_DELEGATE cpu_to_le32(0x00000003)
+
+/* Create Context Values */
+#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */
+#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
+#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
+#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
+#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
+#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
+#define SMB2_CREATE_REQUEST_LEASE "RqLs"
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
+#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74
+#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9
+
+struct smb2_create_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __u8 SecurityFlags;
+ __u8 RequestedOplockLevel;
+ __le32 ImpersonationLevel;
+ __le64 SmbCreateFlags;
+ __le64 Reserved;
+ __le32 DesiredAccess;
+ __le32 FileAttributes;
+ __le32 ShareAccess;
+ __le32 CreateDisposition;
+ __le32 CreateOptions;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[0];
+} __packed;
+
+struct smb2_create_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 89 */
+ __u8 OplockLevel;
+ __u8 Reserved;
+ __le32 CreateAction;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize;
+ __le64 EndofFile;
+ __le32 FileAttributes;
+ __le32 Reserved2;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[1];
+} __packed;
+
+struct create_context {
+ __le32 Next;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le16 Reserved;
+ __le16 DataOffset;
+ __le32 DataLength;
+ __u8 Buffer[0];
+} __packed;
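+
+/*
+ * Editor's illustrative sketch (hypothetical helper): create contexts are
+ * chained through Next, the byte offset from the start of the current
+ * context to the next one (zero terminates the chain), so a walker could
+ * advance like this.
+ */
+static inline struct create_context *
+smb2_next_create_context_sketch(struct create_context *cc)
+{
+ __u32 next = le32_to_cpu(cc->Next);
+
+ return next ? (struct create_context *)((char *)cc + next) : NULL;
+}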
+
+#define SMB2_LEASE_READ_CACHING_HE 0x01
+#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
+#define SMB2_LEASE_WRITE_CACHING_HE 0x04
+
+#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00)
+#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01)
+#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02)
+#define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04)
+
+#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02)
+
+#define SMB2_LEASE_KEY_SIZE 16
+
+struct lease_context {
+ __le64 LeaseKeyLow;
+ __le64 LeaseKeyHigh;
+ __le32 LeaseState;
+ __le32 LeaseFlags;
+ __le64 LeaseDuration;
+} __packed;
+
+struct lease_context_v2 {
+ __le64 LeaseKeyLow;
+ __le64 LeaseKeyHigh;
+ __le32 LeaseState;
+ __le32 LeaseFlags;
+ __le64 LeaseDuration;
+ __le64 ParentLeaseKeyLow;
+ __le64 ParentLeaseKeyHigh;
+ __le16 Epoch;
+ __le16 Reserved;
+} __packed;
+
+struct create_lease {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct lease_context lcontext;
+} __packed;
+
+struct create_lease_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct lease_context_v2 lcontext;
+ __u8 Pad[4];
+} __packed;
+
+struct create_durable {
+ struct create_context ccontext;
+ __u8 Name[8];
+ union {
+ __u8 Reserved[16];
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ } Data;
+} __packed;
+
+#define COPY_CHUNK_RES_KEY_SIZE 24
+struct resume_key_req {
+ char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
+ __le32 ContextLength; /* MBZ */
+ char Context[0]; /* ignored, Windows sets to 4 bytes of zero */
+} __packed;
+
+/* this goes in the ioctl buffer when doing a copychunk request */
+struct copychunk_ioctl {
+ char SourceKey[COPY_CHUNK_RES_KEY_SIZE];
+ __le32 ChunkCount; /* we are only sending 1 */
+ __le32 Reserved;
+ /* array will only be one chunk long for us */
+ __le64 SourceOffset;
+ __le64 TargetOffset;
+ __le32 Length; /* how many bytes to copy */
+ __u32 Reserved2;
+} __packed;
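+
+/*
+ * Editor's illustrative sketch (hypothetical helper): since only one chunk is
+ * sent, as noted above, the copychunk ioctl buffer could be filled like this.
+ */
+static inline void smb2_fill_copychunk_sketch(struct copychunk_ioctl *cc,
+ const char *resume_key, __u64 src_off,
+ __u64 dst_off, __u32 len)
+{
+ memcpy(cc->SourceKey, resume_key, COPY_CHUNK_RES_KEY_SIZE);
+ cc->ChunkCount = cpu_to_le32(1);
+ cc->Reserved = 0;
+ cc->SourceOffset = cpu_to_le64(src_off);
+ cc->TargetOffset = cpu_to_le64(dst_off);
+ cc->Length = cpu_to_le32(len);
+ cc->Reserved2 = 0;
+}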
+
+struct copychunk_ioctl_rsp {
+ __le32 ChunksWritten;
+ __le32 ChunkBytesWritten;
+ __le32 TotalBytesWritten;
+} __packed;
+
+struct validate_negotiate_info_req {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 DialectCount;
+ __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */
+} __packed;
+
+struct validate_negotiate_info_rsp {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 Dialect; /* Dialect in use for the connection */
+} __packed;
+
+#define RSS_CAPABLE 0x00000001
+#define RDMA_CAPABLE 0x00000002
+
+struct network_interface_info_ioctl_rsp {
+ __le32 Next; /* next interface. zero if this is last one */
+ __le32 IfIndex;
+ __le32 Capability; /* RSS or RDMA Capable */
+ __le32 Reserved;
+ __le64 LinkSpeed;
+ char SockAddr_Storage[128];
+} __packed;
+
+#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
+
+struct compress_ioctl {
+ __le16 CompressionState; /* See cifspdu.h for possible flag values */
+} __packed;
+
+struct smb2_ioctl_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __u16 Reserved;
+ __le32 CtlCode;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 InputOffset;
+ __le32 InputCount;
+ __le32 MaxInputResponse;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 MaxOutputResponse;
+ __le32 Flags;
+ __u32 Reserved2;
+ __u8 Buffer[0];
+} __packed;
+
+struct smb2_ioctl_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __u16 Reserved;
+ __le32 CtlCode;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 InputOffset;
+ __le32 InputCount;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 Flags;
+ __u32 Reserved2;
+ /* char * buffer[] */
+} __packed;
+
+/* Currently defined values for close flags */
+#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
+struct smb2_close_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Flags;
+ __le32 Reserved;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+} __packed;
+
+struct smb2_close_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* 60 */
+ __le16 Flags;
+ __le32 Reserved;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 EndOfFile;
+ __le32 Attributes;
+} __packed;
+
+struct smb2_flush_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Reserved1;
+ __le32 Reserved2;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+} __packed;
+
+struct smb2_flush_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __le16 Reserved;
+} __packed;
+
+/* For read request Flags field below, following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED 0x01
+
+/* Channel field for read and write: exactly one of following flags can be set*/
+#define SMB2_CHANNEL_NONE 0x00000000
+#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000002 /* SMB3.02 or later */
+
+struct smb2_read_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __u8 Padding; /* offset from start of SMB2 header to place read */
+ __u8 Flags; /* MBZ unless SMB3.02 or later */
+ __le32 Length;
+ __le64 Offset;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 MinimumCount;
+ __le32 Channel; /* MBZ except for SMB3 or later */
+ __le32 RemainingBytes;
+ __le16 ReadChannelInfoOffset; /* Reserved MBZ */
+ __le16 ReadChannelInfoLength; /* Reserved MBZ */
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_read_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __u32 Reserved2;
+ __u8 Buffer[1];
+} __packed;
+
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
+
+struct smb2_write_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __le16 DataOffset; /* offset from start of SMB2 header to write data */
+ __le32 Length;
+ __le64 Offset;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le32 Channel; /* Reserved MBZ */
+ __le32 RemainingBytes;
+ __le16 WriteChannelInfoOffset; /* Reserved MBZ */
+ __le16 WriteChannelInfoLength; /* Reserved MBZ */
+ __le32 Flags;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_write_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __u32 Reserved2;
+ __u8 Buffer[1];
+} __packed;
+
+#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001
+#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002
+#define SMB2_LOCKFLAG_UNLOCK 0x0004
+#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010
+
+struct smb2_lock_element {
+ __le64 Offset;
+ __le64 Length;
+ __le32 Flags;
+ __le32 Reserved;
+} __packed;
+
+struct smb2_lock_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 48 */
+ __le16 LockCount;
+ __le32 Reserved;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ /* Followed by at least one */
+ struct smb2_lock_element locks[1];
+} __packed;
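+
+/*
+ * Editor's illustrative sketch (hypothetical helper): a single byte-range
+ * element for a non-blocking shared lock could be filled like this before
+ * being placed in the locks[] array of the request above.
+ */
+static inline void smb2_fill_shared_lock_sketch(struct smb2_lock_element *el,
+ __u64 offset, __u64 length)
+{
+ el->Offset = cpu_to_le64(offset);
+ el->Length = cpu_to_le64(length);
+ el->Flags = cpu_to_le32(SMB2_LOCKFLAG_SHARED_LOCK |
+ SMB2_LOCKFLAG_FAIL_IMMEDIATELY);
+ el->Reserved = 0;
+}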
+
+struct smb2_lock_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_echo_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __u16 Reserved;
+} __packed;
+
+struct smb2_echo_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __u16 Reserved;
+} __packed;
+
+/* search (query_directory) Flags field */
+#define SMB2_RESTART_SCANS 0x01
+#define SMB2_RETURN_SINGLE_ENTRY 0x02
+#define SMB2_INDEX_SPECIFIED 0x04
+#define SMB2_REOPEN 0x10
+
+struct smb2_query_directory_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 33 */
+ __u8 FileInformationClass;
+ __u8 Flags;
+ __le32 FileIndex;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __le16 FileNameOffset;
+ __le16 FileNameLength;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_query_directory_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+/* Possible InfoType values */
+#define SMB2_O_INFO_FILE 0x01
+#define SMB2_O_INFO_FILESYSTEM 0x02
+#define SMB2_O_INFO_SECURITY 0x03
+#define SMB2_O_INFO_QUOTA 0x04
+
+struct smb2_query_info_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 41 */
+ __u8 InfoType;
+ __u8 FileInfoClass;
+ __le32 OutputBufferLength;
+ __le16 InputBufferOffset;
+ __u16 Reserved;
+ __le32 InputBufferLength;
+ __le32 AdditionalInformation;
+ __le32 Flags;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_query_info_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_set_info_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 33 */
+ __u8 InfoType;
+ __u8 FileInfoClass;
+ __le32 BufferLength;
+ __le16 BufferOffset;
+ __u16 Reserved;
+ __le32 AdditionalInformation;
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_set_info_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 2 */
+} __packed;
+
+struct smb2_oplock_break {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __u8 OplockLevel;
+ __u8 Reserved;
+ __le32 Reserved2;
+ __u64 PersistentFid;
+ __u64 VolatileFid;
+} __packed;
+
+#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
+
+struct smb2_lease_break {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 44 */
+ __le16 Reserved;
+ __le32 Flags;
+ __u8 LeaseKey[16];
+ __le32 CurrentLeaseState;
+ __le32 NewLeaseState;
+ __le32 BreakReason;
+ __le32 AccessMaskHint;
+ __le32 ShareMaskHint;
+} __packed;
+
+struct smb2_lease_ack {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 36 */
+ __le16 Reserved;
+ __le32 Flags;
+ __u8 LeaseKey[16];
+ __le32 LeaseState;
+ __le64 LeaseDuration;
+} __packed;
+
+/*
+ * PDU infolevel structure definitions
+ * BB consider moving to a different header
+ */
+
+/* File System Information Classes */
+#define FS_VOLUME_INFORMATION 1 /* Query */
+#define FS_LABEL_INFORMATION 2 /* Local only */
+#define FS_SIZE_INFORMATION 3 /* Query */
+#define FS_DEVICE_INFORMATION 4 /* Query */
+#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
+#define FS_CONTROL_INFORMATION 6 /* Query, Set */
+#define FS_FULL_SIZE_INFORMATION 7 /* Query */
+#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
+#define FS_DRIVER_PATH_INFORMATION 9 /* Local only */
+#define FS_VOLUME_FLAGS_INFORMATION 10 /* Local only */
+#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */
+
+struct smb2_fs_full_size_info {
+ __le64 TotalAllocationUnits;
+ __le64 CallerAvailableAllocationUnits;
+ __le64 ActualAvailableAllocationUnits;
+ __le32 SectorsPerAllocationUnit;
+ __le32 BytesPerSector;
+} __packed;
+
+#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001
+#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
+#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004
+#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008
+
+/* sector size info struct */
+struct smb3_fs_ss_info {
+ __le32 LogicalBytesPerSector;
+ __le32 PhysicalBytesPerSectorForAtomicity;
+ __le32 PhysicalBytesPerSectorForPerf;
+ __le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity;
+ __le32 Flags;
+ __le32 ByteOffsetForSectorAlignment;
+ __le32 ByteOffsetForPartitionAlignment;
+} __packed;
+
+/* partial list of QUERY INFO levels */
+#define FILE_DIRECTORY_INFORMATION 1
+#define FILE_FULL_DIRECTORY_INFORMATION 2
+#define FILE_BOTH_DIRECTORY_INFORMATION 3
+#define FILE_BASIC_INFORMATION 4
+#define FILE_STANDARD_INFORMATION 5
+#define FILE_INTERNAL_INFORMATION 6
+#define FILE_EA_INFORMATION 7
+#define FILE_ACCESS_INFORMATION 8
+#define FILE_NAME_INFORMATION 9
+#define FILE_RENAME_INFORMATION 10
+#define FILE_LINK_INFORMATION 11
+#define FILE_NAMES_INFORMATION 12
+#define FILE_DISPOSITION_INFORMATION 13
+#define FILE_POSITION_INFORMATION 14
+#define FILE_FULL_EA_INFORMATION 15
+#define FILE_MODE_INFORMATION 16
+#define FILE_ALIGNMENT_INFORMATION 17
+#define FILE_ALL_INFORMATION 18
+#define FILE_ALLOCATION_INFORMATION 19
+#define FILE_END_OF_FILE_INFORMATION 20
+#define FILE_ALTERNATE_NAME_INFORMATION 21
+#define FILE_STREAM_INFORMATION 22
+#define FILE_PIPE_INFORMATION 23
+#define FILE_PIPE_LOCAL_INFORMATION 24
+#define FILE_PIPE_REMOTE_INFORMATION 25
+#define FILE_MAILSLOT_QUERY_INFORMATION 26
+#define FILE_MAILSLOT_SET_INFORMATION 27
+#define FILE_COMPRESSION_INFORMATION 28
+#define FILE_OBJECT_ID_INFORMATION 29
+/* Number 30 not defined in documents */
+#define FILE_MOVE_CLUSTER_INFORMATION 31
+#define FILE_QUOTA_INFORMATION 32
+#define FILE_REPARSE_POINT_INFORMATION 33
+#define FILE_NETWORK_OPEN_INFORMATION 34
+#define FILE_ATTRIBUTE_TAG_INFORMATION 35
+#define FILE_TRACKING_INFORMATION 36
+#define FILEID_BOTH_DIRECTORY_INFORMATION 37
+#define FILEID_FULL_DIRECTORY_INFORMATION 38
+#define FILE_VALID_DATA_LENGTH_INFORMATION 39
+#define FILE_SHORT_NAME_INFORMATION 40
+#define FILE_SFIO_RESERVE_INFORMATION 44
+#define FILE_SFIO_VOLUME_INFORMATION 45
+#define FILE_HARD_LINK_INFORMATION 46
+#define FILE_NORMALIZED_NAME_INFORMATION 48
+#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
+#define FILE_STANDARD_LINK_INFORMATION 54
+
+struct smb2_file_internal_info {
+ __le64 IndexNumber;
+} __packed; /* level 6 Query */
+
+struct smb2_file_rename_info { /* encoding of request for level 10 */
+ __u8 ReplaceIfExists; /* 1 = replace existing target with new */
+ /* 0 = fail if target already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
+ __le32 FileNameLength;
+ char FileName[0]; /* New name to be assigned */
+} __packed; /* level 10 Set */
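+
+/*
+ * Editor's illustrative sketch (hypothetical helper): the buffer sent with a
+ * level 10 SET_INFO request is the fixed part above followed by
+ * FileNameLength bytes of UTF-16 name, so its total size is simply:
+ */
+static inline size_t smb2_rename_info_size_sketch(const struct smb2_file_rename_info *info)
+{
+ return sizeof(*info) + le32_to_cpu(info->FileNameLength);
+}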
+
+struct smb2_file_link_info { /* encoding of request for level 11 */
+ __u8 ReplaceIfExists; /* 1 = replace existing link with new */
+ /* 0 = fail if link already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
+ __le32 FileNameLength;
+ char FileName[0]; /* Name to be assigned to new link */
+} __packed; /* level 11 Set */
+
+/*
+ * This level 18, although it uses a struct with the same name, is different
+ * from cifs level 0x107. Level 0x107 has an extra u64 between AccessFlags
+ * and CurrentByteOffset.
+ */
+struct smb2_file_all_info { /* data block encoding of response to level 18 */
+ __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le32 Attributes;
+ __u32 Pad1; /* End of FILE_BASIC_INFO equivalent */
+ __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 EndOfFile; /* size ie offset to first free byte in file */
+ __le32 NumberOfLinks; /* hard links */
+ __u8 DeletePending;
+ __u8 Directory;
+ __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */
+ __le64 IndexNumber;
+ __le32 EASize;
+ __le32 AccessFlags;
+ __le64 CurrentByteOffset;
+ __le32 Mode;
+ __le32 AlignmentRequirement;
+ __le32 FileNameLength;
+ char FileName[1];
+} __packed; /* level 18 Query */
+
+struct smb2_file_eof_info { /* encoding of request for level 20 */
+ __le64 EndOfFile; /* new end of file value */
+} __packed; /* level 20 Set */
+
+#endif /* _SMB2PDU_H */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
new file mode 100644
index 00000000000..0ce48db20a6
--- /dev/null
+++ b/fs/cifs/smb2proto.h
@@ -0,0 +1,168 @@
+/*
+ * fs/cifs/smb2proto.h
+ *
+ * Copyright (c) International Business Machines Corp., 2002, 2011
+ * Etersoft, 2012
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _SMB2PROTO_H
+#define _SMB2PROTO_H
+#include <linux/nls.h>
+#include <linux/key-type.h>
+
+struct statfs;
+struct smb_rqst;
+
+/*
+ *****************************************************************
+ * All Prototypes
+ *****************************************************************
+ */
+extern int map_smb2_to_linux_error(char *buf, bool log_err);
+extern int smb2_check_message(char *buf, unsigned int length);
+extern unsigned int smb2_calc_size(void *buf);
+extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr);
+extern __le16 *cifs_convert_path_to_utf16(const char *from,
+ struct cifs_sb_info *cifs_sb);
+
+extern int smb2_verify_signature(struct smb_rqst *, struct TCP_Server_Info *);
+extern int smb2_check_receive(struct mid_q_entry *mid,
+ struct TCP_Server_Info *server, bool log_error);
+extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
+ struct smb_rqst *rqst);
+extern struct mid_q_entry *smb2_setup_async_request(
+ struct TCP_Server_Info *server, struct smb_rqst *rqst);
+extern int smb2_calc_signature(struct smb_rqst *rqst,
+ struct TCP_Server_Info *server);
+extern int smb3_calc_signature(struct smb_rqst *rqst,
+ struct TCP_Server_Info *server);
+extern void smb2_echo_request(struct work_struct *work);
+extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
+extern bool smb2_is_valid_oplock_break(char *buffer,
+ struct TCP_Server_Info *srv);
+
+extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
+ struct smb2_file_all_info *src);
+extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path, FILE_ALL_INFO *data,
+ bool *adjust_tz, bool *symlink);
+extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *full_path, __u64 size,
+ struct cifs_sb_info *cifs_sb, bool set_alloc);
+extern int smb2_set_file_info(struct inode *inode, const char *full_path,
+ FILE_BASIC_INFO *buf, const unsigned int xid);
+extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *name, struct cifs_sb_info *cifs_sb);
+extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon, const unsigned int xid);
+extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *name, struct cifs_sb_info *cifs_sb);
+extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *name, struct cifs_sb_info *cifs_sb);
+extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
+extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
+
+extern int smb2_open_file(const unsigned int xid,
+ struct cifs_open_parms *oparms,
+ __u32 *oplock, FILE_ALL_INFO *buf);
+extern int smb2_unlock_range(struct cifsFileInfo *cfile,
+ struct file_lock *flock, const unsigned int xid);
+extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile);
+
+/*
+ * SMB2 Worker functions - most of protocol specific implementation details
+ * are contained within these calls.
+ */
+extern int SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses);
+extern int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_cp);
+extern int SMB2_logoff(const unsigned int xid, struct cifs_ses *ses);
+extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
+ const char *tree, struct cifs_tcon *tcon,
+ const struct nls_table *);
+extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
+extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
+ __le16 *path, __u8 *oplock,
+ struct smb2_file_all_info *buf,
+ struct smb2_err_rsp **err_buf);
+extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen,
+ char **out_data, u32 *plen /* returned data len */);
+extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_file_id, u64 volatile_file_id);
+extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_file_id, u64 volatile_file_id);
+extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_file_id, u64 volatile_file_id,
+ struct smb2_file_all_info *data);
+extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ __le64 *uniqueid);
+extern int smb2_async_readv(struct cifs_readdata *rdata);
+extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, char **buf, int *buf_type);
+extern int smb2_async_writev(struct cifs_writedata *wdata,
+ void (*release)(struct kref *kref));
+extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
+ unsigned int *nbytes, struct kvec *iov, int n_vec);
+extern int SMB2_echo(struct TCP_Server_Info *server);
+extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, int index,
+ struct cifs_search_info *srch_inf);
+extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ __le16 *target_file);
+extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ __le16 *target_file);
+extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, u32 pid,
+ __le64 *eof);
+extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ FILE_BASIC_INFO *buf);
+extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid);
+extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
+ const u64 persistent_fid, const u64 volatile_fid,
+ const __u8 oplock_level);
+extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_file_id, u64 volatile_file_id,
+ struct kstatfs *FSData);
+extern int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_file_id, u64 volatile_file_id, int lvl);
+extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u64 persist_fid, const __u64 volatile_fid,
+ const __u32 pid, const __u64 length, const __u64 offset,
+ const __u32 lockFlags, const bool wait);
+extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
+ const __u64 persist_fid, const __u64 volatile_fid,
+ const __u32 pid, const __u32 num_lock,
+ struct smb2_lock_element *buf);
+extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
+ __u8 *lease_key, const __le32 lease_state);
+extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
+
+#endif /* _SMB2PROTO_H */
diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h
new file mode 100644
index 00000000000..3d5f62150de
--- /dev/null
+++ b/fs/cifs/smb2status.h
@@ -0,0 +1,1782 @@
+/*
+ * fs/cifs/smb2status.h
+ *
+ * SMB2 Status code (network error) definitions
+ * Definitions are from MS-ERREF
+ *
+ * Copyright (c) International Business Machines Corp., 2009,2011
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * 0 1 2 3 4 5 6 7 8 9 0 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ * SEV C N <-------Facility--------> <------Error Status Code------>
+ *
+ * C is set if "customer defined" error, N bit is reserved and MBZ
+ */
+
+#define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000)
+#define STATUS_SEVERITY_INFORMATIONAL __constant_cpu_to_le32(0x0001)
+#define STATUS_SEVERITY_WARNING __constant_cpu_to_le32(0x0002)
+#define STATUS_SEVERITY_ERROR __constant_cpu_to_le32(0x0003)
+
+struct ntstatus {
+ /* Facility is the high 12 bits of the following field */
+ __le32 Facility; /* low 2 bits Severity, next is Customer, then rsrvd */
+ __le32 Code;
+};
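+
+/*
+ * Editor's illustrative sketch (hypothetical helper, relying on the byteorder
+ * helpers already available to files that include this header): the severity
+ * class of a status received in the SMB2 header lives in the top two bits.
+ */
+static inline __u32 smb2_status_severity_sketch(__le32 status)
+{
+ return le32_to_cpu(status) >> 30;
+}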
+
+#define STATUS_SUCCESS __constant_cpu_to_le32(0x00000000)
+#define STATUS_WAIT_0 __constant_cpu_to_le32(0x00000000)
+#define STATUS_WAIT_1 __constant_cpu_to_le32(0x00000001)
+#define STATUS_WAIT_2 __constant_cpu_to_le32(0x00000002)
+#define STATUS_WAIT_3 __constant_cpu_to_le32(0x00000003)
+#define STATUS_WAIT_63 __constant_cpu_to_le32(0x0000003F)
+#define STATUS_ABANDONED __constant_cpu_to_le32(0x00000080)
+#define STATUS_ABANDONED_WAIT_0 __constant_cpu_to_le32(0x00000080)
+#define STATUS_ABANDONED_WAIT_63 __constant_cpu_to_le32(0x000000BF)
+#define STATUS_USER_APC __constant_cpu_to_le32(0x000000C0)
+#define STATUS_KERNEL_APC __constant_cpu_to_le32(0x00000100)
+#define STATUS_ALERTED __constant_cpu_to_le32(0x00000101)
+#define STATUS_TIMEOUT __constant_cpu_to_le32(0x00000102)
+#define STATUS_PENDING __constant_cpu_to_le32(0x00000103)
+#define STATUS_REPARSE __constant_cpu_to_le32(0x00000104)
+#define STATUS_MORE_ENTRIES __constant_cpu_to_le32(0x00000105)
+#define STATUS_NOT_ALL_ASSIGNED __constant_cpu_to_le32(0x00000106)
+#define STATUS_SOME_NOT_MAPPED __constant_cpu_to_le32(0x00000107)
+#define STATUS_OPLOCK_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x00000108)
+#define STATUS_VOLUME_MOUNTED __constant_cpu_to_le32(0x00000109)
+#define STATUS_RXACT_COMMITTED __constant_cpu_to_le32(0x0000010A)
+#define STATUS_NOTIFY_CLEANUP __constant_cpu_to_le32(0x0000010B)
+#define STATUS_NOTIFY_ENUM_DIR __constant_cpu_to_le32(0x0000010C)
+#define STATUS_NO_QUOTAS_FOR_ACCOUNT __constant_cpu_to_le32(0x0000010D)
+#define STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED __constant_cpu_to_le32(0x0000010E)
+#define STATUS_PAGE_FAULT_TRANSITION __constant_cpu_to_le32(0x00000110)
+#define STATUS_PAGE_FAULT_DEMAND_ZERO __constant_cpu_to_le32(0x00000111)
+#define STATUS_PAGE_FAULT_COPY_ON_WRITE __constant_cpu_to_le32(0x00000112)
+#define STATUS_PAGE_FAULT_GUARD_PAGE __constant_cpu_to_le32(0x00000113)
+#define STATUS_PAGE_FAULT_PAGING_FILE __constant_cpu_to_le32(0x00000114)
+#define STATUS_CACHE_PAGE_LOCKED __constant_cpu_to_le32(0x00000115)
+#define STATUS_CRASH_DUMP __constant_cpu_to_le32(0x00000116)
+#define STATUS_BUFFER_ALL_ZEROS __constant_cpu_to_le32(0x00000117)
+#define STATUS_REPARSE_OBJECT __constant_cpu_to_le32(0x00000118)
+#define STATUS_RESOURCE_REQUIREMENTS_CHANGED __constant_cpu_to_le32(0x00000119)
+#define STATUS_TRANSLATION_COMPLETE __constant_cpu_to_le32(0x00000120)
+#define STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY __constant_cpu_to_le32(0x00000121)
+#define STATUS_NOTHING_TO_TERMINATE __constant_cpu_to_le32(0x00000122)
+#define STATUS_PROCESS_NOT_IN_JOB __constant_cpu_to_le32(0x00000123)
+#define STATUS_PROCESS_IN_JOB __constant_cpu_to_le32(0x00000124)
+#define STATUS_VOLSNAP_HIBERNATE_READY __constant_cpu_to_le32(0x00000125)
+#define STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY __constant_cpu_to_le32(0x00000126)
+#define STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED __constant_cpu_to_le32(0x00000127)
+#define STATUS_INTERRUPT_STILL_CONNECTED __constant_cpu_to_le32(0x00000128)
+#define STATUS_PROCESS_CLONED __constant_cpu_to_le32(0x00000129)
+#define STATUS_FILE_LOCKED_WITH_ONLY_READERS __constant_cpu_to_le32(0x0000012A)
+#define STATUS_FILE_LOCKED_WITH_WRITERS __constant_cpu_to_le32(0x0000012B)
+#define STATUS_RESOURCEMANAGER_READ_ONLY __constant_cpu_to_le32(0x00000202)
+#define STATUS_WAIT_FOR_OPLOCK __constant_cpu_to_le32(0x00000367)
+#define DBG_EXCEPTION_HANDLED __constant_cpu_to_le32(0x00010001)
+#define DBG_CONTINUE __constant_cpu_to_le32(0x00010002)
+#define STATUS_FLT_IO_COMPLETE __constant_cpu_to_le32(0x001C0001)
+#define STATUS_OBJECT_NAME_EXISTS __constant_cpu_to_le32(0x40000000)
+#define STATUS_THREAD_WAS_SUSPENDED __constant_cpu_to_le32(0x40000001)
+#define STATUS_WORKING_SET_LIMIT_RANGE __constant_cpu_to_le32(0x40000002)
+#define STATUS_IMAGE_NOT_AT_BASE __constant_cpu_to_le32(0x40000003)
+#define STATUS_RXACT_STATE_CREATED __constant_cpu_to_le32(0x40000004)
+#define STATUS_SEGMENT_NOTIFICATION __constant_cpu_to_le32(0x40000005)
+#define STATUS_LOCAL_USER_SESSION_KEY __constant_cpu_to_le32(0x40000006)
+#define STATUS_BAD_CURRENT_DIRECTORY __constant_cpu_to_le32(0x40000007)
+#define STATUS_SERIAL_MORE_WRITES __constant_cpu_to_le32(0x40000008)
+#define STATUS_REGISTRY_RECOVERED __constant_cpu_to_le32(0x40000009)
+#define STATUS_FT_READ_RECOVERY_FROM_BACKUP __constant_cpu_to_le32(0x4000000A)
+#define STATUS_FT_WRITE_RECOVERY __constant_cpu_to_le32(0x4000000B)
+#define STATUS_SERIAL_COUNTER_TIMEOUT __constant_cpu_to_le32(0x4000000C)
+#define STATUS_NULL_LM_PASSWORD __constant_cpu_to_le32(0x4000000D)
+#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH __constant_cpu_to_le32(0x4000000E)
+#define STATUS_RECEIVE_PARTIAL __constant_cpu_to_le32(0x4000000F)
+#define STATUS_RECEIVE_EXPEDITED __constant_cpu_to_le32(0x40000010)
+#define STATUS_RECEIVE_PARTIAL_EXPEDITED __constant_cpu_to_le32(0x40000011)
+#define STATUS_EVENT_DONE __constant_cpu_to_le32(0x40000012)
+#define STATUS_EVENT_PENDING __constant_cpu_to_le32(0x40000013)
+#define STATUS_CHECKING_FILE_SYSTEM __constant_cpu_to_le32(0x40000014)
+#define STATUS_FATAL_APP_EXIT __constant_cpu_to_le32(0x40000015)
+#define STATUS_PREDEFINED_HANDLE __constant_cpu_to_le32(0x40000016)
+#define STATUS_WAS_UNLOCKED __constant_cpu_to_le32(0x40000017)
+#define STATUS_SERVICE_NOTIFICATION __constant_cpu_to_le32(0x40000018)
+#define STATUS_WAS_LOCKED __constant_cpu_to_le32(0x40000019)
+#define STATUS_LOG_HARD_ERROR __constant_cpu_to_le32(0x4000001A)
+#define STATUS_ALREADY_WIN32 __constant_cpu_to_le32(0x4000001B)
+#define STATUS_WX86_UNSIMULATE __constant_cpu_to_le32(0x4000001C)
+#define STATUS_WX86_CONTINUE __constant_cpu_to_le32(0x4000001D)
+#define STATUS_WX86_SINGLE_STEP __constant_cpu_to_le32(0x4000001E)
+#define STATUS_WX86_BREAKPOINT __constant_cpu_to_le32(0x4000001F)
+#define STATUS_WX86_EXCEPTION_CONTINUE __constant_cpu_to_le32(0x40000020)
+#define STATUS_WX86_EXCEPTION_LASTCHANCE __constant_cpu_to_le32(0x40000021)
+#define STATUS_WX86_EXCEPTION_CHAIN __constant_cpu_to_le32(0x40000022)
+#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE __constant_cpu_to_le32(0x40000023)
+#define STATUS_NO_YIELD_PERFORMED __constant_cpu_to_le32(0x40000024)
+#define STATUS_TIMER_RESUME_IGNORED __constant_cpu_to_le32(0x40000025)
+#define STATUS_ARBITRATION_UNHANDLED __constant_cpu_to_le32(0x40000026)
+#define STATUS_CARDBUS_NOT_SUPPORTED __constant_cpu_to_le32(0x40000027)
+#define STATUS_WX86_CREATEWX86TIB __constant_cpu_to_le32(0x40000028)
+#define STATUS_MP_PROCESSOR_MISMATCH __constant_cpu_to_le32(0x40000029)
+#define STATUS_HIBERNATED __constant_cpu_to_le32(0x4000002A)
+#define STATUS_RESUME_HIBERNATION __constant_cpu_to_le32(0x4000002B)
+#define STATUS_FIRMWARE_UPDATED __constant_cpu_to_le32(0x4000002C)
+#define STATUS_DRIVERS_LEAKING_LOCKED_PAGES __constant_cpu_to_le32(0x4000002D)
+#define STATUS_MESSAGE_RETRIEVED __constant_cpu_to_le32(0x4000002E)
+#define STATUS_SYSTEM_POWERSTATE_TRANSITION __constant_cpu_to_le32(0x4000002F)
+#define STATUS_ALPC_CHECK_COMPLETION_LIST __constant_cpu_to_le32(0x40000030)
+#define STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION __constant_cpu_to_le32(0x40000031)
+#define STATUS_ACCESS_AUDIT_BY_POLICY __constant_cpu_to_le32(0x40000032)
+#define STATUS_ABANDON_HIBERFILE __constant_cpu_to_le32(0x40000033)
+#define STATUS_BIZRULES_NOT_ENABLED __constant_cpu_to_le32(0x40000034)
+#define STATUS_WAKE_SYSTEM __constant_cpu_to_le32(0x40000294)
+#define STATUS_DS_SHUTTING_DOWN __constant_cpu_to_le32(0x40000370)
+#define DBG_REPLY_LATER __constant_cpu_to_le32(0x40010001)
+#define DBG_UNABLE_TO_PROVIDE_HANDLE __constant_cpu_to_le32(0x40010002)
+#define DBG_TERMINATE_THREAD __constant_cpu_to_le32(0x40010003)
+#define DBG_TERMINATE_PROCESS __constant_cpu_to_le32(0x40010004)
+#define DBG_CONTROL_C __constant_cpu_to_le32(0x40010005)
+#define DBG_PRINTEXCEPTION_C __constant_cpu_to_le32(0x40010006)
+#define DBG_RIPEXCEPTION __constant_cpu_to_le32(0x40010007)
+#define DBG_CONTROL_BREAK __constant_cpu_to_le32(0x40010008)
+#define DBG_COMMAND_EXCEPTION __constant_cpu_to_le32(0x40010009)
+#define RPC_NT_UUID_LOCAL_ONLY __constant_cpu_to_le32(0x40020056)
+#define RPC_NT_SEND_INCOMPLETE __constant_cpu_to_le32(0x400200AF)
+#define STATUS_CTX_CDM_CONNECT __constant_cpu_to_le32(0x400A0004)
+#define STATUS_CTX_CDM_DISCONNECT __constant_cpu_to_le32(0x400A0005)
+#define STATUS_SXS_RELEASE_ACTIVATION_CONTEXT __constant_cpu_to_le32(0x4015000D)
+#define STATUS_RECOVERY_NOT_NEEDED __constant_cpu_to_le32(0x40190034)
+#define STATUS_RM_ALREADY_STARTED __constant_cpu_to_le32(0x40190035)
+#define STATUS_LOG_NO_RESTART __constant_cpu_to_le32(0x401A000C)
+#define STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST __constant_cpu_to_le32(0x401B00EC)
+#define STATUS_GRAPHICS_PARTIAL_DATA_POPULATED __constant_cpu_to_le32(0x401E000A)
+#define STATUS_GRAPHICS_DRIVER_MISMATCH __constant_cpu_to_le32(0x401E0117)
+#define STATUS_GRAPHICS_MODE_NOT_PINNED __constant_cpu_to_le32(0x401E0307)
+#define STATUS_GRAPHICS_NO_PREFERRED_MODE __constant_cpu_to_le32(0x401E031E)
+#define STATUS_GRAPHICS_DATASET_IS_EMPTY __constant_cpu_to_le32(0x401E034B)
+#define STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET __constant_cpu_to_le32(0x401E034C)
+#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED __constant_cpu_to_le32(0x401E0351)
+#define STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS __constant_cpu_to_le32(0x401E042F)
+#define STATUS_GRAPHICS_LEADLINK_START_DEFERRED __constant_cpu_to_le32(0x401E0437)
+#define STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY __constant_cpu_to_le32(0x401E0439)
+#define STATUS_GRAPHICS_START_DEFERRED __constant_cpu_to_le32(0x401E043A)
+#define STATUS_NDIS_INDICATION_REQUIRED __constant_cpu_to_le32(0x40230001)
+#define STATUS_GUARD_PAGE_VIOLATION __constant_cpu_to_le32(0x80000001)
+#define STATUS_DATATYPE_MISALIGNMENT __constant_cpu_to_le32(0x80000002)
+#define STATUS_BREAKPOINT __constant_cpu_to_le32(0x80000003)
+#define STATUS_SINGLE_STEP __constant_cpu_to_le32(0x80000004)
+#define STATUS_BUFFER_OVERFLOW __constant_cpu_to_le32(0x80000005)
+#define STATUS_NO_MORE_FILES __constant_cpu_to_le32(0x80000006)
+#define STATUS_WAKE_SYSTEM_DEBUGGER __constant_cpu_to_le32(0x80000007)
+#define STATUS_HANDLES_CLOSED __constant_cpu_to_le32(0x8000000A)
+#define STATUS_NO_INHERITANCE __constant_cpu_to_le32(0x8000000B)
+#define STATUS_GUID_SUBSTITUTION_MADE __constant_cpu_to_le32(0x8000000C)
+#define STATUS_PARTIAL_COPY __constant_cpu_to_le32(0x8000000D)
+#define STATUS_DEVICE_PAPER_EMPTY __constant_cpu_to_le32(0x8000000E)
+#define STATUS_DEVICE_POWERED_OFF __constant_cpu_to_le32(0x8000000F)
+#define STATUS_DEVICE_OFF_LINE __constant_cpu_to_le32(0x80000010)
+#define STATUS_DEVICE_BUSY __constant_cpu_to_le32(0x80000011)
+#define STATUS_NO_MORE_EAS __constant_cpu_to_le32(0x80000012)
+#define STATUS_INVALID_EA_NAME __constant_cpu_to_le32(0x80000013)
+#define STATUS_EA_LIST_INCONSISTENT __constant_cpu_to_le32(0x80000014)
+#define STATUS_INVALID_EA_FLAG __constant_cpu_to_le32(0x80000015)
+#define STATUS_VERIFY_REQUIRED __constant_cpu_to_le32(0x80000016)
+#define STATUS_EXTRANEOUS_INFORMATION __constant_cpu_to_le32(0x80000017)
+#define STATUS_RXACT_COMMIT_NECESSARY __constant_cpu_to_le32(0x80000018)
+#define STATUS_NO_MORE_ENTRIES __constant_cpu_to_le32(0x8000001A)
+#define STATUS_FILEMARK_DETECTED __constant_cpu_to_le32(0x8000001B)
+#define STATUS_MEDIA_CHANGED __constant_cpu_to_le32(0x8000001C)
+#define STATUS_BUS_RESET __constant_cpu_to_le32(0x8000001D)
+#define STATUS_END_OF_MEDIA __constant_cpu_to_le32(0x8000001E)
+#define STATUS_BEGINNING_OF_MEDIA __constant_cpu_to_le32(0x8000001F)
+#define STATUS_MEDIA_CHECK __constant_cpu_to_le32(0x80000020)
+#define STATUS_SETMARK_DETECTED __constant_cpu_to_le32(0x80000021)
+#define STATUS_NO_DATA_DETECTED __constant_cpu_to_le32(0x80000022)
+#define STATUS_REDIRECTOR_HAS_OPEN_HANDLES __constant_cpu_to_le32(0x80000023)
+#define STATUS_SERVER_HAS_OPEN_HANDLES __constant_cpu_to_le32(0x80000024)
+#define STATUS_ALREADY_DISCONNECTED __constant_cpu_to_le32(0x80000025)
+#define STATUS_LONGJUMP __constant_cpu_to_le32(0x80000026)
+#define STATUS_CLEANER_CARTRIDGE_INSTALLED __constant_cpu_to_le32(0x80000027)
+#define STATUS_PLUGPLAY_QUERY_VETOED __constant_cpu_to_le32(0x80000028)
+#define STATUS_UNWIND_CONSOLIDATE __constant_cpu_to_le32(0x80000029)
+#define STATUS_REGISTRY_HIVE_RECOVERED __constant_cpu_to_le32(0x8000002A)
+#define STATUS_DLL_MIGHT_BE_INSECURE __constant_cpu_to_le32(0x8000002B)
+#define STATUS_DLL_MIGHT_BE_INCOMPATIBLE __constant_cpu_to_le32(0x8000002C)
+#define STATUS_STOPPED_ON_SYMLINK __constant_cpu_to_le32(0x8000002D)
+#define STATUS_DEVICE_REQUIRES_CLEANING __constant_cpu_to_le32(0x80000288)
+#define STATUS_DEVICE_DOOR_OPEN __constant_cpu_to_le32(0x80000289)
+#define STATUS_DATA_LOST_REPAIR __constant_cpu_to_le32(0x80000803)
+#define DBG_EXCEPTION_NOT_HANDLED __constant_cpu_to_le32(0x80010001)
+#define STATUS_CLUSTER_NODE_ALREADY_UP __constant_cpu_to_le32(0x80130001)
+#define STATUS_CLUSTER_NODE_ALREADY_DOWN __constant_cpu_to_le32(0x80130002)
+#define STATUS_CLUSTER_NETWORK_ALREADY_ONLINE __constant_cpu_to_le32(0x80130003)
+#define STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE __constant_cpu_to_le32(0x80130004)
+#define STATUS_CLUSTER_NODE_ALREADY_MEMBER __constant_cpu_to_le32(0x80130005)
+#define STATUS_COULD_NOT_RESIZE_LOG __constant_cpu_to_le32(0x80190009)
+#define STATUS_NO_TXF_METADATA __constant_cpu_to_le32(0x80190029)
+#define STATUS_CANT_RECOVER_WITH_HANDLE_OPEN __constant_cpu_to_le32(0x80190031)
+#define STATUS_TXF_METADATA_ALREADY_PRESENT __constant_cpu_to_le32(0x80190041)
+#define STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET __constant_cpu_to_le32(0x80190042)
+#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED __constant_cpu_to_le32(0x801B00EB)
+#define STATUS_FLT_BUFFER_TOO_SMALL __constant_cpu_to_le32(0x801C0001)
+#define STATUS_FVE_PARTIAL_METADATA __constant_cpu_to_le32(0x80210001)
+#define STATUS_UNSUCCESSFUL __constant_cpu_to_le32(0xC0000001)
+#define STATUS_NOT_IMPLEMENTED __constant_cpu_to_le32(0xC0000002)
+#define STATUS_INVALID_INFO_CLASS __constant_cpu_to_le32(0xC0000003)
+#define STATUS_INFO_LENGTH_MISMATCH __constant_cpu_to_le32(0xC0000004)
+#define STATUS_ACCESS_VIOLATION __constant_cpu_to_le32(0xC0000005)
+#define STATUS_IN_PAGE_ERROR __constant_cpu_to_le32(0xC0000006)
+#define STATUS_PAGEFILE_QUOTA __constant_cpu_to_le32(0xC0000007)
+#define STATUS_INVALID_HANDLE __constant_cpu_to_le32(0xC0000008)
+#define STATUS_BAD_INITIAL_STACK __constant_cpu_to_le32(0xC0000009)
+#define STATUS_BAD_INITIAL_PC __constant_cpu_to_le32(0xC000000A)
+#define STATUS_INVALID_CID __constant_cpu_to_le32(0xC000000B)
+#define STATUS_TIMER_NOT_CANCELED __constant_cpu_to_le32(0xC000000C)
+#define STATUS_INVALID_PARAMETER __constant_cpu_to_le32(0xC000000D)
+#define STATUS_NO_SUCH_DEVICE __constant_cpu_to_le32(0xC000000E)
+#define STATUS_NO_SUCH_FILE __constant_cpu_to_le32(0xC000000F)
+#define STATUS_INVALID_DEVICE_REQUEST __constant_cpu_to_le32(0xC0000010)
+#define STATUS_END_OF_FILE __constant_cpu_to_le32(0xC0000011)
+#define STATUS_WRONG_VOLUME __constant_cpu_to_le32(0xC0000012)
+#define STATUS_NO_MEDIA_IN_DEVICE __constant_cpu_to_le32(0xC0000013)
+#define STATUS_UNRECOGNIZED_MEDIA __constant_cpu_to_le32(0xC0000014)
+#define STATUS_NONEXISTENT_SECTOR __constant_cpu_to_le32(0xC0000015)
+#define STATUS_MORE_PROCESSING_REQUIRED __constant_cpu_to_le32(0xC0000016)
+#define STATUS_NO_MEMORY __constant_cpu_to_le32(0xC0000017)
+#define STATUS_CONFLICTING_ADDRESSES __constant_cpu_to_le32(0xC0000018)
+#define STATUS_NOT_MAPPED_VIEW __constant_cpu_to_le32(0xC0000019)
+#define STATUS_UNABLE_TO_FREE_VM __constant_cpu_to_le32(0xC000001A)
+#define STATUS_UNABLE_TO_DELETE_SECTION __constant_cpu_to_le32(0xC000001B)
+#define STATUS_INVALID_SYSTEM_SERVICE __constant_cpu_to_le32(0xC000001C)
+#define STATUS_ILLEGAL_INSTRUCTION __constant_cpu_to_le32(0xC000001D)
+#define STATUS_INVALID_LOCK_SEQUENCE __constant_cpu_to_le32(0xC000001E)
+#define STATUS_INVALID_VIEW_SIZE __constant_cpu_to_le32(0xC000001F)
+#define STATUS_INVALID_FILE_FOR_SECTION __constant_cpu_to_le32(0xC0000020)
+#define STATUS_ALREADY_COMMITTED __constant_cpu_to_le32(0xC0000021)
+#define STATUS_ACCESS_DENIED __constant_cpu_to_le32(0xC0000022)
+#define STATUS_BUFFER_TOO_SMALL __constant_cpu_to_le32(0xC0000023)
+#define STATUS_OBJECT_TYPE_MISMATCH __constant_cpu_to_le32(0xC0000024)
+#define STATUS_NONCONTINUABLE_EXCEPTION __constant_cpu_to_le32(0xC0000025)
+#define STATUS_INVALID_DISPOSITION __constant_cpu_to_le32(0xC0000026)
+#define STATUS_UNWIND __constant_cpu_to_le32(0xC0000027)
+#define STATUS_BAD_STACK __constant_cpu_to_le32(0xC0000028)
+#define STATUS_INVALID_UNWIND_TARGET __constant_cpu_to_le32(0xC0000029)
+#define STATUS_NOT_LOCKED __constant_cpu_to_le32(0xC000002A)
+#define STATUS_PARITY_ERROR __constant_cpu_to_le32(0xC000002B)
+#define STATUS_UNABLE_TO_DECOMMIT_VM __constant_cpu_to_le32(0xC000002C)
+#define STATUS_NOT_COMMITTED __constant_cpu_to_le32(0xC000002D)
+#define STATUS_INVALID_PORT_ATTRIBUTES __constant_cpu_to_le32(0xC000002E)
+#define STATUS_PORT_MESSAGE_TOO_LONG __constant_cpu_to_le32(0xC000002F)
+#define STATUS_INVALID_PARAMETER_MIX __constant_cpu_to_le32(0xC0000030)
+#define STATUS_INVALID_QUOTA_LOWER __constant_cpu_to_le32(0xC0000031)
+#define STATUS_DISK_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000032)
+#define STATUS_OBJECT_NAME_INVALID __constant_cpu_to_le32(0xC0000033)
+#define STATUS_OBJECT_NAME_NOT_FOUND __constant_cpu_to_le32(0xC0000034)
+#define STATUS_OBJECT_NAME_COLLISION __constant_cpu_to_le32(0xC0000035)
+#define STATUS_PORT_DISCONNECTED __constant_cpu_to_le32(0xC0000037)
+#define STATUS_DEVICE_ALREADY_ATTACHED __constant_cpu_to_le32(0xC0000038)
+#define STATUS_OBJECT_PATH_INVALID __constant_cpu_to_le32(0xC0000039)
+#define STATUS_OBJECT_PATH_NOT_FOUND __constant_cpu_to_le32(0xC000003A)
+#define STATUS_OBJECT_PATH_SYNTAX_BAD __constant_cpu_to_le32(0xC000003B)
+#define STATUS_DATA_OVERRUN __constant_cpu_to_le32(0xC000003C)
+#define STATUS_DATA_LATE_ERROR __constant_cpu_to_le32(0xC000003D)
+#define STATUS_DATA_ERROR __constant_cpu_to_le32(0xC000003E)
+#define STATUS_CRC_ERROR __constant_cpu_to_le32(0xC000003F)
+#define STATUS_SECTION_TOO_BIG __constant_cpu_to_le32(0xC0000040)
+#define STATUS_PORT_CONNECTION_REFUSED __constant_cpu_to_le32(0xC0000041)
+#define STATUS_INVALID_PORT_HANDLE __constant_cpu_to_le32(0xC0000042)
+#define STATUS_SHARING_VIOLATION __constant_cpu_to_le32(0xC0000043)
+#define STATUS_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000044)
+#define STATUS_INVALID_PAGE_PROTECTION __constant_cpu_to_le32(0xC0000045)
+#define STATUS_MUTANT_NOT_OWNED __constant_cpu_to_le32(0xC0000046)
+#define STATUS_SEMAPHORE_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC0000047)
+#define STATUS_PORT_ALREADY_SET __constant_cpu_to_le32(0xC0000048)
+#define STATUS_SECTION_NOT_IMAGE __constant_cpu_to_le32(0xC0000049)
+#define STATUS_SUSPEND_COUNT_EXCEEDED __constant_cpu_to_le32(0xC000004A)
+#define STATUS_THREAD_IS_TERMINATING __constant_cpu_to_le32(0xC000004B)
+#define STATUS_BAD_WORKING_SET_LIMIT __constant_cpu_to_le32(0xC000004C)
+#define STATUS_INCOMPATIBLE_FILE_MAP __constant_cpu_to_le32(0xC000004D)
+#define STATUS_SECTION_PROTECTION __constant_cpu_to_le32(0xC000004E)
+#define STATUS_EAS_NOT_SUPPORTED __constant_cpu_to_le32(0xC000004F)
+#define STATUS_EA_TOO_LARGE __constant_cpu_to_le32(0xC0000050)
+#define STATUS_NONEXISTENT_EA_ENTRY __constant_cpu_to_le32(0xC0000051)
+#define STATUS_NO_EAS_ON_FILE __constant_cpu_to_le32(0xC0000052)
+#define STATUS_EA_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000053)
+#define STATUS_FILE_LOCK_CONFLICT __constant_cpu_to_le32(0xC0000054)
+#define STATUS_LOCK_NOT_GRANTED __constant_cpu_to_le32(0xC0000055)
+#define STATUS_DELETE_PENDING __constant_cpu_to_le32(0xC0000056)
+#define STATUS_CTL_FILE_NOT_SUPPORTED __constant_cpu_to_le32(0xC0000057)
+#define STATUS_UNKNOWN_REVISION __constant_cpu_to_le32(0xC0000058)
+#define STATUS_REVISION_MISMATCH __constant_cpu_to_le32(0xC0000059)
+#define STATUS_INVALID_OWNER __constant_cpu_to_le32(0xC000005A)
+#define STATUS_INVALID_PRIMARY_GROUP __constant_cpu_to_le32(0xC000005B)
+#define STATUS_NO_IMPERSONATION_TOKEN __constant_cpu_to_le32(0xC000005C)
+#define STATUS_CANT_DISABLE_MANDATORY __constant_cpu_to_le32(0xC000005D)
+#define STATUS_NO_LOGON_SERVERS __constant_cpu_to_le32(0xC000005E)
+#define STATUS_NO_SUCH_LOGON_SESSION __constant_cpu_to_le32(0xC000005F)
+#define STATUS_NO_SUCH_PRIVILEGE __constant_cpu_to_le32(0xC0000060)
+#define STATUS_PRIVILEGE_NOT_HELD __constant_cpu_to_le32(0xC0000061)
+#define STATUS_INVALID_ACCOUNT_NAME __constant_cpu_to_le32(0xC0000062)
+#define STATUS_USER_EXISTS __constant_cpu_to_le32(0xC0000063)
+#define STATUS_NO_SUCH_USER __constant_cpu_to_le32(0xC0000064)
+#define STATUS_GROUP_EXISTS __constant_cpu_to_le32(0xC0000065)
+#define STATUS_NO_SUCH_GROUP __constant_cpu_to_le32(0xC0000066)
+#define STATUS_MEMBER_IN_GROUP __constant_cpu_to_le32(0xC0000067)
+#define STATUS_MEMBER_NOT_IN_GROUP __constant_cpu_to_le32(0xC0000068)
+#define STATUS_LAST_ADMIN __constant_cpu_to_le32(0xC0000069)
+#define STATUS_WRONG_PASSWORD __constant_cpu_to_le32(0xC000006A)
+#define STATUS_ILL_FORMED_PASSWORD __constant_cpu_to_le32(0xC000006B)
+#define STATUS_PASSWORD_RESTRICTION __constant_cpu_to_le32(0xC000006C)
+#define STATUS_LOGON_FAILURE __constant_cpu_to_le32(0xC000006D)
+#define STATUS_ACCOUNT_RESTRICTION __constant_cpu_to_le32(0xC000006E)
+#define STATUS_INVALID_LOGON_HOURS __constant_cpu_to_le32(0xC000006F)
+#define STATUS_INVALID_WORKSTATION __constant_cpu_to_le32(0xC0000070)
+#define STATUS_PASSWORD_EXPIRED __constant_cpu_to_le32(0xC0000071)
+#define STATUS_ACCOUNT_DISABLED __constant_cpu_to_le32(0xC0000072)
+#define STATUS_NONE_MAPPED __constant_cpu_to_le32(0xC0000073)
+#define STATUS_TOO_MANY_LUIDS_REQUESTED __constant_cpu_to_le32(0xC0000074)
+#define STATUS_LUIDS_EXHAUSTED __constant_cpu_to_le32(0xC0000075)
+#define STATUS_INVALID_SUB_AUTHORITY __constant_cpu_to_le32(0xC0000076)
+#define STATUS_INVALID_ACL __constant_cpu_to_le32(0xC0000077)
+#define STATUS_INVALID_SID __constant_cpu_to_le32(0xC0000078)
+#define STATUS_INVALID_SECURITY_DESCR __constant_cpu_to_le32(0xC0000079)
+#define STATUS_PROCEDURE_NOT_FOUND __constant_cpu_to_le32(0xC000007A)
+#define STATUS_INVALID_IMAGE_FORMAT __constant_cpu_to_le32(0xC000007B)
+#define STATUS_NO_TOKEN __constant_cpu_to_le32(0xC000007C)
+#define STATUS_BAD_INHERITANCE_ACL __constant_cpu_to_le32(0xC000007D)
+#define STATUS_RANGE_NOT_LOCKED __constant_cpu_to_le32(0xC000007E)
+#define STATUS_DISK_FULL __constant_cpu_to_le32(0xC000007F)
+#define STATUS_SERVER_DISABLED __constant_cpu_to_le32(0xC0000080)
+#define STATUS_SERVER_NOT_DISABLED __constant_cpu_to_le32(0xC0000081)
+#define STATUS_TOO_MANY_GUIDS_REQUESTED __constant_cpu_to_le32(0xC0000082)
+#define STATUS_GUIDS_EXHAUSTED __constant_cpu_to_le32(0xC0000083)
+#define STATUS_INVALID_ID_AUTHORITY __constant_cpu_to_le32(0xC0000084)
+#define STATUS_AGENTS_EXHAUSTED __constant_cpu_to_le32(0xC0000085)
+#define STATUS_INVALID_VOLUME_LABEL __constant_cpu_to_le32(0xC0000086)
+#define STATUS_SECTION_NOT_EXTENDED __constant_cpu_to_le32(0xC0000087)
+#define STATUS_NOT_MAPPED_DATA __constant_cpu_to_le32(0xC0000088)
+#define STATUS_RESOURCE_DATA_NOT_FOUND __constant_cpu_to_le32(0xC0000089)
+#define STATUS_RESOURCE_TYPE_NOT_FOUND __constant_cpu_to_le32(0xC000008A)
+#define STATUS_RESOURCE_NAME_NOT_FOUND __constant_cpu_to_le32(0xC000008B)
+#define STATUS_ARRAY_BOUNDS_EXCEEDED __constant_cpu_to_le32(0xC000008C)
+#define STATUS_FLOAT_DENORMAL_OPERAND __constant_cpu_to_le32(0xC000008D)
+#define STATUS_FLOAT_DIVIDE_BY_ZERO __constant_cpu_to_le32(0xC000008E)
+#define STATUS_FLOAT_INEXACT_RESULT __constant_cpu_to_le32(0xC000008F)
+#define STATUS_FLOAT_INVALID_OPERATION __constant_cpu_to_le32(0xC0000090)
+#define STATUS_FLOAT_OVERFLOW __constant_cpu_to_le32(0xC0000091)
+#define STATUS_FLOAT_STACK_CHECK __constant_cpu_to_le32(0xC0000092)
+#define STATUS_FLOAT_UNDERFLOW __constant_cpu_to_le32(0xC0000093)
+#define STATUS_INTEGER_DIVIDE_BY_ZERO __constant_cpu_to_le32(0xC0000094)
+#define STATUS_INTEGER_OVERFLOW __constant_cpu_to_le32(0xC0000095)
+#define STATUS_PRIVILEGED_INSTRUCTION __constant_cpu_to_le32(0xC0000096)
+#define STATUS_TOO_MANY_PAGING_FILES __constant_cpu_to_le32(0xC0000097)
+#define STATUS_FILE_INVALID __constant_cpu_to_le32(0xC0000098)
+#define STATUS_ALLOTTED_SPACE_EXCEEDED __constant_cpu_to_le32(0xC0000099)
+#define STATUS_INSUFFICIENT_RESOURCES __constant_cpu_to_le32(0xC000009A)
+#define STATUS_DFS_EXIT_PATH_FOUND __constant_cpu_to_le32(0xC000009B)
+#define STATUS_DEVICE_DATA_ERROR __constant_cpu_to_le32(0xC000009C)
+#define STATUS_DEVICE_NOT_CONNECTED __constant_cpu_to_le32(0xC000009D)
+#define STATUS_DEVICE_POWER_FAILURE __constant_cpu_to_le32(0xC000009E)
+#define STATUS_FREE_VM_NOT_AT_BASE __constant_cpu_to_le32(0xC000009F)
+#define STATUS_MEMORY_NOT_ALLOCATED __constant_cpu_to_le32(0xC00000A0)
+#define STATUS_WORKING_SET_QUOTA __constant_cpu_to_le32(0xC00000A1)
+#define STATUS_MEDIA_WRITE_PROTECTED __constant_cpu_to_le32(0xC00000A2)
+#define STATUS_DEVICE_NOT_READY __constant_cpu_to_le32(0xC00000A3)
+#define STATUS_INVALID_GROUP_ATTRIBUTES __constant_cpu_to_le32(0xC00000A4)
+#define STATUS_BAD_IMPERSONATION_LEVEL __constant_cpu_to_le32(0xC00000A5)
+#define STATUS_CANT_OPEN_ANONYMOUS __constant_cpu_to_le32(0xC00000A6)
+#define STATUS_BAD_VALIDATION_CLASS __constant_cpu_to_le32(0xC00000A7)
+#define STATUS_BAD_TOKEN_TYPE __constant_cpu_to_le32(0xC00000A8)
+#define STATUS_BAD_MASTER_BOOT_RECORD __constant_cpu_to_le32(0xC00000A9)
+#define STATUS_INSTRUCTION_MISALIGNMENT __constant_cpu_to_le32(0xC00000AA)
+#define STATUS_INSTANCE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00000AB)
+#define STATUS_PIPE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00000AC)
+#define STATUS_INVALID_PIPE_STATE __constant_cpu_to_le32(0xC00000AD)
+#define STATUS_PIPE_BUSY __constant_cpu_to_le32(0xC00000AE)
+#define STATUS_ILLEGAL_FUNCTION __constant_cpu_to_le32(0xC00000AF)
+#define STATUS_PIPE_DISCONNECTED __constant_cpu_to_le32(0xC00000B0)
+#define STATUS_PIPE_CLOSING __constant_cpu_to_le32(0xC00000B1)
+#define STATUS_PIPE_CONNECTED __constant_cpu_to_le32(0xC00000B2)
+#define STATUS_PIPE_LISTENING __constant_cpu_to_le32(0xC00000B3)
+#define STATUS_INVALID_READ_MODE __constant_cpu_to_le32(0xC00000B4)
+#define STATUS_IO_TIMEOUT __constant_cpu_to_le32(0xC00000B5)
+#define STATUS_FILE_FORCED_CLOSED __constant_cpu_to_le32(0xC00000B6)
+#define STATUS_PROFILING_NOT_STARTED __constant_cpu_to_le32(0xC00000B7)
+#define STATUS_PROFILING_NOT_STOPPED __constant_cpu_to_le32(0xC00000B8)
+#define STATUS_COULD_NOT_INTERPRET __constant_cpu_to_le32(0xC00000B9)
+#define STATUS_FILE_IS_A_DIRECTORY __constant_cpu_to_le32(0xC00000BA)
+#define STATUS_NOT_SUPPORTED __constant_cpu_to_le32(0xC00000BB)
+#define STATUS_REMOTE_NOT_LISTENING __constant_cpu_to_le32(0xC00000BC)
+#define STATUS_DUPLICATE_NAME __constant_cpu_to_le32(0xC00000BD)
+#define STATUS_BAD_NETWORK_PATH __constant_cpu_to_le32(0xC00000BE)
+#define STATUS_NETWORK_BUSY __constant_cpu_to_le32(0xC00000BF)
+#define STATUS_DEVICE_DOES_NOT_EXIST __constant_cpu_to_le32(0xC00000C0)
+#define STATUS_TOO_MANY_COMMANDS __constant_cpu_to_le32(0xC00000C1)
+#define STATUS_ADAPTER_HARDWARE_ERROR __constant_cpu_to_le32(0xC00000C2)
+#define STATUS_INVALID_NETWORK_RESPONSE __constant_cpu_to_le32(0xC00000C3)
+#define STATUS_UNEXPECTED_NETWORK_ERROR __constant_cpu_to_le32(0xC00000C4)
+#define STATUS_BAD_REMOTE_ADAPTER __constant_cpu_to_le32(0xC00000C5)
+#define STATUS_PRINT_QUEUE_FULL __constant_cpu_to_le32(0xC00000C6)
+#define STATUS_NO_SPOOL_SPACE __constant_cpu_to_le32(0xC00000C7)
+#define STATUS_PRINT_CANCELLED __constant_cpu_to_le32(0xC00000C8)
+#define STATUS_NETWORK_NAME_DELETED __constant_cpu_to_le32(0xC00000C9)
+#define STATUS_NETWORK_ACCESS_DENIED __constant_cpu_to_le32(0xC00000CA)
+#define STATUS_BAD_DEVICE_TYPE __constant_cpu_to_le32(0xC00000CB)
+#define STATUS_BAD_NETWORK_NAME __constant_cpu_to_le32(0xC00000CC)
+#define STATUS_TOO_MANY_NAMES __constant_cpu_to_le32(0xC00000CD)
+#define STATUS_TOO_MANY_SESSIONS __constant_cpu_to_le32(0xC00000CE)
+#define STATUS_SHARING_PAUSED __constant_cpu_to_le32(0xC00000CF)
+#define STATUS_REQUEST_NOT_ACCEPTED __constant_cpu_to_le32(0xC00000D0)
+#define STATUS_REDIRECTOR_PAUSED __constant_cpu_to_le32(0xC00000D1)
+#define STATUS_NET_WRITE_FAULT __constant_cpu_to_le32(0xC00000D2)
+#define STATUS_PROFILING_AT_LIMIT __constant_cpu_to_le32(0xC00000D3)
+#define STATUS_NOT_SAME_DEVICE __constant_cpu_to_le32(0xC00000D4)
+#define STATUS_FILE_RENAMED __constant_cpu_to_le32(0xC00000D5)
+#define STATUS_VIRTUAL_CIRCUIT_CLOSED __constant_cpu_to_le32(0xC00000D6)
+#define STATUS_NO_SECURITY_ON_OBJECT __constant_cpu_to_le32(0xC00000D7)
+#define STATUS_CANT_WAIT __constant_cpu_to_le32(0xC00000D8)
+#define STATUS_PIPE_EMPTY __constant_cpu_to_le32(0xC00000D9)
+#define STATUS_CANT_ACCESS_DOMAIN_INFO __constant_cpu_to_le32(0xC00000DA)
+#define STATUS_CANT_TERMINATE_SELF __constant_cpu_to_le32(0xC00000DB)
+#define STATUS_INVALID_SERVER_STATE __constant_cpu_to_le32(0xC00000DC)
+#define STATUS_INVALID_DOMAIN_STATE __constant_cpu_to_le32(0xC00000DD)
+#define STATUS_INVALID_DOMAIN_ROLE __constant_cpu_to_le32(0xC00000DE)
+#define STATUS_NO_SUCH_DOMAIN __constant_cpu_to_le32(0xC00000DF)
+#define STATUS_DOMAIN_EXISTS __constant_cpu_to_le32(0xC00000E0)
+#define STATUS_DOMAIN_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC00000E1)
+#define STATUS_OPLOCK_NOT_GRANTED __constant_cpu_to_le32(0xC00000E2)
+#define STATUS_INVALID_OPLOCK_PROTOCOL __constant_cpu_to_le32(0xC00000E3)
+#define STATUS_INTERNAL_DB_CORRUPTION __constant_cpu_to_le32(0xC00000E4)
+#define STATUS_INTERNAL_ERROR __constant_cpu_to_le32(0xC00000E5)
+#define STATUS_GENERIC_NOT_MAPPED __constant_cpu_to_le32(0xC00000E6)
+#define STATUS_BAD_DESCRIPTOR_FORMAT __constant_cpu_to_le32(0xC00000E7)
+#define STATUS_INVALID_USER_BUFFER __constant_cpu_to_le32(0xC00000E8)
+#define STATUS_UNEXPECTED_IO_ERROR __constant_cpu_to_le32(0xC00000E9)
+#define STATUS_UNEXPECTED_MM_CREATE_ERR __constant_cpu_to_le32(0xC00000EA)
+#define STATUS_UNEXPECTED_MM_MAP_ERROR __constant_cpu_to_le32(0xC00000EB)
+#define STATUS_UNEXPECTED_MM_EXTEND_ERR __constant_cpu_to_le32(0xC00000EC)
+#define STATUS_NOT_LOGON_PROCESS __constant_cpu_to_le32(0xC00000ED)
+#define STATUS_LOGON_SESSION_EXISTS __constant_cpu_to_le32(0xC00000EE)
+#define STATUS_INVALID_PARAMETER_1 __constant_cpu_to_le32(0xC00000EF)
+#define STATUS_INVALID_PARAMETER_2 __constant_cpu_to_le32(0xC00000F0)
+#define STATUS_INVALID_PARAMETER_3 __constant_cpu_to_le32(0xC00000F1)
+#define STATUS_INVALID_PARAMETER_4 __constant_cpu_to_le32(0xC00000F2)
+#define STATUS_INVALID_PARAMETER_5 __constant_cpu_to_le32(0xC00000F3)
+#define STATUS_INVALID_PARAMETER_6 __constant_cpu_to_le32(0xC00000F4)
+#define STATUS_INVALID_PARAMETER_7 __constant_cpu_to_le32(0xC00000F5)
+#define STATUS_INVALID_PARAMETER_8 __constant_cpu_to_le32(0xC00000F6)
+#define STATUS_INVALID_PARAMETER_9 __constant_cpu_to_le32(0xC00000F7)
+#define STATUS_INVALID_PARAMETER_10 __constant_cpu_to_le32(0xC00000F8)
+#define STATUS_INVALID_PARAMETER_11 __constant_cpu_to_le32(0xC00000F9)
+#define STATUS_INVALID_PARAMETER_12 __constant_cpu_to_le32(0xC00000FA)
+#define STATUS_REDIRECTOR_NOT_STARTED __constant_cpu_to_le32(0xC00000FB)
+#define STATUS_REDIRECTOR_STARTED __constant_cpu_to_le32(0xC00000FC)
+#define STATUS_STACK_OVERFLOW __constant_cpu_to_le32(0xC00000FD)
+#define STATUS_NO_SUCH_PACKAGE __constant_cpu_to_le32(0xC00000FE)
+#define STATUS_BAD_FUNCTION_TABLE __constant_cpu_to_le32(0xC00000FF)
+#define STATUS_VARIABLE_NOT_FOUND __constant_cpu_to_le32(0xC0000100)
+#define STATUS_DIRECTORY_NOT_EMPTY __constant_cpu_to_le32(0xC0000101)
+#define STATUS_FILE_CORRUPT_ERROR __constant_cpu_to_le32(0xC0000102)
+#define STATUS_NOT_A_DIRECTORY __constant_cpu_to_le32(0xC0000103)
+#define STATUS_BAD_LOGON_SESSION_STATE __constant_cpu_to_le32(0xC0000104)
+#define STATUS_LOGON_SESSION_COLLISION __constant_cpu_to_le32(0xC0000105)
+#define STATUS_NAME_TOO_LONG __constant_cpu_to_le32(0xC0000106)
+#define STATUS_FILES_OPEN __constant_cpu_to_le32(0xC0000107)
+#define STATUS_CONNECTION_IN_USE __constant_cpu_to_le32(0xC0000108)
+#define STATUS_MESSAGE_NOT_FOUND __constant_cpu_to_le32(0xC0000109)
+#define STATUS_PROCESS_IS_TERMINATING __constant_cpu_to_le32(0xC000010A)
+#define STATUS_INVALID_LOGON_TYPE __constant_cpu_to_le32(0xC000010B)
+#define STATUS_NO_GUID_TRANSLATION __constant_cpu_to_le32(0xC000010C)
+#define STATUS_CANNOT_IMPERSONATE __constant_cpu_to_le32(0xC000010D)
+#define STATUS_IMAGE_ALREADY_LOADED __constant_cpu_to_le32(0xC000010E)
+#define STATUS_ABIOS_NOT_PRESENT __constant_cpu_to_le32(0xC000010F)
+#define STATUS_ABIOS_LID_NOT_EXIST __constant_cpu_to_le32(0xC0000110)
+#define STATUS_ABIOS_LID_ALREADY_OWNED __constant_cpu_to_le32(0xC0000111)
+#define STATUS_ABIOS_NOT_LID_OWNER __constant_cpu_to_le32(0xC0000112)
+#define STATUS_ABIOS_INVALID_COMMAND __constant_cpu_to_le32(0xC0000113)
+#define STATUS_ABIOS_INVALID_LID __constant_cpu_to_le32(0xC0000114)
+#define STATUS_ABIOS_SELECTOR_NOT_AVAILABLE __constant_cpu_to_le32(0xC0000115)
+#define STATUS_ABIOS_INVALID_SELECTOR __constant_cpu_to_le32(0xC0000116)
+#define STATUS_NO_LDT __constant_cpu_to_le32(0xC0000117)
+#define STATUS_INVALID_LDT_SIZE __constant_cpu_to_le32(0xC0000118)
+#define STATUS_INVALID_LDT_OFFSET __constant_cpu_to_le32(0xC0000119)
+#define STATUS_INVALID_LDT_DESCRIPTOR __constant_cpu_to_le32(0xC000011A)
+#define STATUS_INVALID_IMAGE_NE_FORMAT __constant_cpu_to_le32(0xC000011B)
+#define STATUS_RXACT_INVALID_STATE __constant_cpu_to_le32(0xC000011C)
+#define STATUS_RXACT_COMMIT_FAILURE __constant_cpu_to_le32(0xC000011D)
+#define STATUS_MAPPED_FILE_SIZE_ZERO __constant_cpu_to_le32(0xC000011E)
+#define STATUS_TOO_MANY_OPENED_FILES __constant_cpu_to_le32(0xC000011F)
+#define STATUS_CANCELLED __constant_cpu_to_le32(0xC0000120)
+#define STATUS_CANNOT_DELETE __constant_cpu_to_le32(0xC0000121)
+#define STATUS_INVALID_COMPUTER_NAME __constant_cpu_to_le32(0xC0000122)
+#define STATUS_FILE_DELETED __constant_cpu_to_le32(0xC0000123)
+#define STATUS_SPECIAL_ACCOUNT __constant_cpu_to_le32(0xC0000124)
+#define STATUS_SPECIAL_GROUP __constant_cpu_to_le32(0xC0000125)
+#define STATUS_SPECIAL_USER __constant_cpu_to_le32(0xC0000126)
+#define STATUS_MEMBERS_PRIMARY_GROUP __constant_cpu_to_le32(0xC0000127)
+#define STATUS_FILE_CLOSED __constant_cpu_to_le32(0xC0000128)
+#define STATUS_TOO_MANY_THREADS __constant_cpu_to_le32(0xC0000129)
+#define STATUS_THREAD_NOT_IN_PROCESS __constant_cpu_to_le32(0xC000012A)
+#define STATUS_TOKEN_ALREADY_IN_USE __constant_cpu_to_le32(0xC000012B)
+#define STATUS_PAGEFILE_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC000012C)
+#define STATUS_COMMITMENT_LIMIT __constant_cpu_to_le32(0xC000012D)
+#define STATUS_INVALID_IMAGE_LE_FORMAT __constant_cpu_to_le32(0xC000012E)
+#define STATUS_INVALID_IMAGE_NOT_MZ __constant_cpu_to_le32(0xC000012F)
+#define STATUS_INVALID_IMAGE_PROTECT __constant_cpu_to_le32(0xC0000130)
+#define STATUS_INVALID_IMAGE_WIN_16 __constant_cpu_to_le32(0xC0000131)
+#define STATUS_LOGON_SERVER_CONFLICT __constant_cpu_to_le32(0xC0000132)
+#define STATUS_TIME_DIFFERENCE_AT_DC __constant_cpu_to_le32(0xC0000133)
+#define STATUS_SYNCHRONIZATION_REQUIRED __constant_cpu_to_le32(0xC0000134)
+#define STATUS_DLL_NOT_FOUND __constant_cpu_to_le32(0xC0000135)
+#define STATUS_OPEN_FAILED __constant_cpu_to_le32(0xC0000136)
+#define STATUS_IO_PRIVILEGE_FAILED __constant_cpu_to_le32(0xC0000137)
+#define STATUS_ORDINAL_NOT_FOUND __constant_cpu_to_le32(0xC0000138)
+#define STATUS_ENTRYPOINT_NOT_FOUND __constant_cpu_to_le32(0xC0000139)
+#define STATUS_CONTROL_C_EXIT __constant_cpu_to_le32(0xC000013A)
+#define STATUS_LOCAL_DISCONNECT __constant_cpu_to_le32(0xC000013B)
+#define STATUS_REMOTE_DISCONNECT __constant_cpu_to_le32(0xC000013C)
+#define STATUS_REMOTE_RESOURCES __constant_cpu_to_le32(0xC000013D)
+#define STATUS_LINK_FAILED __constant_cpu_to_le32(0xC000013E)
+#define STATUS_LINK_TIMEOUT __constant_cpu_to_le32(0xC000013F)
+#define STATUS_INVALID_CONNECTION __constant_cpu_to_le32(0xC0000140)
+#define STATUS_INVALID_ADDRESS __constant_cpu_to_le32(0xC0000141)
+#define STATUS_DLL_INIT_FAILED __constant_cpu_to_le32(0xC0000142)
+#define STATUS_MISSING_SYSTEMFILE __constant_cpu_to_le32(0xC0000143)
+#define STATUS_UNHANDLED_EXCEPTION __constant_cpu_to_le32(0xC0000144)
+#define STATUS_APP_INIT_FAILURE __constant_cpu_to_le32(0xC0000145)
+#define STATUS_PAGEFILE_CREATE_FAILED __constant_cpu_to_le32(0xC0000146)
+#define STATUS_NO_PAGEFILE __constant_cpu_to_le32(0xC0000147)
+#define STATUS_INVALID_LEVEL __constant_cpu_to_le32(0xC0000148)
+#define STATUS_WRONG_PASSWORD_CORE __constant_cpu_to_le32(0xC0000149)
+#define STATUS_ILLEGAL_FLOAT_CONTEXT __constant_cpu_to_le32(0xC000014A)
+#define STATUS_PIPE_BROKEN __constant_cpu_to_le32(0xC000014B)
+#define STATUS_REGISTRY_CORRUPT __constant_cpu_to_le32(0xC000014C)
+#define STATUS_REGISTRY_IO_FAILED __constant_cpu_to_le32(0xC000014D)
+#define STATUS_NO_EVENT_PAIR __constant_cpu_to_le32(0xC000014E)
+#define STATUS_UNRECOGNIZED_VOLUME __constant_cpu_to_le32(0xC000014F)
+#define STATUS_SERIAL_NO_DEVICE_INITED __constant_cpu_to_le32(0xC0000150)
+#define STATUS_NO_SUCH_ALIAS __constant_cpu_to_le32(0xC0000151)
+#define STATUS_MEMBER_NOT_IN_ALIAS __constant_cpu_to_le32(0xC0000152)
+#define STATUS_MEMBER_IN_ALIAS __constant_cpu_to_le32(0xC0000153)
+#define STATUS_ALIAS_EXISTS __constant_cpu_to_le32(0xC0000154)
+#define STATUS_LOGON_NOT_GRANTED __constant_cpu_to_le32(0xC0000155)
+#define STATUS_TOO_MANY_SECRETS __constant_cpu_to_le32(0xC0000156)
+#define STATUS_SECRET_TOO_LONG __constant_cpu_to_le32(0xC0000157)
+#define STATUS_INTERNAL_DB_ERROR __constant_cpu_to_le32(0xC0000158)
+#define STATUS_FULLSCREEN_MODE __constant_cpu_to_le32(0xC0000159)
+#define STATUS_TOO_MANY_CONTEXT_IDS __constant_cpu_to_le32(0xC000015A)
+#define STATUS_LOGON_TYPE_NOT_GRANTED __constant_cpu_to_le32(0xC000015B)
+#define STATUS_NOT_REGISTRY_FILE __constant_cpu_to_le32(0xC000015C)
+#define STATUS_NT_CROSS_ENCRYPTION_REQUIRED __constant_cpu_to_le32(0xC000015D)
+#define STATUS_DOMAIN_CTRLR_CONFIG_ERROR __constant_cpu_to_le32(0xC000015E)
+#define STATUS_FT_MISSING_MEMBER __constant_cpu_to_le32(0xC000015F)
+#define STATUS_ILL_FORMED_SERVICE_ENTRY __constant_cpu_to_le32(0xC0000160)
+#define STATUS_ILLEGAL_CHARACTER __constant_cpu_to_le32(0xC0000161)
+#define STATUS_UNMAPPABLE_CHARACTER __constant_cpu_to_le32(0xC0000162)
+#define STATUS_UNDEFINED_CHARACTER __constant_cpu_to_le32(0xC0000163)
+#define STATUS_FLOPPY_VOLUME __constant_cpu_to_le32(0xC0000164)
+#define STATUS_FLOPPY_ID_MARK_NOT_FOUND __constant_cpu_to_le32(0xC0000165)
+#define STATUS_FLOPPY_WRONG_CYLINDER __constant_cpu_to_le32(0xC0000166)
+#define STATUS_FLOPPY_UNKNOWN_ERROR __constant_cpu_to_le32(0xC0000167)
+#define STATUS_FLOPPY_BAD_REGISTERS __constant_cpu_to_le32(0xC0000168)
+#define STATUS_DISK_RECALIBRATE_FAILED __constant_cpu_to_le32(0xC0000169)
+#define STATUS_DISK_OPERATION_FAILED __constant_cpu_to_le32(0xC000016A)
+#define STATUS_DISK_RESET_FAILED __constant_cpu_to_le32(0xC000016B)
+#define STATUS_SHARED_IRQ_BUSY __constant_cpu_to_le32(0xC000016C)
+#define STATUS_FT_ORPHANING __constant_cpu_to_le32(0xC000016D)
+#define STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT __constant_cpu_to_le32(0xC000016E)
+#define STATUS_PARTITION_FAILURE __constant_cpu_to_le32(0xC0000172)
+#define STATUS_INVALID_BLOCK_LENGTH __constant_cpu_to_le32(0xC0000173)
+#define STATUS_DEVICE_NOT_PARTITIONED __constant_cpu_to_le32(0xC0000174)
+#define STATUS_UNABLE_TO_LOCK_MEDIA __constant_cpu_to_le32(0xC0000175)
+#define STATUS_UNABLE_TO_UNLOAD_MEDIA __constant_cpu_to_le32(0xC0000176)
+#define STATUS_EOM_OVERFLOW __constant_cpu_to_le32(0xC0000177)
+#define STATUS_NO_MEDIA __constant_cpu_to_le32(0xC0000178)
+#define STATUS_NO_SUCH_MEMBER __constant_cpu_to_le32(0xC000017A)
+#define STATUS_INVALID_MEMBER __constant_cpu_to_le32(0xC000017B)
+#define STATUS_KEY_DELETED __constant_cpu_to_le32(0xC000017C)
+#define STATUS_NO_LOG_SPACE __constant_cpu_to_le32(0xC000017D)
+#define STATUS_TOO_MANY_SIDS __constant_cpu_to_le32(0xC000017E)
+#define STATUS_LM_CROSS_ENCRYPTION_REQUIRED __constant_cpu_to_le32(0xC000017F)
+#define STATUS_KEY_HAS_CHILDREN __constant_cpu_to_le32(0xC0000180)
+#define STATUS_CHILD_MUST_BE_VOLATILE __constant_cpu_to_le32(0xC0000181)
+#define STATUS_DEVICE_CONFIGURATION_ERROR __constant_cpu_to_le32(0xC0000182)
+#define STATUS_DRIVER_INTERNAL_ERROR __constant_cpu_to_le32(0xC0000183)
+#define STATUS_INVALID_DEVICE_STATE __constant_cpu_to_le32(0xC0000184)
+#define STATUS_IO_DEVICE_ERROR __constant_cpu_to_le32(0xC0000185)
+#define STATUS_DEVICE_PROTOCOL_ERROR __constant_cpu_to_le32(0xC0000186)
+#define STATUS_BACKUP_CONTROLLER __constant_cpu_to_le32(0xC0000187)
+#define STATUS_LOG_FILE_FULL __constant_cpu_to_le32(0xC0000188)
+#define STATUS_TOO_LATE __constant_cpu_to_le32(0xC0000189)
+#define STATUS_NO_TRUST_LSA_SECRET __constant_cpu_to_le32(0xC000018A)
+#define STATUS_NO_TRUST_SAM_ACCOUNT __constant_cpu_to_le32(0xC000018B)
+#define STATUS_TRUSTED_DOMAIN_FAILURE __constant_cpu_to_le32(0xC000018C)
+#define STATUS_TRUSTED_RELATIONSHIP_FAILURE __constant_cpu_to_le32(0xC000018D)
+#define STATUS_EVENTLOG_FILE_CORRUPT __constant_cpu_to_le32(0xC000018E)
+#define STATUS_EVENTLOG_CANT_START __constant_cpu_to_le32(0xC000018F)
+#define STATUS_TRUST_FAILURE __constant_cpu_to_le32(0xC0000190)
+#define STATUS_MUTANT_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC0000191)
+#define STATUS_NETLOGON_NOT_STARTED __constant_cpu_to_le32(0xC0000192)
+#define STATUS_ACCOUNT_EXPIRED __constant_cpu_to_le32(0xC0000193)
+#define STATUS_POSSIBLE_DEADLOCK __constant_cpu_to_le32(0xC0000194)
+#define STATUS_NETWORK_CREDENTIAL_CONFLICT __constant_cpu_to_le32(0xC0000195)
+#define STATUS_REMOTE_SESSION_LIMIT __constant_cpu_to_le32(0xC0000196)
+#define STATUS_EVENTLOG_FILE_CHANGED __constant_cpu_to_le32(0xC0000197)
+#define STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT __constant_cpu_to_le32(0xC0000198)
+#define STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT __constant_cpu_to_le32(0xC0000199)
+#define STATUS_NOLOGON_SERVER_TRUST_ACCOUNT __constant_cpu_to_le32(0xC000019A)
+#define STATUS_DOMAIN_TRUST_INCONSISTENT __constant_cpu_to_le32(0xC000019B)
+#define STATUS_FS_DRIVER_REQUIRED __constant_cpu_to_le32(0xC000019C)
+#define STATUS_IMAGE_ALREADY_LOADED_AS_DLL __constant_cpu_to_le32(0xC000019D)
+#define STATUS_NETWORK_OPEN_RESTRICTION __constant_cpu_to_le32(0xC0000201)
+#define STATUS_NO_USER_SESSION_KEY __constant_cpu_to_le32(0xC0000202)
+#define STATUS_USER_SESSION_DELETED __constant_cpu_to_le32(0xC0000203)
+#define STATUS_RESOURCE_LANG_NOT_FOUND __constant_cpu_to_le32(0xC0000204)
+#define STATUS_INSUFF_SERVER_RESOURCES __constant_cpu_to_le32(0xC0000205)
+#define STATUS_INVALID_BUFFER_SIZE __constant_cpu_to_le32(0xC0000206)
+#define STATUS_INVALID_ADDRESS_COMPONENT __constant_cpu_to_le32(0xC0000207)
+#define STATUS_INVALID_ADDRESS_WILDCARD __constant_cpu_to_le32(0xC0000208)
+#define STATUS_TOO_MANY_ADDRESSES __constant_cpu_to_le32(0xC0000209)
+#define STATUS_ADDRESS_ALREADY_EXISTS __constant_cpu_to_le32(0xC000020A)
+#define STATUS_ADDRESS_CLOSED __constant_cpu_to_le32(0xC000020B)
+#define STATUS_CONNECTION_DISCONNECTED __constant_cpu_to_le32(0xC000020C)
+#define STATUS_CONNECTION_RESET __constant_cpu_to_le32(0xC000020D)
+#define STATUS_TOO_MANY_NODES __constant_cpu_to_le32(0xC000020E)
+#define STATUS_TRANSACTION_ABORTED __constant_cpu_to_le32(0xC000020F)
+#define STATUS_TRANSACTION_TIMED_OUT __constant_cpu_to_le32(0xC0000210)
+#define STATUS_TRANSACTION_NO_RELEASE __constant_cpu_to_le32(0xC0000211)
+#define STATUS_TRANSACTION_NO_MATCH __constant_cpu_to_le32(0xC0000212)
+#define STATUS_TRANSACTION_RESPONDED __constant_cpu_to_le32(0xC0000213)
+#define STATUS_TRANSACTION_INVALID_ID __constant_cpu_to_le32(0xC0000214)
+#define STATUS_TRANSACTION_INVALID_TYPE __constant_cpu_to_le32(0xC0000215)
+#define STATUS_NOT_SERVER_SESSION __constant_cpu_to_le32(0xC0000216)
+#define STATUS_NOT_CLIENT_SESSION __constant_cpu_to_le32(0xC0000217)
+#define STATUS_CANNOT_LOAD_REGISTRY_FILE __constant_cpu_to_le32(0xC0000218)
+#define STATUS_DEBUG_ATTACH_FAILED __constant_cpu_to_le32(0xC0000219)
+#define STATUS_SYSTEM_PROCESS_TERMINATED __constant_cpu_to_le32(0xC000021A)
+#define STATUS_DATA_NOT_ACCEPTED __constant_cpu_to_le32(0xC000021B)
+#define STATUS_NO_BROWSER_SERVERS_FOUND __constant_cpu_to_le32(0xC000021C)
+#define STATUS_VDM_HARD_ERROR __constant_cpu_to_le32(0xC000021D)
+#define STATUS_DRIVER_CANCEL_TIMEOUT __constant_cpu_to_le32(0xC000021E)
+#define STATUS_REPLY_MESSAGE_MISMATCH __constant_cpu_to_le32(0xC000021F)
+#define STATUS_MAPPED_ALIGNMENT __constant_cpu_to_le32(0xC0000220)
+#define STATUS_IMAGE_CHECKSUM_MISMATCH __constant_cpu_to_le32(0xC0000221)
+#define STATUS_LOST_WRITEBEHIND_DATA __constant_cpu_to_le32(0xC0000222)
+#define STATUS_CLIENT_SERVER_PARAMETERS_INVALID __constant_cpu_to_le32(0xC0000223)
+#define STATUS_PASSWORD_MUST_CHANGE __constant_cpu_to_le32(0xC0000224)
+#define STATUS_NOT_FOUND __constant_cpu_to_le32(0xC0000225)
+#define STATUS_NOT_TINY_STREAM __constant_cpu_to_le32(0xC0000226)
+#define STATUS_RECOVERY_FAILURE __constant_cpu_to_le32(0xC0000227)
+#define STATUS_STACK_OVERFLOW_READ __constant_cpu_to_le32(0xC0000228)
+#define STATUS_FAIL_CHECK __constant_cpu_to_le32(0xC0000229)
+#define STATUS_DUPLICATE_OBJECTID __constant_cpu_to_le32(0xC000022A)
+#define STATUS_OBJECTID_EXISTS __constant_cpu_to_le32(0xC000022B)
+#define STATUS_CONVERT_TO_LARGE __constant_cpu_to_le32(0xC000022C)
+#define STATUS_RETRY __constant_cpu_to_le32(0xC000022D)
+#define STATUS_FOUND_OUT_OF_SCOPE __constant_cpu_to_le32(0xC000022E)
+#define STATUS_ALLOCATE_BUCKET __constant_cpu_to_le32(0xC000022F)
+#define STATUS_PROPSET_NOT_FOUND __constant_cpu_to_le32(0xC0000230)
+#define STATUS_MARSHALL_OVERFLOW __constant_cpu_to_le32(0xC0000231)
+#define STATUS_INVALID_VARIANT __constant_cpu_to_le32(0xC0000232)
+#define STATUS_DOMAIN_CONTROLLER_NOT_FOUND __constant_cpu_to_le32(0xC0000233)
+#define STATUS_ACCOUNT_LOCKED_OUT __constant_cpu_to_le32(0xC0000234)
+#define STATUS_HANDLE_NOT_CLOSABLE __constant_cpu_to_le32(0xC0000235)
+#define STATUS_CONNECTION_REFUSED __constant_cpu_to_le32(0xC0000236)
+#define STATUS_GRACEFUL_DISCONNECT __constant_cpu_to_le32(0xC0000237)
+#define STATUS_ADDRESS_ALREADY_ASSOCIATED __constant_cpu_to_le32(0xC0000238)
+#define STATUS_ADDRESS_NOT_ASSOCIATED __constant_cpu_to_le32(0xC0000239)
+#define STATUS_CONNECTION_INVALID __constant_cpu_to_le32(0xC000023A)
+#define STATUS_CONNECTION_ACTIVE __constant_cpu_to_le32(0xC000023B)
+#define STATUS_NETWORK_UNREACHABLE __constant_cpu_to_le32(0xC000023C)
+#define STATUS_HOST_UNREACHABLE __constant_cpu_to_le32(0xC000023D)
+#define STATUS_PROTOCOL_UNREACHABLE __constant_cpu_to_le32(0xC000023E)
+#define STATUS_PORT_UNREACHABLE __constant_cpu_to_le32(0xC000023F)
+#define STATUS_REQUEST_ABORTED __constant_cpu_to_le32(0xC0000240)
+#define STATUS_CONNECTION_ABORTED __constant_cpu_to_le32(0xC0000241)
+#define STATUS_BAD_COMPRESSION_BUFFER __constant_cpu_to_le32(0xC0000242)
+#define STATUS_USER_MAPPED_FILE __constant_cpu_to_le32(0xC0000243)
+#define STATUS_AUDIT_FAILED __constant_cpu_to_le32(0xC0000244)
+#define STATUS_TIMER_RESOLUTION_NOT_SET __constant_cpu_to_le32(0xC0000245)
+#define STATUS_CONNECTION_COUNT_LIMIT __constant_cpu_to_le32(0xC0000246)
+#define STATUS_LOGIN_TIME_RESTRICTION __constant_cpu_to_le32(0xC0000247)
+#define STATUS_LOGIN_WKSTA_RESTRICTION __constant_cpu_to_le32(0xC0000248)
+#define STATUS_IMAGE_MP_UP_MISMATCH __constant_cpu_to_le32(0xC0000249)
+#define STATUS_INSUFFICIENT_LOGON_INFO __constant_cpu_to_le32(0xC0000250)
+#define STATUS_BAD_DLL_ENTRYPOINT __constant_cpu_to_le32(0xC0000251)
+#define STATUS_BAD_SERVICE_ENTRYPOINT __constant_cpu_to_le32(0xC0000252)
+#define STATUS_LPC_REPLY_LOST __constant_cpu_to_le32(0xC0000253)
+#define STATUS_IP_ADDRESS_CONFLICT1 __constant_cpu_to_le32(0xC0000254)
+#define STATUS_IP_ADDRESS_CONFLICT2 __constant_cpu_to_le32(0xC0000255)
+#define STATUS_REGISTRY_QUOTA_LIMIT __constant_cpu_to_le32(0xC0000256)
+#define STATUS_PATH_NOT_COVERED __constant_cpu_to_le32(0xC0000257)
+#define STATUS_NO_CALLBACK_ACTIVE __constant_cpu_to_le32(0xC0000258)
+#define STATUS_LICENSE_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000259)
+#define STATUS_PWD_TOO_SHORT __constant_cpu_to_le32(0xC000025A)
+#define STATUS_PWD_TOO_RECENT __constant_cpu_to_le32(0xC000025B)
+#define STATUS_PWD_HISTORY_CONFLICT __constant_cpu_to_le32(0xC000025C)
+#define STATUS_PLUGPLAY_NO_DEVICE __constant_cpu_to_le32(0xC000025E)
+#define STATUS_UNSUPPORTED_COMPRESSION __constant_cpu_to_le32(0xC000025F)
+#define STATUS_INVALID_HW_PROFILE __constant_cpu_to_le32(0xC0000260)
+#define STATUS_INVALID_PLUGPLAY_DEVICE_PATH __constant_cpu_to_le32(0xC0000261)
+#define STATUS_DRIVER_ORDINAL_NOT_FOUND __constant_cpu_to_le32(0xC0000262)
+#define STATUS_DRIVER_ENTRYPOINT_NOT_FOUND __constant_cpu_to_le32(0xC0000263)
+#define STATUS_RESOURCE_NOT_OWNED __constant_cpu_to_le32(0xC0000264)
+#define STATUS_TOO_MANY_LINKS __constant_cpu_to_le32(0xC0000265)
+#define STATUS_QUOTA_LIST_INCONSISTENT __constant_cpu_to_le32(0xC0000266)
+#define STATUS_FILE_IS_OFFLINE __constant_cpu_to_le32(0xC0000267)
+#define STATUS_EVALUATION_EXPIRATION __constant_cpu_to_le32(0xC0000268)
+#define STATUS_ILLEGAL_DLL_RELOCATION __constant_cpu_to_le32(0xC0000269)
+#define STATUS_LICENSE_VIOLATION __constant_cpu_to_le32(0xC000026A)
+#define STATUS_DLL_INIT_FAILED_LOGOFF __constant_cpu_to_le32(0xC000026B)
+#define STATUS_DRIVER_UNABLE_TO_LOAD __constant_cpu_to_le32(0xC000026C)
+#define STATUS_DFS_UNAVAILABLE __constant_cpu_to_le32(0xC000026D)
+#define STATUS_VOLUME_DISMOUNTED __constant_cpu_to_le32(0xC000026E)
+#define STATUS_WX86_INTERNAL_ERROR __constant_cpu_to_le32(0xC000026F)
+#define STATUS_WX86_FLOAT_STACK_CHECK __constant_cpu_to_le32(0xC0000270)
+#define STATUS_VALIDATE_CONTINUE __constant_cpu_to_le32(0xC0000271)
+#define STATUS_NO_MATCH __constant_cpu_to_le32(0xC0000272)
+#define STATUS_NO_MORE_MATCHES __constant_cpu_to_le32(0xC0000273)
+#define STATUS_NOT_A_REPARSE_POINT __constant_cpu_to_le32(0xC0000275)
+#define STATUS_IO_REPARSE_TAG_INVALID __constant_cpu_to_le32(0xC0000276)
+#define STATUS_IO_REPARSE_TAG_MISMATCH __constant_cpu_to_le32(0xC0000277)
+#define STATUS_IO_REPARSE_DATA_INVALID __constant_cpu_to_le32(0xC0000278)
+#define STATUS_IO_REPARSE_TAG_NOT_HANDLED __constant_cpu_to_le32(0xC0000279)
+#define STATUS_REPARSE_POINT_NOT_RESOLVED __constant_cpu_to_le32(0xC0000280)
+#define STATUS_DIRECTORY_IS_A_REPARSE_POINT __constant_cpu_to_le32(0xC0000281)
+#define STATUS_RANGE_LIST_CONFLICT __constant_cpu_to_le32(0xC0000282)
+#define STATUS_SOURCE_ELEMENT_EMPTY __constant_cpu_to_le32(0xC0000283)
+#define STATUS_DESTINATION_ELEMENT_FULL __constant_cpu_to_le32(0xC0000284)
+#define STATUS_ILLEGAL_ELEMENT_ADDRESS __constant_cpu_to_le32(0xC0000285)
+#define STATUS_MAGAZINE_NOT_PRESENT __constant_cpu_to_le32(0xC0000286)
+#define STATUS_REINITIALIZATION_NEEDED __constant_cpu_to_le32(0xC0000287)
+#define STATUS_ENCRYPTION_FAILED __constant_cpu_to_le32(0xC000028A)
+#define STATUS_DECRYPTION_FAILED __constant_cpu_to_le32(0xC000028B)
+#define STATUS_RANGE_NOT_FOUND __constant_cpu_to_le32(0xC000028C)
+#define STATUS_NO_RECOVERY_POLICY __constant_cpu_to_le32(0xC000028D)
+#define STATUS_NO_EFS __constant_cpu_to_le32(0xC000028E)
+#define STATUS_WRONG_EFS __constant_cpu_to_le32(0xC000028F)
+#define STATUS_NO_USER_KEYS __constant_cpu_to_le32(0xC0000290)
+#define STATUS_FILE_NOT_ENCRYPTED __constant_cpu_to_le32(0xC0000291)
+#define STATUS_NOT_EXPORT_FORMAT __constant_cpu_to_le32(0xC0000292)
+#define STATUS_FILE_ENCRYPTED __constant_cpu_to_le32(0xC0000293)
+#define STATUS_WMI_GUID_NOT_FOUND __constant_cpu_to_le32(0xC0000295)
+#define STATUS_WMI_INSTANCE_NOT_FOUND __constant_cpu_to_le32(0xC0000296)
+#define STATUS_WMI_ITEMID_NOT_FOUND __constant_cpu_to_le32(0xC0000297)
+#define STATUS_WMI_TRY_AGAIN __constant_cpu_to_le32(0xC0000298)
+#define STATUS_SHARED_POLICY __constant_cpu_to_le32(0xC0000299)
+#define STATUS_POLICY_OBJECT_NOT_FOUND __constant_cpu_to_le32(0xC000029A)
+#define STATUS_POLICY_ONLY_IN_DS __constant_cpu_to_le32(0xC000029B)
+#define STATUS_VOLUME_NOT_UPGRADED __constant_cpu_to_le32(0xC000029C)
+#define STATUS_REMOTE_STORAGE_NOT_ACTIVE __constant_cpu_to_le32(0xC000029D)
+#define STATUS_REMOTE_STORAGE_MEDIA_ERROR __constant_cpu_to_le32(0xC000029E)
+#define STATUS_NO_TRACKING_SERVICE __constant_cpu_to_le32(0xC000029F)
+#define STATUS_SERVER_SID_MISMATCH __constant_cpu_to_le32(0xC00002A0)
+#define STATUS_DS_NO_ATTRIBUTE_OR_VALUE __constant_cpu_to_le32(0xC00002A1)
+#define STATUS_DS_INVALID_ATTRIBUTE_SYNTAX __constant_cpu_to_le32(0xC00002A2)
+#define STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED __constant_cpu_to_le32(0xC00002A3)
+#define STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS __constant_cpu_to_le32(0xC00002A4)
+#define STATUS_DS_BUSY __constant_cpu_to_le32(0xC00002A5)
+#define STATUS_DS_UNAVAILABLE __constant_cpu_to_le32(0xC00002A6)
+#define STATUS_DS_NO_RIDS_ALLOCATED __constant_cpu_to_le32(0xC00002A7)
+#define STATUS_DS_NO_MORE_RIDS __constant_cpu_to_le32(0xC00002A8)
+#define STATUS_DS_INCORRECT_ROLE_OWNER __constant_cpu_to_le32(0xC00002A9)
+#define STATUS_DS_RIDMGR_INIT_ERROR __constant_cpu_to_le32(0xC00002AA)
+#define STATUS_DS_OBJ_CLASS_VIOLATION __constant_cpu_to_le32(0xC00002AB)
+#define STATUS_DS_CANT_ON_NON_LEAF __constant_cpu_to_le32(0xC00002AC)
+#define STATUS_DS_CANT_ON_RDN __constant_cpu_to_le32(0xC00002AD)
+#define STATUS_DS_CANT_MOD_OBJ_CLASS __constant_cpu_to_le32(0xC00002AE)
+#define STATUS_DS_CROSS_DOM_MOVE_FAILED __constant_cpu_to_le32(0xC00002AF)
+#define STATUS_DS_GC_NOT_AVAILABLE __constant_cpu_to_le32(0xC00002B0)
+#define STATUS_DIRECTORY_SERVICE_REQUIRED __constant_cpu_to_le32(0xC00002B1)
+#define STATUS_REPARSE_ATTRIBUTE_CONFLICT __constant_cpu_to_le32(0xC00002B2)
+#define STATUS_CANT_ENABLE_DENY_ONLY __constant_cpu_to_le32(0xC00002B3)
+#define STATUS_FLOAT_MULTIPLE_FAULTS __constant_cpu_to_le32(0xC00002B4)
+#define STATUS_FLOAT_MULTIPLE_TRAPS __constant_cpu_to_le32(0xC00002B5)
+#define STATUS_DEVICE_REMOVED __constant_cpu_to_le32(0xC00002B6)
+#define STATUS_JOURNAL_DELETE_IN_PROGRESS __constant_cpu_to_le32(0xC00002B7)
+#define STATUS_JOURNAL_NOT_ACTIVE __constant_cpu_to_le32(0xC00002B8)
+#define STATUS_NOINTERFACE __constant_cpu_to_le32(0xC00002B9)
+#define STATUS_DS_ADMIN_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC00002C1)
+#define STATUS_DRIVER_FAILED_SLEEP __constant_cpu_to_le32(0xC00002C2)
+#define STATUS_MUTUAL_AUTHENTICATION_FAILED __constant_cpu_to_le32(0xC00002C3)
+#define STATUS_CORRUPT_SYSTEM_FILE __constant_cpu_to_le32(0xC00002C4)
+#define STATUS_DATATYPE_MISALIGNMENT_ERROR __constant_cpu_to_le32(0xC00002C5)
+#define STATUS_WMI_READ_ONLY __constant_cpu_to_le32(0xC00002C6)
+#define STATUS_WMI_SET_FAILURE __constant_cpu_to_le32(0xC00002C7)
+#define STATUS_COMMITMENT_MINIMUM __constant_cpu_to_le32(0xC00002C8)
+#define STATUS_REG_NAT_CONSUMPTION __constant_cpu_to_le32(0xC00002C9)
+#define STATUS_TRANSPORT_FULL __constant_cpu_to_le32(0xC00002CA)
+#define STATUS_DS_SAM_INIT_FAILURE __constant_cpu_to_le32(0xC00002CB)
+#define STATUS_ONLY_IF_CONNECTED __constant_cpu_to_le32(0xC00002CC)
+#define STATUS_DS_SENSITIVE_GROUP_VIOLATION __constant_cpu_to_le32(0xC00002CD)
+#define STATUS_PNP_RESTART_ENUMERATION __constant_cpu_to_le32(0xC00002CE)
+#define STATUS_JOURNAL_ENTRY_DELETED __constant_cpu_to_le32(0xC00002CF)
+#define STATUS_DS_CANT_MOD_PRIMARYGROUPID __constant_cpu_to_le32(0xC00002D0)
+#define STATUS_SYSTEM_IMAGE_BAD_SIGNATURE __constant_cpu_to_le32(0xC00002D1)
+#define STATUS_PNP_REBOOT_REQUIRED __constant_cpu_to_le32(0xC00002D2)
+#define STATUS_POWER_STATE_INVALID __constant_cpu_to_le32(0xC00002D3)
+#define STATUS_DS_INVALID_GROUP_TYPE __constant_cpu_to_le32(0xC00002D4)
+#define STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN __constant_cpu_to_le32(0xC00002D5)
+#define STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN __constant_cpu_to_le32(0xC00002D6)
+#define STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002D7)
+#define STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER __constant_cpu_to_le32(0xC00002D8)
+#define STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002D9)
+#define STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER __constant_cpu_to_le32(0xC00002DA)
+#define STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER __constant_cpu_to_le32(0xC00002DB)
+#define STATUS_DS_HAVE_PRIMARY_MEMBERS __constant_cpu_to_le32(0xC00002DC)
+#define STATUS_WMI_NOT_SUPPORTED __constant_cpu_to_le32(0xC00002DD)
+#define STATUS_INSUFFICIENT_POWER __constant_cpu_to_le32(0xC00002DE)
+#define STATUS_SAM_NEED_BOOTKEY_PASSWORD __constant_cpu_to_le32(0xC00002DF)
+#define STATUS_SAM_NEED_BOOTKEY_FLOPPY __constant_cpu_to_le32(0xC00002E0)
+#define STATUS_DS_CANT_START __constant_cpu_to_le32(0xC00002E1)
+#define STATUS_DS_INIT_FAILURE __constant_cpu_to_le32(0xC00002E2)
+#define STATUS_SAM_INIT_FAILURE __constant_cpu_to_le32(0xC00002E3)
+#define STATUS_DS_GC_REQUIRED __constant_cpu_to_le32(0xC00002E4)
+#define STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY __constant_cpu_to_le32(0xC00002E5)
+#define STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS __constant_cpu_to_le32(0xC00002E6)
+#define STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC00002E7)
+#define STATUS_MULTIPLE_FAULT_VIOLATION __constant_cpu_to_le32(0xC00002E8)
+#define STATUS_CURRENT_DOMAIN_NOT_ALLOWED __constant_cpu_to_le32(0xC00002E9)
+#define STATUS_CANNOT_MAKE __constant_cpu_to_le32(0xC00002EA)
+#define STATUS_SYSTEM_SHUTDOWN __constant_cpu_to_le32(0xC00002EB)
+#define STATUS_DS_INIT_FAILURE_CONSOLE __constant_cpu_to_le32(0xC00002EC)
+#define STATUS_DS_SAM_INIT_FAILURE_CONSOLE __constant_cpu_to_le32(0xC00002ED)
+#define STATUS_UNFINISHED_CONTEXT_DELETED __constant_cpu_to_le32(0xC00002EE)
+#define STATUS_NO_TGT_REPLY __constant_cpu_to_le32(0xC00002EF)
+#define STATUS_OBJECTID_NOT_FOUND __constant_cpu_to_le32(0xC00002F0)
+#define STATUS_NO_IP_ADDRESSES __constant_cpu_to_le32(0xC00002F1)
+#define STATUS_WRONG_CREDENTIAL_HANDLE __constant_cpu_to_le32(0xC00002F2)
+#define STATUS_CRYPTO_SYSTEM_INVALID __constant_cpu_to_le32(0xC00002F3)
+#define STATUS_MAX_REFERRALS_EXCEEDED __constant_cpu_to_le32(0xC00002F4)
+#define STATUS_MUST_BE_KDC __constant_cpu_to_le32(0xC00002F5)
+#define STATUS_STRONG_CRYPTO_NOT_SUPPORTED __constant_cpu_to_le32(0xC00002F6)
+#define STATUS_TOO_MANY_PRINCIPALS __constant_cpu_to_le32(0xC00002F7)
+#define STATUS_NO_PA_DATA __constant_cpu_to_le32(0xC00002F8)
+#define STATUS_PKINIT_NAME_MISMATCH __constant_cpu_to_le32(0xC00002F9)
+#define STATUS_SMARTCARD_LOGON_REQUIRED __constant_cpu_to_le32(0xC00002FA)
+#define STATUS_KDC_INVALID_REQUEST __constant_cpu_to_le32(0xC00002FB)
+#define STATUS_KDC_UNABLE_TO_REFER __constant_cpu_to_le32(0xC00002FC)
+#define STATUS_KDC_UNKNOWN_ETYPE __constant_cpu_to_le32(0xC00002FD)
+#define STATUS_SHUTDOWN_IN_PROGRESS __constant_cpu_to_le32(0xC00002FE)
+#define STATUS_SERVER_SHUTDOWN_IN_PROGRESS __constant_cpu_to_le32(0xC00002FF)
+#define STATUS_NOT_SUPPORTED_ON_SBS __constant_cpu_to_le32(0xC0000300)
+#define STATUS_WMI_GUID_DISCONNECTED __constant_cpu_to_le32(0xC0000301)
+#define STATUS_WMI_ALREADY_DISABLED __constant_cpu_to_le32(0xC0000302)
+#define STATUS_WMI_ALREADY_ENABLED __constant_cpu_to_le32(0xC0000303)
+#define STATUS_MFT_TOO_FRAGMENTED __constant_cpu_to_le32(0xC0000304)
+#define STATUS_COPY_PROTECTION_FAILURE __constant_cpu_to_le32(0xC0000305)
+#define STATUS_CSS_AUTHENTICATION_FAILURE __constant_cpu_to_le32(0xC0000306)
+#define STATUS_CSS_KEY_NOT_PRESENT __constant_cpu_to_le32(0xC0000307)
+#define STATUS_CSS_KEY_NOT_ESTABLISHED __constant_cpu_to_le32(0xC0000308)
+#define STATUS_CSS_SCRAMBLED_SECTOR __constant_cpu_to_le32(0xC0000309)
+#define STATUS_CSS_REGION_MISMATCH __constant_cpu_to_le32(0xC000030A)
+#define STATUS_CSS_RESETS_EXHAUSTED __constant_cpu_to_le32(0xC000030B)
+#define STATUS_PKINIT_FAILURE __constant_cpu_to_le32(0xC0000320)
+#define STATUS_SMARTCARD_SUBSYSTEM_FAILURE __constant_cpu_to_le32(0xC0000321)
+#define STATUS_NO_KERB_KEY __constant_cpu_to_le32(0xC0000322)
+#define STATUS_HOST_DOWN __constant_cpu_to_le32(0xC0000350)
+#define STATUS_UNSUPPORTED_PREAUTH __constant_cpu_to_le32(0xC0000351)
+#define STATUS_EFS_ALG_BLOB_TOO_BIG __constant_cpu_to_le32(0xC0000352)
+#define STATUS_PORT_NOT_SET __constant_cpu_to_le32(0xC0000353)
+#define STATUS_DEBUGGER_INACTIVE __constant_cpu_to_le32(0xC0000354)
+#define STATUS_DS_VERSION_CHECK_FAILURE __constant_cpu_to_le32(0xC0000355)
+#define STATUS_AUDITING_DISABLED __constant_cpu_to_le32(0xC0000356)
+#define STATUS_PRENT4_MACHINE_ACCOUNT __constant_cpu_to_le32(0xC0000357)
+#define STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER __constant_cpu_to_le32(0xC0000358)
+#define STATUS_INVALID_IMAGE_WIN_32 __constant_cpu_to_le32(0xC0000359)
+#define STATUS_INVALID_IMAGE_WIN_64 __constant_cpu_to_le32(0xC000035A)
+#define STATUS_BAD_BINDINGS __constant_cpu_to_le32(0xC000035B)
+#define STATUS_NETWORK_SESSION_EXPIRED __constant_cpu_to_le32(0xC000035C)
+#define STATUS_APPHELP_BLOCK __constant_cpu_to_le32(0xC000035D)
+#define STATUS_ALL_SIDS_FILTERED __constant_cpu_to_le32(0xC000035E)
+#define STATUS_NOT_SAFE_MODE_DRIVER __constant_cpu_to_le32(0xC000035F)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT __constant_cpu_to_le32(0xC0000361)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PATH __constant_cpu_to_le32(0xC0000362)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER __constant_cpu_to_le32(0xC0000363)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_OTHER __constant_cpu_to_le32(0xC0000364)
+#define STATUS_FAILED_DRIVER_ENTRY __constant_cpu_to_le32(0xC0000365)
+#define STATUS_DEVICE_ENUMERATION_ERROR __constant_cpu_to_le32(0xC0000366)
+#define STATUS_MOUNT_POINT_NOT_RESOLVED __constant_cpu_to_le32(0xC0000368)
+#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER __constant_cpu_to_le32(0xC0000369)
+#define STATUS_MCA_OCCURED __constant_cpu_to_le32(0xC000036A)
+#define STATUS_DRIVER_BLOCKED_CRITICAL __constant_cpu_to_le32(0xC000036B)
+#define STATUS_DRIVER_BLOCKED __constant_cpu_to_le32(0xC000036C)
+#define STATUS_DRIVER_DATABASE_ERROR __constant_cpu_to_le32(0xC000036D)
+#define STATUS_SYSTEM_HIVE_TOO_LARGE __constant_cpu_to_le32(0xC000036E)
+#define STATUS_INVALID_IMPORT_OF_NON_DLL __constant_cpu_to_le32(0xC000036F)
+#define STATUS_NO_SECRETS __constant_cpu_to_le32(0xC0000371)
+#define STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY __constant_cpu_to_le32(0xC0000372)
+#define STATUS_FAILED_STACK_SWITCH __constant_cpu_to_le32(0xC0000373)
+#define STATUS_HEAP_CORRUPTION __constant_cpu_to_le32(0xC0000374)
+#define STATUS_SMARTCARD_WRONG_PIN __constant_cpu_to_le32(0xC0000380)
+#define STATUS_SMARTCARD_CARD_BLOCKED __constant_cpu_to_le32(0xC0000381)
+#define STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED __constant_cpu_to_le32(0xC0000382)
+#define STATUS_SMARTCARD_NO_CARD __constant_cpu_to_le32(0xC0000383)
+#define STATUS_SMARTCARD_NO_KEY_CONTAINER __constant_cpu_to_le32(0xC0000384)
+#define STATUS_SMARTCARD_NO_CERTIFICATE __constant_cpu_to_le32(0xC0000385)
+#define STATUS_SMARTCARD_NO_KEYSET __constant_cpu_to_le32(0xC0000386)
+#define STATUS_SMARTCARD_IO_ERROR __constant_cpu_to_le32(0xC0000387)
+#define STATUS_DOWNGRADE_DETECTED __constant_cpu_to_le32(0xC0000388)
+#define STATUS_SMARTCARD_CERT_REVOKED __constant_cpu_to_le32(0xC0000389)
+#define STATUS_ISSUING_CA_UNTRUSTED __constant_cpu_to_le32(0xC000038A)
+#define STATUS_REVOCATION_OFFLINE_C __constant_cpu_to_le32(0xC000038B)
+#define STATUS_PKINIT_CLIENT_FAILURE __constant_cpu_to_le32(0xC000038C)
+#define STATUS_SMARTCARD_CERT_EXPIRED __constant_cpu_to_le32(0xC000038D)
+#define STATUS_DRIVER_FAILED_PRIOR_UNLOAD __constant_cpu_to_le32(0xC000038E)
+#define STATUS_SMARTCARD_SILENT_CONTEXT __constant_cpu_to_le32(0xC000038F)
+#define STATUS_PER_USER_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000401)
+#define STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000402)
+#define STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000403)
+#define STATUS_DS_NAME_NOT_UNIQUE __constant_cpu_to_le32(0xC0000404)
+#define STATUS_DS_DUPLICATE_ID_FOUND __constant_cpu_to_le32(0xC0000405)
+#define STATUS_DS_GROUP_CONVERSION_ERROR __constant_cpu_to_le32(0xC0000406)
+#define STATUS_VOLSNAP_PREPARE_HIBERNATE __constant_cpu_to_le32(0xC0000407)
+#define STATUS_USER2USER_REQUIRED __constant_cpu_to_le32(0xC0000408)
+#define STATUS_STACK_BUFFER_OVERRUN __constant_cpu_to_le32(0xC0000409)
+#define STATUS_NO_S4U_PROT_SUPPORT __constant_cpu_to_le32(0xC000040A)
+#define STATUS_CROSSREALM_DELEGATION_FAILURE __constant_cpu_to_le32(0xC000040B)
+#define STATUS_REVOCATION_OFFLINE_KDC __constant_cpu_to_le32(0xC000040C)
+#define STATUS_ISSUING_CA_UNTRUSTED_KDC __constant_cpu_to_le32(0xC000040D)
+#define STATUS_KDC_CERT_EXPIRED __constant_cpu_to_le32(0xC000040E)
+#define STATUS_KDC_CERT_REVOKED __constant_cpu_to_le32(0xC000040F)
+#define STATUS_PARAMETER_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000410)
+#define STATUS_HIBERNATION_FAILURE __constant_cpu_to_le32(0xC0000411)
+#define STATUS_DELAY_LOAD_FAILED __constant_cpu_to_le32(0xC0000412)
+#define STATUS_AUTHENTICATION_FIREWALL_FAILED __constant_cpu_to_le32(0xC0000413)
+#define STATUS_VDM_DISALLOWED __constant_cpu_to_le32(0xC0000414)
+#define STATUS_HUNG_DISPLAY_DRIVER_THREAD __constant_cpu_to_le32(0xC0000415)
+#define STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE __constant_cpu_to_le32(0xC0000416)
+#define STATUS_INVALID_CRUNTIME_PARAMETER __constant_cpu_to_le32(0xC0000417)
+#define STATUS_NTLM_BLOCKED __constant_cpu_to_le32(0xC0000418)
+#define STATUS_ASSERTION_FAILURE __constant_cpu_to_le32(0xC0000420)
+#define STATUS_VERIFIER_STOP __constant_cpu_to_le32(0xC0000421)
+#define STATUS_CALLBACK_POP_STACK __constant_cpu_to_le32(0xC0000423)
+#define STATUS_INCOMPATIBLE_DRIVER_BLOCKED __constant_cpu_to_le32(0xC0000424)
+#define STATUS_HIVE_UNLOADED __constant_cpu_to_le32(0xC0000425)
+#define STATUS_COMPRESSION_DISABLED __constant_cpu_to_le32(0xC0000426)
+#define STATUS_FILE_SYSTEM_LIMITATION __constant_cpu_to_le32(0xC0000427)
+#define STATUS_INVALID_IMAGE_HASH __constant_cpu_to_le32(0xC0000428)
+#define STATUS_NOT_CAPABLE __constant_cpu_to_le32(0xC0000429)
+#define STATUS_REQUEST_OUT_OF_SEQUENCE __constant_cpu_to_le32(0xC000042A)
+#define STATUS_IMPLEMENTATION_LIMIT __constant_cpu_to_le32(0xC000042B)
+#define STATUS_ELEVATION_REQUIRED __constant_cpu_to_le32(0xC000042C)
+#define STATUS_BEYOND_VDL __constant_cpu_to_le32(0xC0000432)
+#define STATUS_ENCOUNTERED_WRITE_IN_PROGRESS __constant_cpu_to_le32(0xC0000433)
+#define STATUS_PTE_CHANGED __constant_cpu_to_le32(0xC0000434)
+#define STATUS_PURGE_FAILED __constant_cpu_to_le32(0xC0000435)
+#define STATUS_CRED_REQUIRES_CONFIRMATION __constant_cpu_to_le32(0xC0000440)
+#define STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE __constant_cpu_to_le32(0xC0000441)
+#define STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER __constant_cpu_to_le32(0xC0000442)
+#define STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE __constant_cpu_to_le32(0xC0000443)
+#define STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE __constant_cpu_to_le32(0xC0000444)
+#define STATUS_CS_ENCRYPTION_FILE_NOT_CSE __constant_cpu_to_le32(0xC0000445)
+#define STATUS_INVALID_LABEL __constant_cpu_to_le32(0xC0000446)
+#define STATUS_DRIVER_PROCESS_TERMINATED __constant_cpu_to_le32(0xC0000450)
+#define STATUS_AMBIGUOUS_SYSTEM_DEVICE __constant_cpu_to_le32(0xC0000451)
+#define STATUS_SYSTEM_DEVICE_NOT_FOUND __constant_cpu_to_le32(0xC0000452)
+#define STATUS_RESTART_BOOT_APPLICATION __constant_cpu_to_le32(0xC0000453)
+#define STATUS_INVALID_TASK_NAME __constant_cpu_to_le32(0xC0000500)
+#define STATUS_INVALID_TASK_INDEX __constant_cpu_to_le32(0xC0000501)
+#define STATUS_THREAD_ALREADY_IN_TASK __constant_cpu_to_le32(0xC0000502)
+#define STATUS_CALLBACK_BYPASS __constant_cpu_to_le32(0xC0000503)
+#define STATUS_PORT_CLOSED __constant_cpu_to_le32(0xC0000700)
+#define STATUS_MESSAGE_LOST __constant_cpu_to_le32(0xC0000701)
+#define STATUS_INVALID_MESSAGE __constant_cpu_to_le32(0xC0000702)
+#define STATUS_REQUEST_CANCELED __constant_cpu_to_le32(0xC0000703)
+#define STATUS_RECURSIVE_DISPATCH __constant_cpu_to_le32(0xC0000704)
+#define STATUS_LPC_RECEIVE_BUFFER_EXPECTED __constant_cpu_to_le32(0xC0000705)
+#define STATUS_LPC_INVALID_CONNECTION_USAGE __constant_cpu_to_le32(0xC0000706)
+#define STATUS_LPC_REQUESTS_NOT_ALLOWED __constant_cpu_to_le32(0xC0000707)
+#define STATUS_RESOURCE_IN_USE __constant_cpu_to_le32(0xC0000708)
+#define STATUS_HARDWARE_MEMORY_ERROR __constant_cpu_to_le32(0xC0000709)
+#define STATUS_THREADPOOL_HANDLE_EXCEPTION __constant_cpu_to_le32(0xC000070A)
+#define STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070B)
+#define STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070C)
+#define STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070D)
+#define STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED __constant_cpu_to_le32(0xC000070E)
+#define STATUS_THREADPOOL_RELEASED_DURING_OPERATION __constant_cpu_to_le32(0xC000070F)
+#define STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING __constant_cpu_to_le32(0xC0000710)
+#define STATUS_APC_RETURNED_WHILE_IMPERSONATING __constant_cpu_to_le32(0xC0000711)
+#define STATUS_PROCESS_IS_PROTECTED __constant_cpu_to_le32(0xC0000712)
+#define STATUS_MCA_EXCEPTION __constant_cpu_to_le32(0xC0000713)
+#define STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE __constant_cpu_to_le32(0xC0000714)
+#define STATUS_SYMLINK_CLASS_DISABLED __constant_cpu_to_le32(0xC0000715)
+#define STATUS_INVALID_IDN_NORMALIZATION __constant_cpu_to_le32(0xC0000716)
+#define STATUS_NO_UNICODE_TRANSLATION __constant_cpu_to_le32(0xC0000717)
+#define STATUS_ALREADY_REGISTERED __constant_cpu_to_le32(0xC0000718)
+#define STATUS_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0000719)
+#define STATUS_PORT_ALREADY_HAS_COMPLETION_LIST __constant_cpu_to_le32(0xC000071A)
+#define STATUS_CALLBACK_RETURNED_THREAD_PRIORITY __constant_cpu_to_le32(0xC000071B)
+#define STATUS_INVALID_THREAD __constant_cpu_to_le32(0xC000071C)
+#define STATUS_CALLBACK_RETURNED_TRANSACTION __constant_cpu_to_le32(0xC000071D)
+#define STATUS_CALLBACK_RETURNED_LDR_LOCK __constant_cpu_to_le32(0xC000071E)
+#define STATUS_CALLBACK_RETURNED_LANG __constant_cpu_to_le32(0xC000071F)
+#define STATUS_CALLBACK_RETURNED_PRI_BACK __constant_cpu_to_le32(0xC0000720)
+#define STATUS_CALLBACK_RETURNED_THREAD_AFFINITY __constant_cpu_to_le32(0xC0000721)
+#define STATUS_DISK_REPAIR_DISABLED __constant_cpu_to_le32(0xC0000800)
+#define STATUS_DS_DOMAIN_RENAME_IN_PROGRESS __constant_cpu_to_le32(0xC0000801)
+#define STATUS_DISK_QUOTA_EXCEEDED __constant_cpu_to_le32(0xC0000802)
+#define STATUS_CONTENT_BLOCKED __constant_cpu_to_le32(0xC0000804)
+#define STATUS_BAD_CLUSTERS __constant_cpu_to_le32(0xC0000805)
+#define STATUS_VOLUME_DIRTY __constant_cpu_to_le32(0xC0000806)
+#define STATUS_FILE_CHECKED_OUT __constant_cpu_to_le32(0xC0000901)
+#define STATUS_CHECKOUT_REQUIRED __constant_cpu_to_le32(0xC0000902)
+#define STATUS_BAD_FILE_TYPE __constant_cpu_to_le32(0xC0000903)
+#define STATUS_FILE_TOO_LARGE __constant_cpu_to_le32(0xC0000904)
+#define STATUS_FORMS_AUTH_REQUIRED __constant_cpu_to_le32(0xC0000905)
+#define STATUS_VIRUS_INFECTED __constant_cpu_to_le32(0xC0000906)
+#define STATUS_VIRUS_DELETED __constant_cpu_to_le32(0xC0000907)
+#define STATUS_BAD_MCFG_TABLE __constant_cpu_to_le32(0xC0000908)
+#define STATUS_WOW_ASSERTION __constant_cpu_to_le32(0xC0009898)
+#define STATUS_INVALID_SIGNATURE __constant_cpu_to_le32(0xC000A000)
+#define STATUS_HMAC_NOT_SUPPORTED __constant_cpu_to_le32(0xC000A001)
+#define STATUS_IPSEC_QUEUE_OVERFLOW __constant_cpu_to_le32(0xC000A010)
+#define STATUS_ND_QUEUE_OVERFLOW __constant_cpu_to_le32(0xC000A011)
+#define STATUS_HOPLIMIT_EXCEEDED __constant_cpu_to_le32(0xC000A012)
+#define STATUS_PROTOCOL_NOT_SUPPORTED __constant_cpu_to_le32(0xC000A013)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED __constant_cpu_to_le32(0xC000A080)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR __constant_cpu_to_le32(0xC000A081)
+#define STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR __constant_cpu_to_le32(0xC000A082)
+#define STATUS_XML_PARSE_ERROR __constant_cpu_to_le32(0xC000A083)
+#define STATUS_XMLDSIG_ERROR __constant_cpu_to_le32(0xC000A084)
+#define STATUS_WRONG_COMPARTMENT __constant_cpu_to_le32(0xC000A085)
+#define STATUS_AUTHIP_FAILURE __constant_cpu_to_le32(0xC000A086)
+#define DBG_NO_STATE_CHANGE __constant_cpu_to_le32(0xC0010001)
+#define DBG_APP_NOT_IDLE __constant_cpu_to_le32(0xC0010002)
+#define RPC_NT_INVALID_STRING_BINDING __constant_cpu_to_le32(0xC0020001)
+#define RPC_NT_WRONG_KIND_OF_BINDING __constant_cpu_to_le32(0xC0020002)
+#define RPC_NT_INVALID_BINDING __constant_cpu_to_le32(0xC0020003)
+#define RPC_NT_PROTSEQ_NOT_SUPPORTED __constant_cpu_to_le32(0xC0020004)
+#define RPC_NT_INVALID_RPC_PROTSEQ __constant_cpu_to_le32(0xC0020005)
+#define RPC_NT_INVALID_STRING_UUID __constant_cpu_to_le32(0xC0020006)
+#define RPC_NT_INVALID_ENDPOINT_FORMAT __constant_cpu_to_le32(0xC0020007)
+#define RPC_NT_INVALID_NET_ADDR __constant_cpu_to_le32(0xC0020008)
+#define RPC_NT_NO_ENDPOINT_FOUND __constant_cpu_to_le32(0xC0020009)
+#define RPC_NT_INVALID_TIMEOUT __constant_cpu_to_le32(0xC002000A)
+#define RPC_NT_OBJECT_NOT_FOUND __constant_cpu_to_le32(0xC002000B)
+#define RPC_NT_ALREADY_REGISTERED __constant_cpu_to_le32(0xC002000C)
+#define RPC_NT_TYPE_ALREADY_REGISTERED __constant_cpu_to_le32(0xC002000D)
+#define RPC_NT_ALREADY_LISTENING __constant_cpu_to_le32(0xC002000E)
+#define RPC_NT_NO_PROTSEQS_REGISTERED __constant_cpu_to_le32(0xC002000F)
+#define RPC_NT_NOT_LISTENING __constant_cpu_to_le32(0xC0020010)
+#define RPC_NT_UNKNOWN_MGR_TYPE __constant_cpu_to_le32(0xC0020011)
+#define RPC_NT_UNKNOWN_IF __constant_cpu_to_le32(0xC0020012)
+#define RPC_NT_NO_BINDINGS __constant_cpu_to_le32(0xC0020013)
+#define RPC_NT_NO_PROTSEQS __constant_cpu_to_le32(0xC0020014)
+#define RPC_NT_CANT_CREATE_ENDPOINT __constant_cpu_to_le32(0xC0020015)
+#define RPC_NT_OUT_OF_RESOURCES __constant_cpu_to_le32(0xC0020016)
+#define RPC_NT_SERVER_UNAVAILABLE __constant_cpu_to_le32(0xC0020017)
+#define RPC_NT_SERVER_TOO_BUSY __constant_cpu_to_le32(0xC0020018)
+#define RPC_NT_INVALID_NETWORK_OPTIONS __constant_cpu_to_le32(0xC0020019)
+#define RPC_NT_NO_CALL_ACTIVE __constant_cpu_to_le32(0xC002001A)
+#define RPC_NT_CALL_FAILED __constant_cpu_to_le32(0xC002001B)
+#define RPC_NT_CALL_FAILED_DNE __constant_cpu_to_le32(0xC002001C)
+#define RPC_NT_PROTOCOL_ERROR __constant_cpu_to_le32(0xC002001D)
+#define RPC_NT_UNSUPPORTED_TRANS_SYN __constant_cpu_to_le32(0xC002001F)
+#define RPC_NT_UNSUPPORTED_TYPE __constant_cpu_to_le32(0xC0020021)
+#define RPC_NT_INVALID_TAG __constant_cpu_to_le32(0xC0020022)
+#define RPC_NT_INVALID_BOUND __constant_cpu_to_le32(0xC0020023)
+#define RPC_NT_NO_ENTRY_NAME __constant_cpu_to_le32(0xC0020024)
+#define RPC_NT_INVALID_NAME_SYNTAX __constant_cpu_to_le32(0xC0020025)
+#define RPC_NT_UNSUPPORTED_NAME_SYNTAX __constant_cpu_to_le32(0xC0020026)
+#define RPC_NT_UUID_NO_ADDRESS __constant_cpu_to_le32(0xC0020028)
+#define RPC_NT_DUPLICATE_ENDPOINT __constant_cpu_to_le32(0xC0020029)
+#define RPC_NT_UNKNOWN_AUTHN_TYPE __constant_cpu_to_le32(0xC002002A)
+#define RPC_NT_MAX_CALLS_TOO_SMALL __constant_cpu_to_le32(0xC002002B)
+#define RPC_NT_STRING_TOO_LONG __constant_cpu_to_le32(0xC002002C)
+#define RPC_NT_PROTSEQ_NOT_FOUND __constant_cpu_to_le32(0xC002002D)
+#define RPC_NT_PROCNUM_OUT_OF_RANGE __constant_cpu_to_le32(0xC002002E)
+#define RPC_NT_BINDING_HAS_NO_AUTH __constant_cpu_to_le32(0xC002002F)
+#define RPC_NT_UNKNOWN_AUTHN_SERVICE __constant_cpu_to_le32(0xC0020030)
+#define RPC_NT_UNKNOWN_AUTHN_LEVEL __constant_cpu_to_le32(0xC0020031)
+#define RPC_NT_INVALID_AUTH_IDENTITY __constant_cpu_to_le32(0xC0020032)
+#define RPC_NT_UNKNOWN_AUTHZ_SERVICE __constant_cpu_to_le32(0xC0020033)
+#define EPT_NT_INVALID_ENTRY __constant_cpu_to_le32(0xC0020034)
+#define EPT_NT_CANT_PERFORM_OP __constant_cpu_to_le32(0xC0020035)
+#define EPT_NT_NOT_REGISTERED __constant_cpu_to_le32(0xC0020036)
+#define RPC_NT_NOTHING_TO_EXPORT __constant_cpu_to_le32(0xC0020037)
+#define RPC_NT_INCOMPLETE_NAME __constant_cpu_to_le32(0xC0020038)
+#define RPC_NT_INVALID_VERS_OPTION __constant_cpu_to_le32(0xC0020039)
+#define RPC_NT_NO_MORE_MEMBERS __constant_cpu_to_le32(0xC002003A)
+#define RPC_NT_NOT_ALL_OBJS_UNEXPORTED __constant_cpu_to_le32(0xC002003B)
+#define RPC_NT_INTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC002003C)
+#define RPC_NT_ENTRY_ALREADY_EXISTS __constant_cpu_to_le32(0xC002003D)
+#define RPC_NT_ENTRY_NOT_FOUND __constant_cpu_to_le32(0xC002003E)
+#define RPC_NT_NAME_SERVICE_UNAVAILABLE __constant_cpu_to_le32(0xC002003F)
+#define RPC_NT_INVALID_NAF_ID __constant_cpu_to_le32(0xC0020040)
+#define RPC_NT_CANNOT_SUPPORT __constant_cpu_to_le32(0xC0020041)
+#define RPC_NT_NO_CONTEXT_AVAILABLE __constant_cpu_to_le32(0xC0020042)
+#define RPC_NT_INTERNAL_ERROR __constant_cpu_to_le32(0xC0020043)
+#define RPC_NT_ZERO_DIVIDE __constant_cpu_to_le32(0xC0020044)
+#define RPC_NT_ADDRESS_ERROR __constant_cpu_to_le32(0xC0020045)
+#define RPC_NT_FP_DIV_ZERO __constant_cpu_to_le32(0xC0020046)
+#define RPC_NT_FP_UNDERFLOW __constant_cpu_to_le32(0xC0020047)
+#define RPC_NT_FP_OVERFLOW __constant_cpu_to_le32(0xC0020048)
+#define RPC_NT_CALL_IN_PROGRESS __constant_cpu_to_le32(0xC0020049)
+#define RPC_NT_NO_MORE_BINDINGS __constant_cpu_to_le32(0xC002004A)
+#define RPC_NT_GROUP_MEMBER_NOT_FOUND __constant_cpu_to_le32(0xC002004B)
+#define EPT_NT_CANT_CREATE __constant_cpu_to_le32(0xC002004C)
+#define RPC_NT_INVALID_OBJECT __constant_cpu_to_le32(0xC002004D)
+#define RPC_NT_NO_INTERFACES __constant_cpu_to_le32(0xC002004F)
+#define RPC_NT_CALL_CANCELLED __constant_cpu_to_le32(0xC0020050)
+#define RPC_NT_BINDING_INCOMPLETE __constant_cpu_to_le32(0xC0020051)
+#define RPC_NT_COMM_FAILURE __constant_cpu_to_le32(0xC0020052)
+#define RPC_NT_UNSUPPORTED_AUTHN_LEVEL __constant_cpu_to_le32(0xC0020053)
+#define RPC_NT_NO_PRINC_NAME __constant_cpu_to_le32(0xC0020054)
+#define RPC_NT_NOT_RPC_ERROR __constant_cpu_to_le32(0xC0020055)
+#define RPC_NT_SEC_PKG_ERROR __constant_cpu_to_le32(0xC0020057)
+#define RPC_NT_NOT_CANCELLED __constant_cpu_to_le32(0xC0020058)
+#define RPC_NT_INVALID_ASYNC_HANDLE __constant_cpu_to_le32(0xC0020062)
+#define RPC_NT_INVALID_ASYNC_CALL __constant_cpu_to_le32(0xC0020063)
+#define RPC_NT_PROXY_ACCESS_DENIED __constant_cpu_to_le32(0xC0020064)
+#define RPC_NT_NO_MORE_ENTRIES __constant_cpu_to_le32(0xC0030001)
+#define RPC_NT_SS_CHAR_TRANS_OPEN_FAIL __constant_cpu_to_le32(0xC0030002)
+#define RPC_NT_SS_CHAR_TRANS_SHORT_FILE __constant_cpu_to_le32(0xC0030003)
+#define RPC_NT_SS_IN_NULL_CONTEXT __constant_cpu_to_le32(0xC0030004)
+#define RPC_NT_SS_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0030005)
+#define RPC_NT_SS_CONTEXT_DAMAGED __constant_cpu_to_le32(0xC0030006)
+#define RPC_NT_SS_HANDLES_MISMATCH __constant_cpu_to_le32(0xC0030007)
+#define RPC_NT_SS_CANNOT_GET_CALL_HANDLE __constant_cpu_to_le32(0xC0030008)
+#define RPC_NT_NULL_REF_POINTER __constant_cpu_to_le32(0xC0030009)
+#define RPC_NT_ENUM_VALUE_OUT_OF_RANGE __constant_cpu_to_le32(0xC003000A)
+#define RPC_NT_BYTE_COUNT_TOO_SMALL __constant_cpu_to_le32(0xC003000B)
+#define RPC_NT_BAD_STUB_DATA __constant_cpu_to_le32(0xC003000C)
+#define RPC_NT_INVALID_ES_ACTION __constant_cpu_to_le32(0xC0030059)
+#define RPC_NT_WRONG_ES_VERSION __constant_cpu_to_le32(0xC003005A)
+#define RPC_NT_WRONG_STUB_VERSION __constant_cpu_to_le32(0xC003005B)
+#define RPC_NT_INVALID_PIPE_OBJECT __constant_cpu_to_le32(0xC003005C)
+#define RPC_NT_INVALID_PIPE_OPERATION __constant_cpu_to_le32(0xC003005D)
+#define RPC_NT_WRONG_PIPE_VERSION __constant_cpu_to_le32(0xC003005E)
+#define RPC_NT_PIPE_CLOSED __constant_cpu_to_le32(0xC003005F)
+#define RPC_NT_PIPE_DISCIPLINE_ERROR __constant_cpu_to_le32(0xC0030060)
+#define RPC_NT_PIPE_EMPTY __constant_cpu_to_le32(0xC0030061)
+#define STATUS_PNP_BAD_MPS_TABLE __constant_cpu_to_le32(0xC0040035)
+#define STATUS_PNP_TRANSLATION_FAILED __constant_cpu_to_le32(0xC0040036)
+#define STATUS_PNP_IRQ_TRANSLATION_FAILED __constant_cpu_to_le32(0xC0040037)
+#define STATUS_PNP_INVALID_ID __constant_cpu_to_le32(0xC0040038)
+#define STATUS_IO_REISSUE_AS_CACHED __constant_cpu_to_le32(0xC0040039)
+#define STATUS_CTX_WINSTATION_NAME_INVALID __constant_cpu_to_le32(0xC00A0001)
+#define STATUS_CTX_INVALID_PD __constant_cpu_to_le32(0xC00A0002)
+#define STATUS_CTX_PD_NOT_FOUND __constant_cpu_to_le32(0xC00A0003)
+#define STATUS_CTX_CLOSE_PENDING __constant_cpu_to_le32(0xC00A0006)
+#define STATUS_CTX_NO_OUTBUF __constant_cpu_to_le32(0xC00A0007)
+#define STATUS_CTX_MODEM_INF_NOT_FOUND __constant_cpu_to_le32(0xC00A0008)
+#define STATUS_CTX_INVALID_MODEMNAME __constant_cpu_to_le32(0xC00A0009)
+#define STATUS_CTX_RESPONSE_ERROR __constant_cpu_to_le32(0xC00A000A)
+#define STATUS_CTX_MODEM_RESPONSE_TIMEOUT __constant_cpu_to_le32(0xC00A000B)
+#define STATUS_CTX_MODEM_RESPONSE_NO_CARRIER __constant_cpu_to_le32(0xC00A000C)
+#define STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE __constant_cpu_to_le32(0xC00A000D)
+#define STATUS_CTX_MODEM_RESPONSE_BUSY __constant_cpu_to_le32(0xC00A000E)
+#define STATUS_CTX_MODEM_RESPONSE_VOICE __constant_cpu_to_le32(0xC00A000F)
+#define STATUS_CTX_TD_ERROR __constant_cpu_to_le32(0xC00A0010)
+#define STATUS_CTX_LICENSE_CLIENT_INVALID __constant_cpu_to_le32(0xC00A0012)
+#define STATUS_CTX_LICENSE_NOT_AVAILABLE __constant_cpu_to_le32(0xC00A0013)
+#define STATUS_CTX_LICENSE_EXPIRED __constant_cpu_to_le32(0xC00A0014)
+#define STATUS_CTX_WINSTATION_NOT_FOUND __constant_cpu_to_le32(0xC00A0015)
+#define STATUS_CTX_WINSTATION_NAME_COLLISION __constant_cpu_to_le32(0xC00A0016)
+#define STATUS_CTX_WINSTATION_BUSY __constant_cpu_to_le32(0xC00A0017)
+#define STATUS_CTX_BAD_VIDEO_MODE __constant_cpu_to_le32(0xC00A0018)
+#define STATUS_CTX_GRAPHICS_INVALID __constant_cpu_to_le32(0xC00A0022)
+#define STATUS_CTX_NOT_CONSOLE __constant_cpu_to_le32(0xC00A0024)
+#define STATUS_CTX_CLIENT_QUERY_TIMEOUT __constant_cpu_to_le32(0xC00A0026)
+#define STATUS_CTX_CONSOLE_DISCONNECT __constant_cpu_to_le32(0xC00A0027)
+#define STATUS_CTX_CONSOLE_CONNECT __constant_cpu_to_le32(0xC00A0028)
+#define STATUS_CTX_SHADOW_DENIED __constant_cpu_to_le32(0xC00A002A)
+#define STATUS_CTX_WINSTATION_ACCESS_DENIED __constant_cpu_to_le32(0xC00A002B)
+#define STATUS_CTX_INVALID_WD __constant_cpu_to_le32(0xC00A002E)
+#define STATUS_CTX_WD_NOT_FOUND __constant_cpu_to_le32(0xC00A002F)
+#define STATUS_CTX_SHADOW_INVALID __constant_cpu_to_le32(0xC00A0030)
+#define STATUS_CTX_SHADOW_DISABLED __constant_cpu_to_le32(0xC00A0031)
+#define STATUS_RDP_PROTOCOL_ERROR __constant_cpu_to_le32(0xC00A0032)
+#define STATUS_CTX_CLIENT_LICENSE_NOT_SET __constant_cpu_to_le32(0xC00A0033)
+#define STATUS_CTX_CLIENT_LICENSE_IN_USE __constant_cpu_to_le32(0xC00A0034)
+#define STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE __constant_cpu_to_le32(0xC00A0035)
+#define STATUS_CTX_SHADOW_NOT_RUNNING __constant_cpu_to_le32(0xC00A0036)
+#define STATUS_CTX_LOGON_DISABLED __constant_cpu_to_le32(0xC00A0037)
+#define STATUS_CTX_SECURITY_LAYER_ERROR __constant_cpu_to_le32(0xC00A0038)
+#define STATUS_TS_INCOMPATIBLE_SESSIONS __constant_cpu_to_le32(0xC00A0039)
+#define STATUS_MUI_FILE_NOT_FOUND __constant_cpu_to_le32(0xC00B0001)
+#define STATUS_MUI_INVALID_FILE __constant_cpu_to_le32(0xC00B0002)
+#define STATUS_MUI_INVALID_RC_CONFIG __constant_cpu_to_le32(0xC00B0003)
+#define STATUS_MUI_INVALID_LOCALE_NAME __constant_cpu_to_le32(0xC00B0004)
+#define STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME __constant_cpu_to_le32(0xC00B0005)
+#define STATUS_MUI_FILE_NOT_LOADED __constant_cpu_to_le32(0xC00B0006)
+#define STATUS_RESOURCE_ENUM_USER_STOP __constant_cpu_to_le32(0xC00B0007)
+#define STATUS_CLUSTER_INVALID_NODE __constant_cpu_to_le32(0xC0130001)
+#define STATUS_CLUSTER_NODE_EXISTS __constant_cpu_to_le32(0xC0130002)
+#define STATUS_CLUSTER_JOIN_IN_PROGRESS __constant_cpu_to_le32(0xC0130003)
+#define STATUS_CLUSTER_NODE_NOT_FOUND __constant_cpu_to_le32(0xC0130004)
+#define STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND __constant_cpu_to_le32(0xC0130005)
+#define STATUS_CLUSTER_NETWORK_EXISTS __constant_cpu_to_le32(0xC0130006)
+#define STATUS_CLUSTER_NETWORK_NOT_FOUND __constant_cpu_to_le32(0xC0130007)
+#define STATUS_CLUSTER_NETINTERFACE_EXISTS __constant_cpu_to_le32(0xC0130008)
+#define STATUS_CLUSTER_NETINTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC0130009)
+#define STATUS_CLUSTER_INVALID_REQUEST __constant_cpu_to_le32(0xC013000A)
+#define STATUS_CLUSTER_INVALID_NETWORK_PROVIDER __constant_cpu_to_le32(0xC013000B)
+#define STATUS_CLUSTER_NODE_DOWN __constant_cpu_to_le32(0xC013000C)
+#define STATUS_CLUSTER_NODE_UNREACHABLE __constant_cpu_to_le32(0xC013000D)
+#define STATUS_CLUSTER_NODE_NOT_MEMBER __constant_cpu_to_le32(0xC013000E)
+#define STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS __constant_cpu_to_le32(0xC013000F)
+#define STATUS_CLUSTER_INVALID_NETWORK __constant_cpu_to_le32(0xC0130010)
+#define STATUS_CLUSTER_NO_NET_ADAPTERS __constant_cpu_to_le32(0xC0130011)
+#define STATUS_CLUSTER_NODE_UP __constant_cpu_to_le32(0xC0130012)
+#define STATUS_CLUSTER_NODE_PAUSED __constant_cpu_to_le32(0xC0130013)
+#define STATUS_CLUSTER_NODE_NOT_PAUSED __constant_cpu_to_le32(0xC0130014)
+#define STATUS_CLUSTER_NO_SECURITY_CONTEXT __constant_cpu_to_le32(0xC0130015)
+#define STATUS_CLUSTER_NETWORK_NOT_INTERNAL __constant_cpu_to_le32(0xC0130016)
+#define STATUS_CLUSTER_POISONED __constant_cpu_to_le32(0xC0130017)
+#define STATUS_ACPI_INVALID_OPCODE __constant_cpu_to_le32(0xC0140001)
+#define STATUS_ACPI_STACK_OVERFLOW __constant_cpu_to_le32(0xC0140002)
+#define STATUS_ACPI_ASSERT_FAILED __constant_cpu_to_le32(0xC0140003)
+#define STATUS_ACPI_INVALID_INDEX __constant_cpu_to_le32(0xC0140004)
+#define STATUS_ACPI_INVALID_ARGUMENT __constant_cpu_to_le32(0xC0140005)
+#define STATUS_ACPI_FATAL __constant_cpu_to_le32(0xC0140006)
+#define STATUS_ACPI_INVALID_SUPERNAME __constant_cpu_to_le32(0xC0140007)
+#define STATUS_ACPI_INVALID_ARGTYPE __constant_cpu_to_le32(0xC0140008)
+#define STATUS_ACPI_INVALID_OBJTYPE __constant_cpu_to_le32(0xC0140009)
+#define STATUS_ACPI_INVALID_TARGETTYPE __constant_cpu_to_le32(0xC014000A)
+#define STATUS_ACPI_INCORRECT_ARGUMENT_COUNT __constant_cpu_to_le32(0xC014000B)
+#define STATUS_ACPI_ADDRESS_NOT_MAPPED __constant_cpu_to_le32(0xC014000C)
+#define STATUS_ACPI_INVALID_EVENTTYPE __constant_cpu_to_le32(0xC014000D)
+#define STATUS_ACPI_HANDLER_COLLISION __constant_cpu_to_le32(0xC014000E)
+#define STATUS_ACPI_INVALID_DATA __constant_cpu_to_le32(0xC014000F)
+#define STATUS_ACPI_INVALID_REGION __constant_cpu_to_le32(0xC0140010)
+#define STATUS_ACPI_INVALID_ACCESS_SIZE __constant_cpu_to_le32(0xC0140011)
+#define STATUS_ACPI_ACQUIRE_GLOBAL_LOCK __constant_cpu_to_le32(0xC0140012)
+#define STATUS_ACPI_ALREADY_INITIALIZED __constant_cpu_to_le32(0xC0140013)
+#define STATUS_ACPI_NOT_INITIALIZED __constant_cpu_to_le32(0xC0140014)
+#define STATUS_ACPI_INVALID_MUTEX_LEVEL __constant_cpu_to_le32(0xC0140015)
+#define STATUS_ACPI_MUTEX_NOT_OWNED __constant_cpu_to_le32(0xC0140016)
+#define STATUS_ACPI_MUTEX_NOT_OWNER __constant_cpu_to_le32(0xC0140017)
+#define STATUS_ACPI_RS_ACCESS __constant_cpu_to_le32(0xC0140018)
+#define STATUS_ACPI_INVALID_TABLE __constant_cpu_to_le32(0xC0140019)
+#define STATUS_ACPI_REG_HANDLER_FAILED __constant_cpu_to_le32(0xC0140020)
+#define STATUS_ACPI_POWER_REQUEST_FAILED __constant_cpu_to_le32(0xC0140021)
+#define STATUS_SXS_SECTION_NOT_FOUND __constant_cpu_to_le32(0xC0150001)
+#define STATUS_SXS_CANT_GEN_ACTCTX __constant_cpu_to_le32(0xC0150002)
+#define STATUS_SXS_INVALID_ACTCTXDATA_FORMAT __constant_cpu_to_le32(0xC0150003)
+#define STATUS_SXS_ASSEMBLY_NOT_FOUND __constant_cpu_to_le32(0xC0150004)
+#define STATUS_SXS_MANIFEST_FORMAT_ERROR __constant_cpu_to_le32(0xC0150005)
+#define STATUS_SXS_MANIFEST_PARSE_ERROR __constant_cpu_to_le32(0xC0150006)
+#define STATUS_SXS_ACTIVATION_CONTEXT_DISABLED __constant_cpu_to_le32(0xC0150007)
+#define STATUS_SXS_KEY_NOT_FOUND __constant_cpu_to_le32(0xC0150008)
+#define STATUS_SXS_VERSION_CONFLICT __constant_cpu_to_le32(0xC0150009)
+#define STATUS_SXS_WRONG_SECTION_TYPE __constant_cpu_to_le32(0xC015000A)
+#define STATUS_SXS_THREAD_QUERIES_DISABLED __constant_cpu_to_le32(0xC015000B)
+#define STATUS_SXS_ASSEMBLY_MISSING __constant_cpu_to_le32(0xC015000C)
+#define STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET __constant_cpu_to_le32(0xC015000E)
+#define STATUS_SXS_EARLY_DEACTIVATION __constant_cpu_to_le32(0xC015000F)
+#define STATUS_SXS_INVALID_DEACTIVATION __constant_cpu_to_le32(0xC0150010)
+#define STATUS_SXS_MULTIPLE_DEACTIVATION __constant_cpu_to_le32(0xC0150011)
+#define STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY __constant_cpu_to_le32(0xC0150012)
+#define STATUS_SXS_PROCESS_TERMINATION_REQUESTED __constant_cpu_to_le32(0xC0150013)
+#define STATUS_SXS_CORRUPT_ACTIVATION_STACK __constant_cpu_to_le32(0xC0150014)
+#define STATUS_SXS_CORRUPTION __constant_cpu_to_le32(0xC0150015)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE __constant_cpu_to_le32(0xC0150016)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME __constant_cpu_to_le32(0xC0150017)
+#define STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE __constant_cpu_to_le32(0xC0150018)
+#define STATUS_SXS_IDENTITY_PARSE_ERROR __constant_cpu_to_le32(0xC0150019)
+#define STATUS_SXS_COMPONENT_STORE_CORRUPT __constant_cpu_to_le32(0xC015001A)
+#define STATUS_SXS_FILE_HASH_MISMATCH __constant_cpu_to_le32(0xC015001B)
+#define STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT __constant_cpu_to_le32(0xC015001C)
+#define STATUS_SXS_IDENTITIES_DIFFERENT __constant_cpu_to_le32(0xC015001D)
+#define STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT __constant_cpu_to_le32(0xC015001E)
+#define STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY __constant_cpu_to_le32(0xC015001F)
+#define STATUS_ADVANCED_INSTALLER_FAILED __constant_cpu_to_le32(0xC0150020)
+#define STATUS_XML_ENCODING_MISMATCH __constant_cpu_to_le32(0xC0150021)
+#define STATUS_SXS_MANIFEST_TOO_BIG __constant_cpu_to_le32(0xC0150022)
+#define STATUS_SXS_SETTING_NOT_REGISTERED __constant_cpu_to_le32(0xC0150023)
+#define STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE __constant_cpu_to_le32(0xC0150024)
+#define STATUS_SMI_PRIMITIVE_INSTALLER_FAILED __constant_cpu_to_le32(0xC0150025)
+#define STATUS_GENERIC_COMMAND_FAILED __constant_cpu_to_le32(0xC0150026)
+#define STATUS_SXS_FILE_HASH_MISSING __constant_cpu_to_le32(0xC0150027)
+#define STATUS_TRANSACTIONAL_CONFLICT __constant_cpu_to_le32(0xC0190001)
+#define STATUS_INVALID_TRANSACTION __constant_cpu_to_le32(0xC0190002)
+#define STATUS_TRANSACTION_NOT_ACTIVE __constant_cpu_to_le32(0xC0190003)
+#define STATUS_TM_INITIALIZATION_FAILED __constant_cpu_to_le32(0xC0190004)
+#define STATUS_RM_NOT_ACTIVE __constant_cpu_to_le32(0xC0190005)
+#define STATUS_RM_METADATA_CORRUPT __constant_cpu_to_le32(0xC0190006)
+#define STATUS_TRANSACTION_NOT_JOINED __constant_cpu_to_le32(0xC0190007)
+#define STATUS_DIRECTORY_NOT_RM __constant_cpu_to_le32(0xC0190008)
+#define STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE __constant_cpu_to_le32(0xC019000A)
+#define STATUS_LOG_RESIZE_INVALID_SIZE __constant_cpu_to_le32(0xC019000B)
+#define STATUS_REMOTE_FILE_VERSION_MISMATCH __constant_cpu_to_le32(0xC019000C)
+#define STATUS_CRM_PROTOCOL_ALREADY_EXISTS __constant_cpu_to_le32(0xC019000F)
+#define STATUS_TRANSACTION_PROPAGATION_FAILED __constant_cpu_to_le32(0xC0190010)
+#define STATUS_CRM_PROTOCOL_NOT_FOUND __constant_cpu_to_le32(0xC0190011)
+#define STATUS_TRANSACTION_SUPERIOR_EXISTS __constant_cpu_to_le32(0xC0190012)
+#define STATUS_TRANSACTION_REQUEST_NOT_VALID __constant_cpu_to_le32(0xC0190013)
+#define STATUS_TRANSACTION_NOT_REQUESTED __constant_cpu_to_le32(0xC0190014)
+#define STATUS_TRANSACTION_ALREADY_ABORTED __constant_cpu_to_le32(0xC0190015)
+#define STATUS_TRANSACTION_ALREADY_COMMITTED __constant_cpu_to_le32(0xC0190016)
+#define STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER __constant_cpu_to_le32(0xC0190017)
+#define STATUS_CURRENT_TRANSACTION_NOT_VALID __constant_cpu_to_le32(0xC0190018)
+#define STATUS_LOG_GROWTH_FAILED __constant_cpu_to_le32(0xC0190019)
+#define STATUS_OBJECT_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC0190021)
+#define STATUS_STREAM_MINIVERSION_NOT_FOUND __constant_cpu_to_le32(0xC0190022)
+#define STATUS_STREAM_MINIVERSION_NOT_VALID __constant_cpu_to_le32(0xC0190023)
+#define STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION __constant_cpu_to_le32(0xC0190024)
+#define STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT __constant_cpu_to_le32(0xC0190025)
+#define STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS __constant_cpu_to_le32(0xC0190026)
+#define STATUS_HANDLE_NO_LONGER_VALID __constant_cpu_to_le32(0xC0190028)
+#define STATUS_LOG_CORRUPTION_DETECTED __constant_cpu_to_le32(0xC0190030)
+#define STATUS_RM_DISCONNECTED __constant_cpu_to_le32(0xC0190032)
+#define STATUS_ENLISTMENT_NOT_SUPERIOR __constant_cpu_to_le32(0xC0190033)
+#define STATUS_FILE_IDENTITY_NOT_PERSISTENT __constant_cpu_to_le32(0xC0190036)
+#define STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY __constant_cpu_to_le32(0xC0190037)
+#define STATUS_CANT_CROSS_RM_BOUNDARY __constant_cpu_to_le32(0xC0190038)
+#define STATUS_TXF_DIR_NOT_EMPTY __constant_cpu_to_le32(0xC0190039)
+#define STATUS_INDOUBT_TRANSACTIONS_EXIST __constant_cpu_to_le32(0xC019003A)
+#define STATUS_TM_VOLATILE __constant_cpu_to_le32(0xC019003B)
+#define STATUS_ROLLBACK_TIMER_EXPIRED __constant_cpu_to_le32(0xC019003C)
+#define STATUS_TXF_ATTRIBUTE_CORRUPT __constant_cpu_to_le32(0xC019003D)
+#define STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC019003E)
+#define STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED __constant_cpu_to_le32(0xC019003F)
+#define STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE __constant_cpu_to_le32(0xC0190040)
+#define STATUS_TRANSACTION_REQUIRED_PROMOTION __constant_cpu_to_le32(0xC0190043)
+#define STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION __constant_cpu_to_le32(0xC0190044)
+#define STATUS_TRANSACTIONS_NOT_FROZEN __constant_cpu_to_le32(0xC0190045)
+#define STATUS_TRANSACTION_FREEZE_IN_PROGRESS __constant_cpu_to_le32(0xC0190046)
+#define STATUS_NOT_SNAPSHOT_VOLUME __constant_cpu_to_le32(0xC0190047)
+#define STATUS_NO_SAVEPOINT_WITH_OPEN_FILES __constant_cpu_to_le32(0xC0190048)
+#define STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC0190049)
+#define STATUS_TM_IDENTITY_MISMATCH __constant_cpu_to_le32(0xC019004A)
+#define STATUS_FLOATED_SECTION __constant_cpu_to_le32(0xC019004B)
+#define STATUS_CANNOT_ACCEPT_TRANSACTED_WORK __constant_cpu_to_le32(0xC019004C)
+#define STATUS_CANNOT_ABORT_TRANSACTIONS __constant_cpu_to_le32(0xC019004D)
+#define STATUS_TRANSACTION_NOT_FOUND __constant_cpu_to_le32(0xC019004E)
+#define STATUS_RESOURCEMANAGER_NOT_FOUND __constant_cpu_to_le32(0xC019004F)
+#define STATUS_ENLISTMENT_NOT_FOUND __constant_cpu_to_le32(0xC0190050)
+#define STATUS_TRANSACTIONMANAGER_NOT_FOUND __constant_cpu_to_le32(0xC0190051)
+#define STATUS_TRANSACTIONMANAGER_NOT_ONLINE __constant_cpu_to_le32(0xC0190052)
+#define STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION __constant_cpu_to_le32(0xC0190053)
+#define STATUS_TRANSACTION_NOT_ROOT __constant_cpu_to_le32(0xC0190054)
+#define STATUS_TRANSACTION_OBJECT_EXPIRED __constant_cpu_to_le32(0xC0190055)
+#define STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION __constant_cpu_to_le32(0xC0190056)
+#define STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED __constant_cpu_to_le32(0xC0190057)
+#define STATUS_TRANSACTION_RECORD_TOO_LONG __constant_cpu_to_le32(0xC0190058)
+#define STATUS_NO_LINK_TRACKING_IN_TRANSACTION __constant_cpu_to_le32(0xC0190059)
+#define STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION __constant_cpu_to_le32(0xC019005A)
+#define STATUS_TRANSACTION_INTEGRITY_VIOLATED __constant_cpu_to_le32(0xC019005B)
+#define STATUS_LOG_SECTOR_INVALID __constant_cpu_to_le32(0xC01A0001)
+#define STATUS_LOG_SECTOR_PARITY_INVALID __constant_cpu_to_le32(0xC01A0002)
+#define STATUS_LOG_SECTOR_REMAPPED __constant_cpu_to_le32(0xC01A0003)
+#define STATUS_LOG_BLOCK_INCOMPLETE __constant_cpu_to_le32(0xC01A0004)
+#define STATUS_LOG_INVALID_RANGE __constant_cpu_to_le32(0xC01A0005)
+#define STATUS_LOG_BLOCKS_EXHAUSTED __constant_cpu_to_le32(0xC01A0006)
+#define STATUS_LOG_READ_CONTEXT_INVALID __constant_cpu_to_le32(0xC01A0007)
+#define STATUS_LOG_RESTART_INVALID __constant_cpu_to_le32(0xC01A0008)
+#define STATUS_LOG_BLOCK_VERSION __constant_cpu_to_le32(0xC01A0009)
+#define STATUS_LOG_BLOCK_INVALID __constant_cpu_to_le32(0xC01A000A)
+#define STATUS_LOG_READ_MODE_INVALID __constant_cpu_to_le32(0xC01A000B)
+#define STATUS_LOG_METADATA_CORRUPT __constant_cpu_to_le32(0xC01A000D)
+#define STATUS_LOG_METADATA_INVALID __constant_cpu_to_le32(0xC01A000E)
+#define STATUS_LOG_METADATA_INCONSISTENT __constant_cpu_to_le32(0xC01A000F)
+#define STATUS_LOG_RESERVATION_INVALID __constant_cpu_to_le32(0xC01A0010)
+#define STATUS_LOG_CANT_DELETE __constant_cpu_to_le32(0xC01A0011)
+#define STATUS_LOG_CONTAINER_LIMIT_EXCEEDED __constant_cpu_to_le32(0xC01A0012)
+#define STATUS_LOG_START_OF_LOG __constant_cpu_to_le32(0xC01A0013)
+#define STATUS_LOG_POLICY_ALREADY_INSTALLED __constant_cpu_to_le32(0xC01A0014)
+#define STATUS_LOG_POLICY_NOT_INSTALLED __constant_cpu_to_le32(0xC01A0015)
+#define STATUS_LOG_POLICY_INVALID __constant_cpu_to_le32(0xC01A0016)
+#define STATUS_LOG_POLICY_CONFLICT __constant_cpu_to_le32(0xC01A0017)
+#define STATUS_LOG_PINNED_ARCHIVE_TAIL __constant_cpu_to_le32(0xC01A0018)
+#define STATUS_LOG_RECORD_NONEXISTENT __constant_cpu_to_le32(0xC01A0019)
+#define STATUS_LOG_RECORDS_RESERVED_INVALID __constant_cpu_to_le32(0xC01A001A)
+#define STATUS_LOG_SPACE_RESERVED_INVALID __constant_cpu_to_le32(0xC01A001B)
+#define STATUS_LOG_TAIL_INVALID __constant_cpu_to_le32(0xC01A001C)
+#define STATUS_LOG_FULL __constant_cpu_to_le32(0xC01A001D)
+#define STATUS_LOG_MULTIPLEXED __constant_cpu_to_le32(0xC01A001E)
+#define STATUS_LOG_DEDICATED __constant_cpu_to_le32(0xC01A001F)
+#define STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS __constant_cpu_to_le32(0xC01A0020)
+#define STATUS_LOG_ARCHIVE_IN_PROGRESS __constant_cpu_to_le32(0xC01A0021)
+#define STATUS_LOG_EPHEMERAL __constant_cpu_to_le32(0xC01A0022)
+#define STATUS_LOG_NOT_ENOUGH_CONTAINERS __constant_cpu_to_le32(0xC01A0023)
+#define STATUS_LOG_CLIENT_ALREADY_REGISTERED __constant_cpu_to_le32(0xC01A0024)
+#define STATUS_LOG_CLIENT_NOT_REGISTERED __constant_cpu_to_le32(0xC01A0025)
+#define STATUS_LOG_FULL_HANDLER_IN_PROGRESS __constant_cpu_to_le32(0xC01A0026)
+#define STATUS_LOG_CONTAINER_READ_FAILED __constant_cpu_to_le32(0xC01A0027)
+#define STATUS_LOG_CONTAINER_WRITE_FAILED __constant_cpu_to_le32(0xC01A0028)
+#define STATUS_LOG_CONTAINER_OPEN_FAILED __constant_cpu_to_le32(0xC01A0029)
+#define STATUS_LOG_CONTAINER_STATE_INVALID __constant_cpu_to_le32(0xC01A002A)
+#define STATUS_LOG_STATE_INVALID __constant_cpu_to_le32(0xC01A002B)
+#define STATUS_LOG_PINNED __constant_cpu_to_le32(0xC01A002C)
+#define STATUS_LOG_METADATA_FLUSH_FAILED __constant_cpu_to_le32(0xC01A002D)
+#define STATUS_LOG_INCONSISTENT_SECURITY __constant_cpu_to_le32(0xC01A002E)
+#define STATUS_LOG_APPENDED_FLUSH_FAILED __constant_cpu_to_le32(0xC01A002F)
+#define STATUS_LOG_PINNED_RESERVATION __constant_cpu_to_le32(0xC01A0030)
+#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD __constant_cpu_to_le32(0xC01B00EA)
+#define STATUS_FLT_NO_HANDLER_DEFINED __constant_cpu_to_le32(0xC01C0001)
+#define STATUS_FLT_CONTEXT_ALREADY_DEFINED __constant_cpu_to_le32(0xC01C0002)
+#define STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST __constant_cpu_to_le32(0xC01C0003)
+#define STATUS_FLT_DISALLOW_FAST_IO __constant_cpu_to_le32(0xC01C0004)
+#define STATUS_FLT_INVALID_NAME_REQUEST __constant_cpu_to_le32(0xC01C0005)
+#define STATUS_FLT_NOT_SAFE_TO_POST_OPERATION __constant_cpu_to_le32(0xC01C0006)
+#define STATUS_FLT_NOT_INITIALIZED __constant_cpu_to_le32(0xC01C0007)
+#define STATUS_FLT_FILTER_NOT_READY __constant_cpu_to_le32(0xC01C0008)
+#define STATUS_FLT_POST_OPERATION_CLEANUP __constant_cpu_to_le32(0xC01C0009)
+#define STATUS_FLT_INTERNAL_ERROR __constant_cpu_to_le32(0xC01C000A)
+#define STATUS_FLT_DELETING_OBJECT __constant_cpu_to_le32(0xC01C000B)
+#define STATUS_FLT_MUST_BE_NONPAGED_POOL __constant_cpu_to_le32(0xC01C000C)
+#define STATUS_FLT_DUPLICATE_ENTRY __constant_cpu_to_le32(0xC01C000D)
+#define STATUS_FLT_CBDQ_DISABLED __constant_cpu_to_le32(0xC01C000E)
+#define STATUS_FLT_DO_NOT_ATTACH __constant_cpu_to_le32(0xC01C000F)
+#define STATUS_FLT_DO_NOT_DETACH __constant_cpu_to_le32(0xC01C0010)
+#define STATUS_FLT_INSTANCE_ALTITUDE_COLLISION __constant_cpu_to_le32(0xC01C0011)
+#define STATUS_FLT_INSTANCE_NAME_COLLISION __constant_cpu_to_le32(0xC01C0012)
+#define STATUS_FLT_FILTER_NOT_FOUND __constant_cpu_to_le32(0xC01C0013)
+#define STATUS_FLT_VOLUME_NOT_FOUND __constant_cpu_to_le32(0xC01C0014)
+#define STATUS_FLT_INSTANCE_NOT_FOUND __constant_cpu_to_le32(0xC01C0015)
+#define STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND __constant_cpu_to_le32(0xC01C0016)
+#define STATUS_FLT_INVALID_CONTEXT_REGISTRATION __constant_cpu_to_le32(0xC01C0017)
+#define STATUS_FLT_NAME_CACHE_MISS __constant_cpu_to_le32(0xC01C0018)
+#define STATUS_FLT_NO_DEVICE_OBJECT __constant_cpu_to_le32(0xC01C0019)
+#define STATUS_FLT_VOLUME_ALREADY_MOUNTED __constant_cpu_to_le32(0xC01C001A)
+#define STATUS_FLT_ALREADY_ENLISTED __constant_cpu_to_le32(0xC01C001B)
+#define STATUS_FLT_CONTEXT_ALREADY_LINKED __constant_cpu_to_le32(0xC01C001C)
+#define STATUS_FLT_NO_WAITER_FOR_REPLY __constant_cpu_to_le32(0xC01C0020)
+#define STATUS_MONITOR_NO_DESCRIPTOR __constant_cpu_to_le32(0xC01D0001)
+#define STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT __constant_cpu_to_le32(0xC01D0002)
+#define STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM __constant_cpu_to_le32(0xC01D0003)
+#define STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK __constant_cpu_to_le32(0xC01D0004)
+#define STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED __constant_cpu_to_le32(0xC01D0005)
+#define STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK __constant_cpu_to_le32(0xC01D0006)
+#define STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK __constant_cpu_to_le32(0xC01D0007)
+#define STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA __constant_cpu_to_le32(0xC01D0008)
+#define STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK __constant_cpu_to_le32(0xC01D0009)
+#define STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER __constant_cpu_to_le32(0xC01E0000)
+#define STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER __constant_cpu_to_le32(0xC01E0001)
+#define STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER __constant_cpu_to_le32(0xC01E0002)
+#define STATUS_GRAPHICS_ADAPTER_WAS_RESET __constant_cpu_to_le32(0xC01E0003)
+#define STATUS_GRAPHICS_INVALID_DRIVER_MODEL __constant_cpu_to_le32(0xC01E0004)
+#define STATUS_GRAPHICS_PRESENT_MODE_CHANGED __constant_cpu_to_le32(0xC01E0005)
+#define STATUS_GRAPHICS_PRESENT_OCCLUDED __constant_cpu_to_le32(0xC01E0006)
+#define STATUS_GRAPHICS_PRESENT_DENIED __constant_cpu_to_le32(0xC01E0007)
+#define STATUS_GRAPHICS_CANNOTCOLORCONVERT __constant_cpu_to_le32(0xC01E0008)
+#define STATUS_GRAPHICS_NO_VIDEO_MEMORY __constant_cpu_to_le32(0xC01E0100)
+#define STATUS_GRAPHICS_CANT_LOCK_MEMORY __constant_cpu_to_le32(0xC01E0101)
+#define STATUS_GRAPHICS_ALLOCATION_BUSY __constant_cpu_to_le32(0xC01E0102)
+#define STATUS_GRAPHICS_TOO_MANY_REFERENCES __constant_cpu_to_le32(0xC01E0103)
+#define STATUS_GRAPHICS_TRY_AGAIN_LATER __constant_cpu_to_le32(0xC01E0104)
+#define STATUS_GRAPHICS_TRY_AGAIN_NOW __constant_cpu_to_le32(0xC01E0105)
+#define STATUS_GRAPHICS_ALLOCATION_INVALID __constant_cpu_to_le32(0xC01E0106)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE __constant_cpu_to_le32(0xC01E0107)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED __constant_cpu_to_le32(0xC01E0108)
+#define STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION __constant_cpu_to_le32(0xC01E0109)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE __constant_cpu_to_le32(0xC01E0110)
+#define STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION __constant_cpu_to_le32(0xC01E0111)
+#define STATUS_GRAPHICS_ALLOCATION_CLOSED __constant_cpu_to_le32(0xC01E0112)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE __constant_cpu_to_le32(0xC01E0113)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE __constant_cpu_to_le32(0xC01E0114)
+#define STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE __constant_cpu_to_le32(0xC01E0115)
+#define STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST __constant_cpu_to_le32(0xC01E0116)
+#define STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE __constant_cpu_to_le32(0xC01E0200)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E0300)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0301)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0302)
+#define STATUS_GRAPHICS_INVALID_VIDPN __constant_cpu_to_le32(0xC01E0303)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE __constant_cpu_to_le32(0xC01E0304)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET __constant_cpu_to_le32(0xC01E0305)
+#define STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0306)
+#define STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET __constant_cpu_to_le32(0xC01E0308)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET __constant_cpu_to_le32(0xC01E0309)
+#define STATUS_GRAPHICS_INVALID_FREQUENCY __constant_cpu_to_le32(0xC01E030A)
+#define STATUS_GRAPHICS_INVALID_ACTIVE_REGION __constant_cpu_to_le32(0xC01E030B)
+#define STATUS_GRAPHICS_INVALID_TOTAL_REGION __constant_cpu_to_le32(0xC01E030C)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE __constant_cpu_to_le32(0xC01E0310)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE __constant_cpu_to_le32(0xC01E0311)
+#define STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET __constant_cpu_to_le32(0xC01E0312)
+#define STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0313)
+#define STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET __constant_cpu_to_le32(0xC01E0314)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET __constant_cpu_to_le32(0xC01E0315)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET __constant_cpu_to_le32(0xC01E0316)
+#define STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E0317)
+#define STATUS_GRAPHICS_TARGET_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E0318)
+#define STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH __constant_cpu_to_le32(0xC01E0319)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E031A)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET __constant_cpu_to_le32(0xC01E031B)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE __constant_cpu_to_le32(0xC01E031C)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET __constant_cpu_to_le32(0xC01E031D)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E031F)
+#define STATUS_GRAPHICS_STALE_MODESET __constant_cpu_to_le32(0xC01E0320)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET __constant_cpu_to_le32(0xC01E0321)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE __constant_cpu_to_le32(0xC01E0322)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN __constant_cpu_to_le32(0xC01E0323)
+#define STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0324)
+#define STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION __constant_cpu_to_le32(0xC01E0325)
+#define STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES __constant_cpu_to_le32(0xC01E0326)
+#define STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0327)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE __constant_cpu_to_le32(0xC01E0328)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET __constant_cpu_to_le32(0xC01E0329)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET __constant_cpu_to_le32(0xC01E032A)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR __constant_cpu_to_le32(0xC01E032B)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET __constant_cpu_to_le32(0xC01E032C)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET __constant_cpu_to_le32(0xC01E032D)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E032E)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE __constant_cpu_to_le32(0xC01E032F)
+#define STATUS_GRAPHICS_RESOURCES_NOT_RELATED __constant_cpu_to_le32(0xC01E0330)
+#define STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0331)
+#define STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE __constant_cpu_to_le32(0xC01E0332)
+#define STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET __constant_cpu_to_le32(0xC01E0333)
+#define STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER __constant_cpu_to_le32(0xC01E0334)
+#define STATUS_GRAPHICS_NO_VIDPNMGR __constant_cpu_to_le32(0xC01E0335)
+#define STATUS_GRAPHICS_NO_ACTIVE_VIDPN __constant_cpu_to_le32(0xC01E0336)
+#define STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY __constant_cpu_to_le32(0xC01E0337)
+#define STATUS_GRAPHICS_MONITOR_NOT_CONNECTED __constant_cpu_to_le32(0xC01E0338)
+#define STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0339)
+#define STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE __constant_cpu_to_le32(0xC01E033A)
+#define STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE __constant_cpu_to_le32(0xC01E033B)
+#define STATUS_GRAPHICS_INVALID_STRIDE __constant_cpu_to_le32(0xC01E033C)
+#define STATUS_GRAPHICS_INVALID_PIXELFORMAT __constant_cpu_to_le32(0xC01E033D)
+#define STATUS_GRAPHICS_INVALID_COLORBASIS __constant_cpu_to_le32(0xC01E033E)
+#define STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE __constant_cpu_to_le32(0xC01E033F)
+#define STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY __constant_cpu_to_le32(0xC01E0340)
+#define STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT __constant_cpu_to_le32(0xC01E0341)
+#define STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE __constant_cpu_to_le32(0xC01E0342)
+#define STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN __constant_cpu_to_le32(0xC01E0343)
+#define STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL __constant_cpu_to_le32(0xC01E0344)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION __constant_cpu_to_le32(0xC01E0345)
+#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0346)
+#define STATUS_GRAPHICS_INVALID_GAMMA_RAMP __constant_cpu_to_le32(0xC01E0347)
+#define STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0348)
+#define STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0349)
+#define STATUS_GRAPHICS_MODE_NOT_IN_MODESET __constant_cpu_to_le32(0xC01E034A)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON __constant_cpu_to_le32(0xC01E034D)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE __constant_cpu_to_le32(0xC01E034E)
+#define STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE __constant_cpu_to_le32(0xC01E034F)
+#define STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS __constant_cpu_to_le32(0xC01E0350)
+#define STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING __constant_cpu_to_le32(0xC01E0352)
+#define STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED __constant_cpu_to_le32(0xC01E0353)
+#define STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS __constant_cpu_to_le32(0xC01E0354)
+#define STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT __constant_cpu_to_le32(0xC01E0355)
+#define STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM __constant_cpu_to_le32(0xC01E0356)
+#define STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN __constant_cpu_to_le32(0xC01E0357)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT __constant_cpu_to_le32(0xC01E0358)
+#define STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED __constant_cpu_to_le32(0xC01E0359)
+#define STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION __constant_cpu_to_le32(0xC01E035A)
+#define STATUS_GRAPHICS_INVALID_CLIENT_TYPE __constant_cpu_to_le32(0xC01E035B)
+#define STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET __constant_cpu_to_le32(0xC01E035C)
+#define STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED __constant_cpu_to_le32(0xC01E0400)
+#define STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0401)
+#define STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER __constant_cpu_to_le32(0xC01E0430)
+#define STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED __constant_cpu_to_le32(0xC01E0431)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED __constant_cpu_to_le32(0xC01E0432)
+#define STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY __constant_cpu_to_le32(0xC01E0433)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED __constant_cpu_to_le32(0xC01E0434)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON __constant_cpu_to_le32(0xC01E0435)
+#define STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE __constant_cpu_to_le32(0xC01E0436)
+#define STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER __constant_cpu_to_le32(0xC01E0438)
+#define STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED __constant_cpu_to_le32(0xC01E043B)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS __constant_cpu_to_le32(0xC01E051C)
+#define STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST __constant_cpu_to_le32(0xC01E051D)
+#define STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E051E)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS __constant_cpu_to_le32(0xC01E051F)
+#define STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0520)
+#define STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST __constant_cpu_to_le32(0xC01E0521)
+#define STATUS_GRAPHICS_OPM_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0500)
+#define STATUS_GRAPHICS_COPP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0501)
+#define STATUS_GRAPHICS_UAB_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0502)
+#define STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS __constant_cpu_to_le32(0xC01E0503)
+#define STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL __constant_cpu_to_le32(0xC01E0504)
+#define STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST __constant_cpu_to_le32(0xC01E0505)
+#define STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME __constant_cpu_to_le32(0xC01E0506)
+#define STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP __constant_cpu_to_le32(0xC01E0507)
+#define STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0508)
+#define STATUS_GRAPHICS_OPM_INVALID_POINTER __constant_cpu_to_le32(0xC01E050A)
+#define STATUS_GRAPHICS_OPM_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E050B)
+#define STATUS_GRAPHICS_OPM_INVALID_HANDLE __constant_cpu_to_le32(0xC01E050C)
+#define STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE __constant_cpu_to_le32(0xC01E050D)
+#define STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH __constant_cpu_to_le32(0xC01E050E)
+#define STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED __constant_cpu_to_le32(0xC01E050F)
+#define STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED __constant_cpu_to_le32(0xC01E0510)
+#define STATUS_GRAPHICS_PVP_HFS_FAILED __constant_cpu_to_le32(0xC01E0511)
+#define STATUS_GRAPHICS_OPM_INVALID_SRM __constant_cpu_to_le32(0xC01E0512)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP __constant_cpu_to_le32(0xC01E0513)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP __constant_cpu_to_le32(0xC01E0514)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA __constant_cpu_to_le32(0xC01E0515)
+#define STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET __constant_cpu_to_le32(0xC01E0516)
+#define STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH __constant_cpu_to_le32(0xC01E0517)
+#define STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE __constant_cpu_to_le32(0xC01E0518)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC01E051A)
+#define STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS __constant_cpu_to_le32(0xC01E051B)
+#define STATUS_GRAPHICS_I2C_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0580)
+#define STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST __constant_cpu_to_le32(0xC01E0581)
+#define STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA __constant_cpu_to_le32(0xC01E0582)
+#define STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA __constant_cpu_to_le32(0xC01E0583)
+#define STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E0584)
+#define STATUS_GRAPHICS_DDCCI_INVALID_DATA __constant_cpu_to_le32(0xC01E0585)
+#define STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE __constant_cpu_to_le32(0xC01E0586)
+#define STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING __constant_cpu_to_le32(0xC01E0587)
+#define STATUS_GRAPHICS_MCA_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E0588)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND __constant_cpu_to_le32(0xC01E0589)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH __constant_cpu_to_le32(0xC01E058A)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM __constant_cpu_to_le32(0xC01E058B)
+#define STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE __constant_cpu_to_le32(0xC01E058C)
+#define STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS __constant_cpu_to_le32(0xC01E058D)
+#define STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED __constant_cpu_to_le32(0xC01E05E0)
+#define STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME __constant_cpu_to_le32(0xC01E05E1)
+#define STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP __constant_cpu_to_le32(0xC01E05E2)
+#define STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED __constant_cpu_to_le32(0xC01E05E3)
+#define STATUS_GRAPHICS_INVALID_POINTER __constant_cpu_to_le32(0xC01E05E4)
+#define STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE __constant_cpu_to_le32(0xC01E05E5)
+#define STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL __constant_cpu_to_le32(0xC01E05E6)
+#define STATUS_GRAPHICS_INTERNAL_ERROR __constant_cpu_to_le32(0xC01E05E7)
+#define STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS __constant_cpu_to_le32(0xC01E05E8)
+#define STATUS_FVE_LOCKED_VOLUME __constant_cpu_to_le32(0xC0210000)
+#define STATUS_FVE_NOT_ENCRYPTED __constant_cpu_to_le32(0xC0210001)
+#define STATUS_FVE_BAD_INFORMATION __constant_cpu_to_le32(0xC0210002)
+#define STATUS_FVE_TOO_SMALL __constant_cpu_to_le32(0xC0210003)
+#define STATUS_FVE_FAILED_WRONG_FS __constant_cpu_to_le32(0xC0210004)
+#define STATUS_FVE_FAILED_BAD_FS __constant_cpu_to_le32(0xC0210005)
+#define STATUS_FVE_FS_NOT_EXTENDED __constant_cpu_to_le32(0xC0210006)
+#define STATUS_FVE_FS_MOUNTED __constant_cpu_to_le32(0xC0210007)
+#define STATUS_FVE_NO_LICENSE __constant_cpu_to_le32(0xC0210008)
+#define STATUS_FVE_ACTION_NOT_ALLOWED __constant_cpu_to_le32(0xC0210009)
+#define STATUS_FVE_BAD_DATA __constant_cpu_to_le32(0xC021000A)
+#define STATUS_FVE_VOLUME_NOT_BOUND __constant_cpu_to_le32(0xC021000B)
+#define STATUS_FVE_NOT_DATA_VOLUME __constant_cpu_to_le32(0xC021000C)
+#define STATUS_FVE_CONV_READ_ERROR __constant_cpu_to_le32(0xC021000D)
+#define STATUS_FVE_CONV_WRITE_ERROR __constant_cpu_to_le32(0xC021000E)
+#define STATUS_FVE_OVERLAPPED_UPDATE __constant_cpu_to_le32(0xC021000F)
+#define STATUS_FVE_FAILED_SECTOR_SIZE __constant_cpu_to_le32(0xC0210010)
+#define STATUS_FVE_FAILED_AUTHENTICATION __constant_cpu_to_le32(0xC0210011)
+#define STATUS_FVE_NOT_OS_VOLUME __constant_cpu_to_le32(0xC0210012)
+#define STATUS_FVE_KEYFILE_NOT_FOUND __constant_cpu_to_le32(0xC0210013)
+#define STATUS_FVE_KEYFILE_INVALID __constant_cpu_to_le32(0xC0210014)
+#define STATUS_FVE_KEYFILE_NO_VMK __constant_cpu_to_le32(0xC0210015)
+#define STATUS_FVE_TPM_DISABLED __constant_cpu_to_le32(0xC0210016)
+#define STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO __constant_cpu_to_le32(0xC0210017)
+#define STATUS_FVE_TPM_INVALID_PCR __constant_cpu_to_le32(0xC0210018)
+#define STATUS_FVE_TPM_NO_VMK __constant_cpu_to_le32(0xC0210019)
+#define STATUS_FVE_PIN_INVALID __constant_cpu_to_le32(0xC021001A)
+#define STATUS_FVE_AUTH_INVALID_APPLICATION __constant_cpu_to_le32(0xC021001B)
+#define STATUS_FVE_AUTH_INVALID_CONFIG __constant_cpu_to_le32(0xC021001C)
+#define STATUS_FVE_DEBUGGER_ENABLED __constant_cpu_to_le32(0xC021001D)
+#define STATUS_FVE_DRY_RUN_FAILED __constant_cpu_to_le32(0xC021001E)
+#define STATUS_FVE_BAD_METADATA_POINTER __constant_cpu_to_le32(0xC021001F)
+#define STATUS_FVE_OLD_METADATA_COPY __constant_cpu_to_le32(0xC0210020)
+#define STATUS_FVE_REBOOT_REQUIRED __constant_cpu_to_le32(0xC0210021)
+#define STATUS_FVE_RAW_ACCESS __constant_cpu_to_le32(0xC0210022)
+#define STATUS_FVE_RAW_BLOCKED __constant_cpu_to_le32(0xC0210023)
+#define STATUS_FWP_CALLOUT_NOT_FOUND __constant_cpu_to_le32(0xC0220001)
+#define STATUS_FWP_CONDITION_NOT_FOUND __constant_cpu_to_le32(0xC0220002)
+#define STATUS_FWP_FILTER_NOT_FOUND __constant_cpu_to_le32(0xC0220003)
+#define STATUS_FWP_LAYER_NOT_FOUND __constant_cpu_to_le32(0xC0220004)
+#define STATUS_FWP_PROVIDER_NOT_FOUND __constant_cpu_to_le32(0xC0220005)
+#define STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND __constant_cpu_to_le32(0xC0220006)
+#define STATUS_FWP_SUBLAYER_NOT_FOUND __constant_cpu_to_le32(0xC0220007)
+#define STATUS_FWP_NOT_FOUND __constant_cpu_to_le32(0xC0220008)
+#define STATUS_FWP_ALREADY_EXISTS __constant_cpu_to_le32(0xC0220009)
+#define STATUS_FWP_IN_USE __constant_cpu_to_le32(0xC022000A)
+#define STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS __constant_cpu_to_le32(0xC022000B)
+#define STATUS_FWP_WRONG_SESSION __constant_cpu_to_le32(0xC022000C)
+#define STATUS_FWP_NO_TXN_IN_PROGRESS __constant_cpu_to_le32(0xC022000D)
+#define STATUS_FWP_TXN_IN_PROGRESS __constant_cpu_to_le32(0xC022000E)
+#define STATUS_FWP_TXN_ABORTED __constant_cpu_to_le32(0xC022000F)
+#define STATUS_FWP_SESSION_ABORTED __constant_cpu_to_le32(0xC0220010)
+#define STATUS_FWP_INCOMPATIBLE_TXN __constant_cpu_to_le32(0xC0220011)
+#define STATUS_FWP_TIMEOUT __constant_cpu_to_le32(0xC0220012)
+#define STATUS_FWP_NET_EVENTS_DISABLED __constant_cpu_to_le32(0xC0220013)
+#define STATUS_FWP_INCOMPATIBLE_LAYER __constant_cpu_to_le32(0xC0220014)
+#define STATUS_FWP_KM_CLIENTS_ONLY __constant_cpu_to_le32(0xC0220015)
+#define STATUS_FWP_LIFETIME_MISMATCH __constant_cpu_to_le32(0xC0220016)
+#define STATUS_FWP_BUILTIN_OBJECT __constant_cpu_to_le32(0xC0220017)
+#define STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS __constant_cpu_to_le32(0xC0220018)
+#define STATUS_FWP_TOO_MANY_CALLOUTS __constant_cpu_to_le32(0xC0220018)
+#define STATUS_FWP_NOTIFICATION_DROPPED __constant_cpu_to_le32(0xC0220019)
+#define STATUS_FWP_TRAFFIC_MISMATCH __constant_cpu_to_le32(0xC022001A)
+#define STATUS_FWP_INCOMPATIBLE_SA_STATE __constant_cpu_to_le32(0xC022001B)
+#define STATUS_FWP_NULL_POINTER __constant_cpu_to_le32(0xC022001C)
+#define STATUS_FWP_INVALID_ENUMERATOR __constant_cpu_to_le32(0xC022001D)
+#define STATUS_FWP_INVALID_FLAGS __constant_cpu_to_le32(0xC022001E)
+#define STATUS_FWP_INVALID_NET_MASK __constant_cpu_to_le32(0xC022001F)
+#define STATUS_FWP_INVALID_RANGE __constant_cpu_to_le32(0xC0220020)
+#define STATUS_FWP_INVALID_INTERVAL __constant_cpu_to_le32(0xC0220021)
+#define STATUS_FWP_ZERO_LENGTH_ARRAY __constant_cpu_to_le32(0xC0220022)
+#define STATUS_FWP_NULL_DISPLAY_NAME __constant_cpu_to_le32(0xC0220023)
+#define STATUS_FWP_INVALID_ACTION_TYPE __constant_cpu_to_le32(0xC0220024)
+#define STATUS_FWP_INVALID_WEIGHT __constant_cpu_to_le32(0xC0220025)
+#define STATUS_FWP_MATCH_TYPE_MISMATCH __constant_cpu_to_le32(0xC0220026)
+#define STATUS_FWP_TYPE_MISMATCH __constant_cpu_to_le32(0xC0220027)
+#define STATUS_FWP_OUT_OF_BOUNDS __constant_cpu_to_le32(0xC0220028)
+#define STATUS_FWP_RESERVED __constant_cpu_to_le32(0xC0220029)
+#define STATUS_FWP_DUPLICATE_CONDITION __constant_cpu_to_le32(0xC022002A)
+#define STATUS_FWP_DUPLICATE_KEYMOD __constant_cpu_to_le32(0xC022002B)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER __constant_cpu_to_le32(0xC022002C)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER __constant_cpu_to_le32(0xC022002D)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER __constant_cpu_to_le32(0xC022002E)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT __constant_cpu_to_le32(0xC022002F)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_METHOD __constant_cpu_to_le32(0xC0220030)
+#define STATUS_FWP_INCOMPATIBLE_DH_GROUP __constant_cpu_to_le32(0xC0220031)
+#define STATUS_FWP_EM_NOT_SUPPORTED __constant_cpu_to_le32(0xC0220032)
+#define STATUS_FWP_NEVER_MATCH __constant_cpu_to_le32(0xC0220033)
+#define STATUS_FWP_PROVIDER_CONTEXT_MISMATCH __constant_cpu_to_le32(0xC0220034)
+#define STATUS_FWP_INVALID_PARAMETER __constant_cpu_to_le32(0xC0220035)
+#define STATUS_FWP_TOO_MANY_SUBLAYERS __constant_cpu_to_le32(0xC0220036)
+#define STATUS_FWP_CALLOUT_NOTIFICATION_FAILED __constant_cpu_to_le32(0xC0220037)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG __constant_cpu_to_le32(0xC0220038)
+#define STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG __constant_cpu_to_le32(0xC0220039)
+#define STATUS_FWP_TCPIP_NOT_READY __constant_cpu_to_le32(0xC0220100)
+#define STATUS_FWP_INJECT_HANDLE_CLOSING __constant_cpu_to_le32(0xC0220101)
+#define STATUS_FWP_INJECT_HANDLE_STALE __constant_cpu_to_le32(0xC0220102)
+#define STATUS_FWP_CANNOT_PEND __constant_cpu_to_le32(0xC0220103)
+#define STATUS_NDIS_CLOSING __constant_cpu_to_le32(0xC0230002)
+#define STATUS_NDIS_BAD_VERSION __constant_cpu_to_le32(0xC0230004)
+#define STATUS_NDIS_BAD_CHARACTERISTICS __constant_cpu_to_le32(0xC0230005)
+#define STATUS_NDIS_ADAPTER_NOT_FOUND __constant_cpu_to_le32(0xC0230006)
+#define STATUS_NDIS_OPEN_FAILED __constant_cpu_to_le32(0xC0230007)
+#define STATUS_NDIS_DEVICE_FAILED __constant_cpu_to_le32(0xC0230008)
+#define STATUS_NDIS_MULTICAST_FULL __constant_cpu_to_le32(0xC0230009)
+#define STATUS_NDIS_MULTICAST_EXISTS __constant_cpu_to_le32(0xC023000A)
+#define STATUS_NDIS_MULTICAST_NOT_FOUND __constant_cpu_to_le32(0xC023000B)
+#define STATUS_NDIS_REQUEST_ABORTED __constant_cpu_to_le32(0xC023000C)
+#define STATUS_NDIS_RESET_IN_PROGRESS __constant_cpu_to_le32(0xC023000D)
+#define STATUS_NDIS_INVALID_PACKET __constant_cpu_to_le32(0xC023000F)
+#define STATUS_NDIS_INVALID_DEVICE_REQUEST __constant_cpu_to_le32(0xC0230010)
+#define STATUS_NDIS_ADAPTER_NOT_READY __constant_cpu_to_le32(0xC0230011)
+#define STATUS_NDIS_INVALID_LENGTH __constant_cpu_to_le32(0xC0230014)
+#define STATUS_NDIS_INVALID_DATA __constant_cpu_to_le32(0xC0230015)
+#define STATUS_NDIS_BUFFER_TOO_SHORT __constant_cpu_to_le32(0xC0230016)
+#define STATUS_NDIS_INVALID_OID __constant_cpu_to_le32(0xC0230017)
+#define STATUS_NDIS_ADAPTER_REMOVED __constant_cpu_to_le32(0xC0230018)
+#define STATUS_NDIS_UNSUPPORTED_MEDIA __constant_cpu_to_le32(0xC0230019)
+#define STATUS_NDIS_GROUP_ADDRESS_IN_USE __constant_cpu_to_le32(0xC023001A)
+#define STATUS_NDIS_FILE_NOT_FOUND __constant_cpu_to_le32(0xC023001B)
+#define STATUS_NDIS_ERROR_READING_FILE __constant_cpu_to_le32(0xC023001C)
+#define STATUS_NDIS_ALREADY_MAPPED __constant_cpu_to_le32(0xC023001D)
+#define STATUS_NDIS_RESOURCE_CONFLICT __constant_cpu_to_le32(0xC023001E)
+#define STATUS_NDIS_MEDIA_DISCONNECTED __constant_cpu_to_le32(0xC023001F)
+#define STATUS_NDIS_INVALID_ADDRESS __constant_cpu_to_le32(0xC0230022)
+#define STATUS_NDIS_PAUSED __constant_cpu_to_le32(0xC023002A)
+#define STATUS_NDIS_INTERFACE_NOT_FOUND __constant_cpu_to_le32(0xC023002B)
+#define STATUS_NDIS_UNSUPPORTED_REVISION __constant_cpu_to_le32(0xC023002C)
+#define STATUS_NDIS_INVALID_PORT __constant_cpu_to_le32(0xC023002D)
+#define STATUS_NDIS_INVALID_PORT_STATE __constant_cpu_to_le32(0xC023002E)
+#define STATUS_NDIS_LOW_POWER_STATE __constant_cpu_to_le32(0xC023002F)
+#define STATUS_NDIS_NOT_SUPPORTED __constant_cpu_to_le32(0xC02300BB)
+#define STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED __constant_cpu_to_le32(0xC0232000)
+#define STATUS_NDIS_DOT11_MEDIA_IN_USE __constant_cpu_to_le32(0xC0232001)
+#define STATUS_NDIS_DOT11_POWER_STATE_INVALID __constant_cpu_to_le32(0xC0232002)
+#define STATUS_IPSEC_BAD_SPI __constant_cpu_to_le32(0xC0360001)
+#define STATUS_IPSEC_SA_LIFETIME_EXPIRED __constant_cpu_to_le32(0xC0360002)
+#define STATUS_IPSEC_WRONG_SA __constant_cpu_to_le32(0xC0360003)
+#define STATUS_IPSEC_REPLAY_CHECK_FAILED __constant_cpu_to_le32(0xC0360004)
+#define STATUS_IPSEC_INVALID_PACKET __constant_cpu_to_le32(0xC0360005)
+#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED __constant_cpu_to_le32(0xC0360006)
+#define STATUS_IPSEC_CLEAR_TEXT_DROP __constant_cpu_to_le32(0xC0360007)
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
new file mode 100644
index 00000000000..59c748ce872
--- /dev/null
+++ b/fs/cifs/smb2transport.c
@@ -0,0 +1,607 @@
+/*
+ * fs/cifs/smb2transport.c
+ *
+ * Copyright (C) International Business Machines Corp., 2002, 2011
+ * Etersoft, 2012
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Jeremy Allison (jra@samba.org) 2006
+ * Pavel Shilovsky (pshilovsky@samba.org) 2012
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <asm/processor.h>
+#include <linux/mempool.h>
+#include <linux/highmem.h>
+#include "smb2pdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "smb2proto.h"
+#include "cifs_debug.h"
+#include "smb2status.h"
+#include "smb2glob.h"
+
+static int
+smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+ int rc;
+ unsigned int size;
+
+ if (server->secmech.sdeschmacsha256 != NULL)
+ return 0; /* already allocated */
+
+ server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
+ if (IS_ERR(server->secmech.hmacsha256)) {
+ cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
+ rc = PTR_ERR(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ return rc;
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.hmacsha256);
+ server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdeschmacsha256) {
+ crypto_free_shash(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
+ server->secmech.sdeschmacsha256->shash.flags = 0x0;
+
+ return 0;
+}
+
+static int
+smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+ unsigned int size;
+ int rc;
+
+ if (server->secmech.sdesccmacaes != NULL)
+ return 0; /* already allocated */
+
+ rc = smb2_crypto_shash_allocate(server);
+ if (rc)
+ return rc;
+
+ server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
+ if (IS_ERR(server->secmech.cmacaes)) {
+ cifs_dbg(VFS, "could not allocate crypto cmac-aes");
+ kfree(server->secmech.sdeschmacsha256);
+ server->secmech.sdeschmacsha256 = NULL;
+ crypto_free_shash(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ rc = PTR_ERR(server->secmech.cmacaes);
+ server->secmech.cmacaes = NULL;
+ return rc;
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->secmech.cmacaes);
+ server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
+ if (!server->secmech.sdesccmacaes) {
+ cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
+ kfree(server->secmech.sdeschmacsha256);
+ server->secmech.sdeschmacsha256 = NULL;
+ crypto_free_shash(server->secmech.hmacsha256);
+ crypto_free_shash(server->secmech.cmacaes);
+ server->secmech.hmacsha256 = NULL;
+ server->secmech.cmacaes = NULL;
+ return -ENOMEM;
+ }
+ server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
+ server->secmech.sdesccmacaes->shash.flags = 0x0;
+
+ return 0;
+}
+
+static struct cifs_ses *
+smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server)
+{
+ struct cifs_ses *ses;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (ses->Suid != smb2hdr->SessionId)
+ continue;
+ spin_unlock(&cifs_tcp_ses_lock);
+ return ses;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ return NULL;
+}
+
+
+int
+smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+ int i, rc;
+ unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
+ unsigned char *sigptr = smb2_signature;
+ struct kvec *iov = rqst->rq_iov;
+ int n_vec = rqst->rq_nvec;
+ struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+ struct cifs_ses *ses;
+
+ ses = smb2_find_smb_ses(smb2_pdu, server);
+ if (!ses) {
+ cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+ return 0;
+ }
+
+ memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
+ memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+
+ rc = smb2_crypto_shash_allocate(server);
+ if (rc) {
+		cifs_dbg(VFS, "%s: sha256 alloc failed\n", __func__);
+ return rc;
+ }
+
+ rc = crypto_shash_setkey(server->secmech.hmacsha256,
+ ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+ return rc;
+ }
+
+ rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not init sha256", __func__);
+ return rc;
+ }
+
+ for (i = 0; i < n_vec; i++) {
+ if (iov[i].iov_len == 0)
+ continue;
+ if (iov[i].iov_base == NULL) {
+ cifs_dbg(VFS, "null iovec entry\n");
+ return -EIO;
+ }
+ /*
+		 * The first entry includes a length field (which does not get
+		 * signed) that occupies the first 4 bytes before the header.
+ */
+ if (i == 0) {
+ if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
+ break; /* nothing to sign or corrupt header */
+ rc =
+ crypto_shash_update(
+ &server->secmech.sdeschmacsha256->shash,
+ iov[i].iov_base + 4, iov[i].iov_len - 4);
+ } else {
+ rc =
+ crypto_shash_update(
+ &server->secmech.sdeschmacsha256->shash,
+ iov[i].iov_base, iov[i].iov_len);
+ }
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with payload\n",
+ __func__);
+ return rc;
+ }
+ }
+
+ /* now hash over the rq_pages array */
+ for (i = 0; i < rqst->rq_npages; i++) {
+ struct kvec p_iov;
+
+ cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+ crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ p_iov.iov_base, p_iov.iov_len);
+ kunmap(rqst->rq_pages[i]);
+ }
+
+ rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
+ sigptr);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
+
+ memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+
+ return rc;
+}
+
+int
+generate_smb3signingkey(struct cifs_ses *ses)
+{
+ unsigned char zero = 0x0;
+ __u8 i[4] = {0, 0, 0, 1};
+ __u8 L[4] = {0, 0, 0, 128};
+ int rc = 0;
+ unsigned char prfhash[SMB2_HMACSHA256_SIZE];
+ unsigned char *hashptr = prfhash;
+
+ memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
+ memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+
+ rc = smb3_crypto_shash_allocate(ses->server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_setkey(ses->server->secmech.hmacsha256,
+ ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_init(&ses->server->secmech.sdeschmacsha256->shash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+ i, 4);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+ "SMB2AESCMAC", 12);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+ &zero, 1);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+ "SmbSign", 8);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
+ L, 4);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_final(&ses->server->secmech.sdeschmacsha256->shash,
+ hashptr);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
+ goto smb3signkey_ret;
+ }
+
+ memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
+
+smb3signkey_ret:
+ return rc;
+}
+
+int
+smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+ int i;
+ int rc = 0;
+ unsigned char smb3_signature[SMB2_CMACAES_SIZE];
+ unsigned char *sigptr = smb3_signature;
+ struct kvec *iov = rqst->rq_iov;
+ int n_vec = rqst->rq_nvec;
+ struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+ struct cifs_ses *ses;
+
+ ses = smb2_find_smb_ses(smb2_pdu, server);
+ if (!ses) {
+ cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+ return 0;
+ }
+
+ memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
+ memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+
+ rc = crypto_shash_setkey(server->secmech.cmacaes,
+ ses->smb3signingkey, SMB2_CMACAES_SIZE);
+
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
+ return rc;
+ }
+
+ /*
+	 * We already allocated sdesccmacaes when we initialized the smb3
+	 * signing key, so unlike the smb2 case we do not have to check here
+	 * whether the secmech is initialized.
+ */
+ rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
+ return rc;
+ }
+
+ for (i = 0; i < n_vec; i++) {
+ if (iov[i].iov_len == 0)
+ continue;
+ if (iov[i].iov_base == NULL) {
+ cifs_dbg(VFS, "null iovec entry");
+ return -EIO;
+ }
+ /*
+		 * The first entry includes a length field (which does not get
+		 * signed) that occupies the first 4 bytes before the header.
+ */
+ if (i == 0) {
+ if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
+ break; /* nothing to sign or corrupt header */
+ rc =
+ crypto_shash_update(
+ &server->secmech.sdesccmacaes->shash,
+ iov[i].iov_base + 4, iov[i].iov_len - 4);
+ } else {
+ rc =
+ crypto_shash_update(
+ &server->secmech.sdesccmacaes->shash,
+ iov[i].iov_base, iov[i].iov_len);
+ }
+ if (rc) {
+ cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n",
+ __func__);
+ return rc;
+ }
+ }
+
+ /* now hash over the rq_pages array */
+ for (i = 0; i < rqst->rq_npages; i++) {
+ struct kvec p_iov;
+
+ cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+ crypto_shash_update(&server->secmech.sdesccmacaes->shash,
+ p_iov.iov_base, p_iov.iov_len);
+ kunmap(rqst->rq_pages[i]);
+ }
+
+ rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash,
+ sigptr);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__);
+
+ memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+
+ return rc;
+}
+
+/* must be called with server->srv_mutex held */
+static int
+smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+ int rc = 0;
+ struct smb2_hdr *smb2_pdu = rqst->rq_iov[0].iov_base;
+
+ if (!(smb2_pdu->Flags & SMB2_FLAGS_SIGNED) ||
+ server->tcpStatus == CifsNeedNegotiate)
+ return rc;
+
+ if (!server->session_estab) {
+ strncpy(smb2_pdu->Signature, "BSRSPYL", 8);
+ return rc;
+ }
+
+ rc = server->ops->calc_signature(rqst, server);
+
+ return rc;
+}
+
+int
+smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+	int rc;
+ char server_response_sig[16];
+ struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
+
+ if ((smb2_pdu->Command == SMB2_NEGOTIATE) ||
+ (smb2_pdu->Command == SMB2_SESSION_SETUP) ||
+ (smb2_pdu->Command == SMB2_OPLOCK_BREAK) ||
+ (!server->session_estab))
+ return 0;
+
+ /*
+ * BB what if signatures are supposed to be on for session but
+ * server does not send one? BB
+ */
+
+ /* Do not need to verify session setups with signature "BSRSPYL " */
+ if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0)
+ cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n",
+ smb2_pdu->Command);
+
+ /*
+	 * Save off the original signature so we can modify the smb and check
+ * our calculated signature against what the server sent.
+ */
+ memcpy(server_response_sig, smb2_pdu->Signature, SMB2_SIGNATURE_SIZE);
+
+ memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE);
+
+ mutex_lock(&server->srv_mutex);
+ rc = server->ops->calc_signature(rqst, server);
+ mutex_unlock(&server->srv_mutex);
+
+ if (rc)
+ return rc;
+
+ if (memcmp(server_response_sig, smb2_pdu->Signature,
+ SMB2_SIGNATURE_SIZE))
+ return -EACCES;
+ else
+ return 0;
+}
+
+/*
+ * Set message id for the request. Should be called after wait_for_free_request
+ * and when srv_mutex is held.
+ */
+static inline void
+smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr)
+{
+ hdr->MessageId = get_next_mid64(server);
+}
+
+static struct mid_q_entry *
+smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
+ struct TCP_Server_Info *server)
+{
+ struct mid_q_entry *temp;
+
+ if (server == NULL) {
+ cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
+ return NULL;
+ }
+
+ temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
+ if (temp == NULL)
+ return temp;
+ else {
+ memset(temp, 0, sizeof(struct mid_q_entry));
+ temp->mid = smb_buffer->MessageId; /* always LE */
+ temp->pid = current->pid;
+ temp->command = smb_buffer->Command; /* Always LE */
+ temp->when_alloc = jiffies;
+ temp->server = server;
+
+ /*
+ * The default is for the mid to be synchronous, so the
+ * default callback just wakes up the current task.
+ */
+ temp->callback = cifs_wake_up_task;
+ temp->callback_data = current;
+ }
+
+ atomic_inc(&midCount);
+ temp->mid_state = MID_REQUEST_ALLOCATED;
+ return temp;
+}
+
+static int
+smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf,
+ struct mid_q_entry **mid)
+{
+ if (ses->server->tcpStatus == CifsExiting)
+ return -ENOENT;
+
+ if (ses->server->tcpStatus == CifsNeedReconnect) {
+ cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
+ return -EAGAIN;
+ }
+
+ if (ses->status == CifsNew) {
+ if ((buf->Command != SMB2_SESSION_SETUP) &&
+ (buf->Command != SMB2_NEGOTIATE))
+ return -EAGAIN;
+ /* else ok - we are setting up session */
+ }
+
+ if (ses->status == CifsExiting) {
+ if (buf->Command != SMB2_LOGOFF)
+ return -EAGAIN;
+ /* else ok - we are shutting down the session */
+ }
+
+ *mid = smb2_mid_entry_alloc(buf, ses->server);
+ if (*mid == NULL)
+ return -ENOMEM;
+ spin_lock(&GlobalMid_Lock);
+ list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q);
+ spin_unlock(&GlobalMid_Lock);
+ return 0;
+}
+
+int
+smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+ bool log_error)
+{
+ unsigned int len = get_rfc1002_length(mid->resp_buf);
+ struct kvec iov;
+ struct smb_rqst rqst = { .rq_iov = &iov,
+ .rq_nvec = 1 };
+
+ iov.iov_base = (char *)mid->resp_buf;
+ iov.iov_len = get_rfc1002_length(mid->resp_buf) + 4;
+
+ dump_smb(mid->resp_buf, min_t(u32, 80, len));
+ /* convert the length into a more usable form */
+ if (len > 24 && server->sign) {
+ int rc;
+
+ rc = smb2_verify_signature(&rqst, server);
+ if (rc)
+ cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+ rc);
+ }
+
+ return map_smb2_to_linux_error(mid->resp_buf, log_error);
+}
+
+struct mid_q_entry *
+smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
+{
+ int rc;
+ struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
+ struct mid_q_entry *mid;
+
+ smb2_seq_num_into_buf(ses->server, hdr);
+
+ rc = smb2_get_mid_entry(ses, hdr, &mid);
+ if (rc)
+ return ERR_PTR(rc);
+ rc = smb2_sign_rqst(rqst, ses->server);
+ if (rc) {
+ cifs_delete_mid(mid);
+ return ERR_PTR(rc);
+ }
+ return mid;
+}
+
+struct mid_q_entry *
+smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+ int rc;
+ struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
+ struct mid_q_entry *mid;
+
+ smb2_seq_num_into_buf(server, hdr);
+
+ mid = smb2_mid_entry_alloc(hdr, server);
+ if (mid == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ rc = smb2_sign_rqst(rqst, server);
+ if (rc) {
+ DeleteMidQEntry(mid);
+ return ERR_PTR(rc);
+ }
+
+ return mid;
+}
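
The generate_smb3signingkey() routine added above is a one-pass, SP800-108-style counter-mode KDF: a single HMAC-SHA256 over i || "SMB2AESCMAC\0" || 0x00 || "SmbSign\0" || L, truncated to the signing key size, with the session key as the HMAC key. Below is a minimal userspace sketch of the same derivation, for illustration only and not part of the patch; OpenSSL's HMAC() merely stands in for the kernel shash calls, and the function name is made up.

#include <string.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>

/* Derive the 16-byte SMB3 signing key from the 16-byte session key
 * (sketch of what generate_smb3signingkey() feeds into HMAC-SHA256). */
static void sketch_smb3_kdf(const unsigned char *session_key,
			    unsigned char *signing_key)
{
	unsigned char msg[4 + 12 + 1 + 8 + 4];
	unsigned char hash[32];
	unsigned int hlen = sizeof(hash);
	size_t off = 0;

	memcpy(msg + off, "\x00\x00\x00\x01", 4); off += 4;	/* i = 1 */
	memcpy(msg + off, "SMB2AESCMAC", 12); off += 12;	/* label + NUL */
	msg[off++] = 0x00;					/* separator */
	memcpy(msg + off, "SmbSign", 8); off += 8;		/* context + NUL */
	memcpy(msg + off, "\x00\x00\x00\x80", 4); off += 4;	/* L = 128 bits */

	HMAC(EVP_sha256(), session_key, 16, msg, off, hash, &hlen);
	memcpy(signing_key, hash, 16);				/* truncate */
}

The derived key is then installed as the cmac(aes) key in smb3_calc_signature(), whereas SMB2 signs with hmac(sha256) keyed directly by the session key.
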
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
deleted file mode 100644
index 04721485925..00000000000
--- a/fs/cifs/smbdes.c
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- Unix SMB/Netbios implementation.
- Version 1.9.
-
- a partial implementation of DES designed for use in the
- SMB authentication protocol
-
- Copyright (C) Andrew Tridgell 1998
- Modified by Steve French (sfrench@us.ibm.com) 2002,2004
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
-/* NOTES:
-
- This code makes no attempt to be fast! In fact, it is a very
- slow implementation
-
- This code is NOT a complete DES implementation. It implements only
- the minimum necessary for SMB authentication, as used by all SMB
- products (including every copy of Microsoft Windows95 ever sold)
-
- In particular, it can only do a unchained forward DES pass. This
- means it is not possible to use this code for encryption/decryption
- of data, instead it is only useful as a "hash" algorithm.
-
- There is no entry point into this code that allows normal DES operation.
-
- I believe this means that this code does not come under ITAR
- regulations but this is NOT a legal opinion. If you are concerned
- about the applicability of ITAR regulations to this code then you
- should confirm it for yourself (and maybe let me know if you come
- up with a different answer to the one above)
-*/
-#include <linux/slab.h>
-#define uchar unsigned char
-
-static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
- 1, 58, 50, 42, 34, 26, 18,
- 10, 2, 59, 51, 43, 35, 27,
- 19, 11, 3, 60, 52, 44, 36,
- 63, 55, 47, 39, 31, 23, 15,
- 7, 62, 54, 46, 38, 30, 22,
- 14, 6, 61, 53, 45, 37, 29,
- 21, 13, 5, 28, 20, 12, 4
-};
-
-static uchar perm2[48] = { 14, 17, 11, 24, 1, 5,
- 3, 28, 15, 6, 21, 10,
- 23, 19, 12, 4, 26, 8,
- 16, 7, 27, 20, 13, 2,
- 41, 52, 31, 37, 47, 55,
- 30, 40, 51, 45, 33, 48,
- 44, 49, 39, 56, 34, 53,
- 46, 42, 50, 36, 29, 32
-};
-
-static uchar perm3[64] = { 58, 50, 42, 34, 26, 18, 10, 2,
- 60, 52, 44, 36, 28, 20, 12, 4,
- 62, 54, 46, 38, 30, 22, 14, 6,
- 64, 56, 48, 40, 32, 24, 16, 8,
- 57, 49, 41, 33, 25, 17, 9, 1,
- 59, 51, 43, 35, 27, 19, 11, 3,
- 61, 53, 45, 37, 29, 21, 13, 5,
- 63, 55, 47, 39, 31, 23, 15, 7
-};
-
-static uchar perm4[48] = { 32, 1, 2, 3, 4, 5,
- 4, 5, 6, 7, 8, 9,
- 8, 9, 10, 11, 12, 13,
- 12, 13, 14, 15, 16, 17,
- 16, 17, 18, 19, 20, 21,
- 20, 21, 22, 23, 24, 25,
- 24, 25, 26, 27, 28, 29,
- 28, 29, 30, 31, 32, 1
-};
-
-static uchar perm5[32] = { 16, 7, 20, 21,
- 29, 12, 28, 17,
- 1, 15, 23, 26,
- 5, 18, 31, 10,
- 2, 8, 24, 14,
- 32, 27, 3, 9,
- 19, 13, 30, 6,
- 22, 11, 4, 25
-};
-
-static uchar perm6[64] = { 40, 8, 48, 16, 56, 24, 64, 32,
- 39, 7, 47, 15, 55, 23, 63, 31,
- 38, 6, 46, 14, 54, 22, 62, 30,
- 37, 5, 45, 13, 53, 21, 61, 29,
- 36, 4, 44, 12, 52, 20, 60, 28,
- 35, 3, 43, 11, 51, 19, 59, 27,
- 34, 2, 42, 10, 50, 18, 58, 26,
- 33, 1, 41, 9, 49, 17, 57, 25
-};
-
-static uchar sc[16] = { 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
-
-static uchar sbox[8][4][16] = {
- {{14, 4, 13, 1, 2, 15, 11, 8, 3, 10, 6, 12, 5, 9, 0, 7},
- {0, 15, 7, 4, 14, 2, 13, 1, 10, 6, 12, 11, 9, 5, 3, 8},
- {4, 1, 14, 8, 13, 6, 2, 11, 15, 12, 9, 7, 3, 10, 5, 0},
- {15, 12, 8, 2, 4, 9, 1, 7, 5, 11, 3, 14, 10, 0, 6, 13} },
-
- {{15, 1, 8, 14, 6, 11, 3, 4, 9, 7, 2, 13, 12, 0, 5, 10},
- {3, 13, 4, 7, 15, 2, 8, 14, 12, 0, 1, 10, 6, 9, 11, 5},
- {0, 14, 7, 11, 10, 4, 13, 1, 5, 8, 12, 6, 9, 3, 2, 15},
- {13, 8, 10, 1, 3, 15, 4, 2, 11, 6, 7, 12, 0, 5, 14, 9} },
-
- {{10, 0, 9, 14, 6, 3, 15, 5, 1, 13, 12, 7, 11, 4, 2, 8},
- {13, 7, 0, 9, 3, 4, 6, 10, 2, 8, 5, 14, 12, 11, 15, 1},
- {13, 6, 4, 9, 8, 15, 3, 0, 11, 1, 2, 12, 5, 10, 14, 7},
- {1, 10, 13, 0, 6, 9, 8, 7, 4, 15, 14, 3, 11, 5, 2, 12} },
-
- {{7, 13, 14, 3, 0, 6, 9, 10, 1, 2, 8, 5, 11, 12, 4, 15},
- {13, 8, 11, 5, 6, 15, 0, 3, 4, 7, 2, 12, 1, 10, 14, 9},
- {10, 6, 9, 0, 12, 11, 7, 13, 15, 1, 3, 14, 5, 2, 8, 4},
- {3, 15, 0, 6, 10, 1, 13, 8, 9, 4, 5, 11, 12, 7, 2, 14} },
-
- {{2, 12, 4, 1, 7, 10, 11, 6, 8, 5, 3, 15, 13, 0, 14, 9},
- {14, 11, 2, 12, 4, 7, 13, 1, 5, 0, 15, 10, 3, 9, 8, 6},
- {4, 2, 1, 11, 10, 13, 7, 8, 15, 9, 12, 5, 6, 3, 0, 14},
- {11, 8, 12, 7, 1, 14, 2, 13, 6, 15, 0, 9, 10, 4, 5, 3} },
-
- {{12, 1, 10, 15, 9, 2, 6, 8, 0, 13, 3, 4, 14, 7, 5, 11},
- {10, 15, 4, 2, 7, 12, 9, 5, 6, 1, 13, 14, 0, 11, 3, 8},
- {9, 14, 15, 5, 2, 8, 12, 3, 7, 0, 4, 10, 1, 13, 11, 6},
- {4, 3, 2, 12, 9, 5, 15, 10, 11, 14, 1, 7, 6, 0, 8, 13} },
-
- {{4, 11, 2, 14, 15, 0, 8, 13, 3, 12, 9, 7, 5, 10, 6, 1},
- {13, 0, 11, 7, 4, 9, 1, 10, 14, 3, 5, 12, 2, 15, 8, 6},
- {1, 4, 11, 13, 12, 3, 7, 14, 10, 15, 6, 8, 0, 5, 9, 2},
- {6, 11, 13, 8, 1, 4, 10, 7, 9, 5, 0, 15, 14, 2, 3, 12} },
-
- {{13, 2, 8, 4, 6, 15, 11, 1, 10, 9, 3, 14, 5, 0, 12, 7},
- {1, 15, 13, 8, 10, 3, 7, 4, 12, 5, 6, 11, 0, 14, 9, 2},
- {7, 11, 4, 1, 9, 12, 14, 2, 0, 6, 10, 13, 15, 3, 5, 8},
- {2, 1, 14, 7, 4, 10, 8, 13, 15, 12, 9, 0, 3, 5, 6, 11} }
-};
-
-static void
-permute(char *out, char *in, uchar *p, int n)
-{
- int i;
- for (i = 0; i < n; i++)
- out[i] = in[p[i] - 1];
-}
-
-static void
-lshift(char *d, int count, int n)
-{
- char out[64];
- int i;
- for (i = 0; i < n; i++)
- out[i] = d[(i + count) % n];
- for (i = 0; i < n; i++)
- d[i] = out[i];
-}
-
-static void
-concat(char *out, char *in1, char *in2, int l1, int l2)
-{
- while (l1--)
- *out++ = *in1++;
- while (l2--)
- *out++ = *in2++;
-}
-
-static void
-xor(char *out, char *in1, char *in2, int n)
-{
- int i;
- for (i = 0; i < n; i++)
- out[i] = in1[i] ^ in2[i];
-}
-
-static void
-dohash(char *out, char *in, char *key, int forw)
-{
- int i, j, k;
- char *pk1;
- char c[28];
- char d[28];
- char *cd;
- char (*ki)[48];
- char *pd1;
- char l[32], r[32];
- char *rl;
-
- /* Have to reduce stack usage */
- pk1 = kmalloc(56+56+64+64, GFP_KERNEL);
- if (pk1 == NULL)
- return;
-
- ki = kmalloc(16*48, GFP_KERNEL);
- if (ki == NULL) {
- kfree(pk1);
- return;
- }
-
- cd = pk1 + 56;
- pd1 = cd + 56;
- rl = pd1 + 64;
-
- permute(pk1, key, perm1, 56);
-
- for (i = 0; i < 28; i++)
- c[i] = pk1[i];
- for (i = 0; i < 28; i++)
- d[i] = pk1[i + 28];
-
- for (i = 0; i < 16; i++) {
- lshift(c, sc[i], 28);
- lshift(d, sc[i], 28);
-
- concat(cd, c, d, 28, 28);
- permute(ki[i], cd, perm2, 48);
- }
-
- permute(pd1, in, perm3, 64);
-
- for (j = 0; j < 32; j++) {
- l[j] = pd1[j];
- r[j] = pd1[j + 32];
- }
-
- for (i = 0; i < 16; i++) {
- char *er; /* er[48] */
- char *erk; /* erk[48] */
- char b[8][6];
- char *cb; /* cb[32] */
- char *pcb; /* pcb[32] */
- char *r2; /* r2[32] */
-
- er = kmalloc(48+48+32+32+32, GFP_KERNEL);
- if (er == NULL) {
- kfree(pk1);
- kfree(ki);
- return;
- }
- erk = er+48;
- cb = erk+48;
- pcb = cb+32;
- r2 = pcb+32;
-
- permute(er, r, perm4, 48);
-
- xor(erk, er, ki[forw ? i : 15 - i], 48);
-
- for (j = 0; j < 8; j++)
- for (k = 0; k < 6; k++)
- b[j][k] = erk[j * 6 + k];
-
- for (j = 0; j < 8; j++) {
- int m, n;
- m = (b[j][0] << 1) | b[j][5];
-
- n = (b[j][1] << 3) | (b[j][2] << 2) | (b[j][3] <<
- 1) | b[j][4];
-
- for (k = 0; k < 4; k++)
- b[j][k] =
- (sbox[j][m][n] & (1 << (3 - k))) ? 1 : 0;
- }
-
- for (j = 0; j < 8; j++)
- for (k = 0; k < 4; k++)
- cb[j * 4 + k] = b[j][k];
- permute(pcb, cb, perm5, 32);
-
- xor(r2, l, pcb, 32);
-
- for (j = 0; j < 32; j++)
- l[j] = r[j];
-
- for (j = 0; j < 32; j++)
- r[j] = r2[j];
-
- kfree(er);
- }
-
- concat(rl, r, l, 32, 32);
-
- permute(out, rl, perm6, 64);
- kfree(pk1);
- kfree(ki);
-}
-
-static void
-str_to_key(unsigned char *str, unsigned char *key)
-{
- int i;
-
- key[0] = str[0] >> 1;
- key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
- key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
- key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
- key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
- key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
- key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
- key[7] = str[6] & 0x7F;
- for (i = 0; i < 8; i++)
- key[i] = (key[i] << 1);
-}
-
-static void
-smbhash(unsigned char *out, const unsigned char *in, unsigned char *key,
- int forw)
-{
- int i;
- char *outb; /* outb[64] */
- char *inb; /* inb[64] */
- char *keyb; /* keyb[64] */
- unsigned char key2[8];
-
- outb = kmalloc(64 * 3, GFP_KERNEL);
- if (outb == NULL)
- return;
-
- inb = outb + 64;
- keyb = inb + 64;
-
- str_to_key(key, key2);
-
- for (i = 0; i < 64; i++) {
- inb[i] = (in[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
- keyb[i] = (key2[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
- outb[i] = 0;
- }
-
- dohash(outb, inb, keyb, forw);
-
- for (i = 0; i < 8; i++)
- out[i] = 0;
-
- for (i = 0; i < 64; i++) {
- if (outb[i])
- out[i / 8] |= (1 << (7 - (i % 8)));
- }
- kfree(outb);
-}
-
-void
-E_P16(unsigned char *p14, unsigned char *p16)
-{
- unsigned char sp8[8] =
- { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
- smbhash(p16, sp8, p14, 1);
- smbhash(p16 + 8, sp8, p14 + 7, 1);
-}
-
-void
-E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
-{
- smbhash(p24, c8, p21, 1);
- smbhash(p24 + 8, c8, p21 + 7, 1);
- smbhash(p24 + 16, c8, p21 + 14, 1);
-}
-
-#if 0 /* currently unused */
-static void
-D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
-{
- smbhash(out, in, p14, 0);
- smbhash(out + 8, in + 8, p14 + 7, 0);
-}
-
-static void
-E_old_pw_hash(unsigned char *p14, unsigned char *in, unsigned char *out)
-{
- smbhash(out, in, p14, 1);
- smbhash(out + 8, in + 8, p14 + 7, 1);
-}
-/* these routines are currently unneeded, but may be
- needed later */
-void
-cred_hash1(unsigned char *out, unsigned char *in, unsigned char *key)
-{
- unsigned char buf[8];
-
- smbhash(buf, in, key, 1);
- smbhash(out, buf, key + 9, 1);
-}
-
-void
-cred_hash2(unsigned char *out, unsigned char *in, unsigned char *key)
-{
- unsigned char buf[8];
- static unsigned char key2[8];
-
- smbhash(buf, in, key, 1);
- key2[0] = key[7];
- smbhash(out, buf, key2, 1);
-}
-
-void
-cred_hash3(unsigned char *out, unsigned char *in, unsigned char *key, int forw)
-{
- static unsigned char key2[8];
-
- smbhash(out, in, key, forw);
- key2[0] = key[7];
- smbhash(out + 8, in + 8, key2, forw);
-}
-#endif /* unneeded routines */
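
The hand-rolled DES above goes away because its only remaining users, the E_P16/E_P24 LM-style hashes, are rebuilt in smbencrypt.c on top of the kernel crypto API's ecb(des) (next diff). The 7-byte-to-8-byte key expansion is kept verbatim as str_to_key(); the sketch below is a standalone userspace copy of that expansion, for illustration only (the input bytes are just an example).

#include <stdio.h>

/* Spread 56 key bits across 8 bytes, shifting each byte left so the
 * low bit is left clear for the DES parity position (same logic as
 * str_to_key() in smbencrypt.c). */
static void expand_des_key(const unsigned char *str, unsigned char *key)
{
	int i;

	key[0] = str[0] >> 1;
	key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
	key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
	key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
	key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
	key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
	key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
	key[7] = str[6] & 0x7F;
	for (i = 0; i < 8; i++)
		key[i] <<= 1;
}

int main(void)
{
	unsigned char in[7] = { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24 };
	unsigned char out[8];
	int i;

	expand_des_key(in, out);
	for (i = 0; i < 8; i++)
		printf("%02x ", out[i]);
	printf("\n");
	return 0;
}
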
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index b5041c84998..43eb1367b10 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -47,6 +47,86 @@
#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
+static void
+str_to_key(unsigned char *str, unsigned char *key)
+{
+ int i;
+
+ key[0] = str[0] >> 1;
+ key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
+ key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
+ key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
+ key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
+ key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
+ key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
+ key[7] = str[6] & 0x7F;
+ for (i = 0; i < 8; i++)
+ key[i] = (key[i] << 1);
+}
+
+static int
+smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
+{
+ int rc;
+ unsigned char key2[8];
+ struct crypto_blkcipher *tfm_des;
+ struct scatterlist sgin, sgout;
+ struct blkcipher_desc desc;
+
+ str_to_key(key, key2);
+
+ tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm_des)) {
+ rc = PTR_ERR(tfm_des);
+ cifs_dbg(VFS, "could not allocate des crypto API\n");
+ goto smbhash_err;
+ }
+
+ desc.tfm = tfm_des;
+
+ crypto_blkcipher_setkey(tfm_des, key2, 8);
+
+ sg_init_one(&sgin, in, 8);
+ sg_init_one(&sgout, out, 8);
+
+ rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
+ if (rc)
+ cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc);
+
+ crypto_free_blkcipher(tfm_des);
+smbhash_err:
+ return rc;
+}
+
+static int
+E_P16(unsigned char *p14, unsigned char *p16)
+{
+ int rc;
+ unsigned char sp8[8] =
+ { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
+
+ rc = smbhash(p16, sp8, p14);
+ if (rc)
+ return rc;
+ rc = smbhash(p16 + 8, sp8, p14 + 7);
+ return rc;
+}
+
+static int
+E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
+{
+ int rc;
+
+ rc = smbhash(p24, c8, p21);
+ if (rc)
+ return rc;
+ rc = smbhash(p24 + 8, c8, p21 + 7);
+ if (rc)
+ return rc;
+ rc = smbhash(p24 + 16, c8, p21 + 14);
+ return rc;
+}
+
/* produce a md4 message digest from data of length n bytes */
int
mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
@@ -59,14 +139,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
md4 = crypto_alloc_shash("md4", 0, 0);
if (IS_ERR(md4)) {
rc = PTR_ERR(md4);
- cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
+ cifs_dbg(VFS, "%s: Crypto md4 allocation error %d\n",
+ __func__, rc);
return rc;
}
size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
sdescmd4 = kmalloc(size, GFP_KERNEL);
if (!sdescmd4) {
rc = -ENOMEM;
- cERROR(1, "%s: Memory allocation failure\n", __func__);
goto mdfour_err;
}
sdescmd4->shash.tfm = md4;
@@ -74,11 +154,17 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
rc = crypto_shash_init(&sdescmd4->shash);
if (rc) {
- cERROR(1, "%s: Could not init md4 shash\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__);
+ goto mdfour_err;
+ }
+ rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
goto mdfour_err;
}
- crypto_shash_update(&sdescmd4->shash, link_str, link_len);
rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
+ if (rc)
+ cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__);
mdfour_err:
crypto_free_shash(md4);
@@ -87,74 +173,30 @@ mdfour_err:
return rc;
}
-/* Does the des encryption from the NT or LM MD4 hash. */
-static void
-SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
- unsigned char p24[24])
-{
- unsigned char p21[21];
-
- memset(p21, '\0', 21);
-
- memcpy(p21, passwd, 16);
- E_P24(p21, c8, p24);
-}
-
/*
This implements the X/Open SMB password encryption
It takes a password, a 8 byte "crypt key" and puts 24 bytes of
encrypted password into p24 */
/* Note that password must be uppercased and null terminated */
-void
+int
SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
{
- unsigned char p14[15], p21[21];
+ int rc;
+ unsigned char p14[14], p16[16], p21[21];
- memset(p21, '\0', 21);
memset(p14, '\0', 14);
- strncpy((char *) p14, (char *) passwd, 14);
-
-/* strupper((char *)p14); *//* BB at least uppercase the easy range */
- E_P16(p14, p21);
-
- SMBOWFencrypt(p21, c8, p24);
-
- memset(p14, 0, 15);
- memset(p21, 0, 21);
-}
+ memset(p16, '\0', 16);
+ memset(p21, '\0', 21);
-/* Routines for Windows NT MD4 Hash functions. */
-static int
-_my_wcslen(__u16 *str)
-{
- int len = 0;
- while (*str++ != 0)
- len++;
- return len;
-}
+ memcpy(p14, passwd, 14);
+ rc = E_P16(p14, p16);
+ if (rc)
+ return rc;
-/*
- * Convert a string into an NT UNICODE string.
- * Note that regardless of processor type
- * this must be in intel (little-endian)
- * format.
- */
+ memcpy(p21, p16, 16);
+ rc = E_P24(p21, c8, p24);
-static int
-_my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
-{ /* BB not a very good conversion routine - change/fix */
- int i;
- __u16 val;
-
- for (i = 0; i < len; i++) {
- val = *src;
- SSVAL(dst, 0, val);
- dst++;
- src++;
- if (val == 0)
- break;
- }
- return i;
+ return rc;
}
/*
@@ -162,168 +204,45 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
*/
int
-E_md4hash(const unsigned char *passwd, unsigned char *p16)
+E_md4hash(const unsigned char *passwd, unsigned char *p16,
+ const struct nls_table *codepage)
{
int rc;
int len;
- __u16 wpwd[129];
+ __le16 wpwd[129];
/* Password cannot be longer than 128 characters */
- if (passwd) {
- len = strlen((char *) passwd);
- if (len > 128)
- len = 128;
-
- /* Password must be converted to NT unicode */
- _my_mbstowcs(wpwd, passwd, len);
- } else
+ if (passwd) /* Password must be converted to NT unicode */
+ len = cifs_strtoUTF16(wpwd, passwd, 128, codepage);
+ else {
len = 0;
+ *wpwd = 0; /* Ensure string is null terminated */
+ }
- wpwd[len] = 0; /* Ensure string is null terminated */
- /* Calculate length in bytes */
- len = _my_wcslen(wpwd) * sizeof(__u16);
-
- rc = mdfour(p16, (unsigned char *) wpwd, len);
- memset(wpwd, 0, 129 * 2);
+ rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
+ memset(wpwd, 0, 129 * sizeof(__le16));
return rc;
}
-#if 0 /* currently unused */
-/* Does both the NT and LM owfs of a user's password */
-static void
-nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
-{
- char passwd[514];
-
- memset(passwd, '\0', 514);
- if (strlen(pwd) < 513)
- strcpy(passwd, pwd);
- else
- memcpy(passwd, pwd, 512);
- /* Calculate the MD4 hash (NT compatible) of the password */
- memset(nt_p16, '\0', 16);
- E_md4hash(passwd, nt_p16);
-
- /* Mangle the passwords into Lanman format */
- passwd[14] = '\0';
-/* strupper(passwd); */
-
- /* Calculate the SMB (lanman) hash functions of the password */
-
- memset(p16, '\0', 16);
- E_P16((unsigned char *) passwd, (unsigned char *) p16);
-
- /* clear out local copy of user's password (just being paranoid). */
- memset(passwd, '\0', sizeof(passwd));
-}
-#endif
-
-/* Does the NTLMv2 owfs of a user's password */
-#if 0 /* function not needed yet - but will be soon */
-static void
-ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
- const char *domain_n, unsigned char kr_buf[16],
- const struct nls_table *nls_codepage)
-{
- wchar_t *user_u;
- wchar_t *dom_u;
- int user_l, domain_l;
- struct HMACMD5Context ctx;
-
- /* might as well do one alloc to hold both (user_u and dom_u) */
- user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL);
- if (user_u == NULL)
- return;
- dom_u = user_u + 1024;
-
- /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2,
- STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER);
- push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2,
- STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */
-
- /* BB user and domain may need to be uppercased */
- user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage);
- domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage);
-
- user_l++; /* trailing null */
- domain_l++;
-
- hmac_md5_init_limK_to_64(owf, 16, &ctx);
- hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx);
- hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx);
- hmac_md5_final(kr_buf, &ctx);
-
- kfree(user_u);
-}
-#endif
-
-/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
-#if 0 /* currently unused */
-static void
-NTLMSSPOWFencrypt(unsigned char passwd[8],
- unsigned char *ntlmchalresp, unsigned char p24[24])
-{
- unsigned char p21[21];
-
- memset(p21, '\0', 21);
- memcpy(p21, passwd, 8);
- memset(p21 + 8, 0xbd, 8);
-
- E_P24(p21, ntlmchalresp, p24);
-}
-#endif
-
/* Does the NT MD4 hash then des encryption. */
int
-SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
+SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
+ const struct nls_table *codepage)
{
int rc;
- unsigned char p21[21];
+ unsigned char p16[16], p21[21];
+ memset(p16, '\0', 16);
memset(p21, '\0', 21);
- rc = E_md4hash(passwd, p21);
+ rc = E_md4hash(passwd, p16, codepage);
if (rc) {
- cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+ cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
+ __func__, rc);
return rc;
}
- SMBOWFencrypt(p21, c8, p24);
+ memcpy(p21, p16, 16);
+ rc = E_P24(p21, c8, p24);
return rc;
}
-
-
-/* Does the md5 encryption from the NT hash for NTLMv2. */
-/* These routines will be needed later */
-#if 0
-static void
-SMBOWFencrypt_ntv2(const unsigned char kr[16],
- const struct data_blob *srv_chal,
- const struct data_blob *cli_chal, unsigned char resp_buf[16])
-{
- struct HMACMD5Context ctx;
-
- hmac_md5_init_limK_to_64(kr, 16, &ctx);
- hmac_md5_update(srv_chal->data, srv_chal->length, &ctx);
- hmac_md5_update(cli_chal->data, cli_chal->length, &ctx);
- hmac_md5_final(resp_buf, &ctx);
-}
-
-static void
-SMBsesskeygen_ntv2(const unsigned char kr[16],
- const unsigned char *nt_resp, __u8 sess_key[16])
-{
- struct HMACMD5Context ctx;
-
- hmac_md5_init_limK_to_64(kr, 16, &ctx);
- hmac_md5_update(nt_resp, 16, &ctx);
- hmac_md5_final((unsigned char *) sess_key, &ctx);
-}
-
-static void
-SMBsesskeygen_ntv1(const unsigned char kr[16],
- const unsigned char *nt_resp, __u8 sess_key[16])
-{
- mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16);
-}
-#endif
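
E_md4hash() now converts the password with cifs_strtoUTF16() instead of the removed _my_mbstowcs() helper, and mdfour() hashes the resulting little-endian UTF-16 buffer (length in bytes, not characters). The userspace sketch below shows what that buffer looks like for a plain ASCII password; it is illustration only and the password value is made up.

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *passwd = "Password1";	/* example value only */
	unsigned char wpwd[256];
	size_t i, len = strlen(passwd);

	/* ASCII -> UTF-16LE: each byte followed by 0x00 */
	for (i = 0; i < len; i++) {
		wpwd[2 * i] = (unsigned char)passwd[i];
		wpwd[2 * i + 1] = 0x00;
	}

	/* mdfour(nt_hash, wpwd, 2 * len) would now yield the NT hash */
	for (i = 0; i < 2 * len; i++)
		printf("%02x", wpwd[i]);
	printf("\n");
	return 0;
}
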
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 7056b891e08..0e538b5c962 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -1,7 +1,7 @@
/*
* fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
*
- * Copyright (c) International Business Machines Corp., 2002,2009
+ * Copyright (c) International Business Machines Corp., 2002,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -22,7 +22,7 @@
/* IOCTL information */
/*
* List of ioctl/fsctl function codes that are or could be useful in the
- * future to remote clients like cifs or SMB2 client. There is probably
+ * future to remote clients like cifs or SMB2/SMB3 client. This is probably
* a slightly larger set of fsctls that NTFS local filesystem could handle,
* including the seven below that we do not have struct definitions for.
* Even with protocol definitions for most of these now available, we still
@@ -30,7 +30,13 @@
* remotely. Some of the following, such as the encryption/compression ones
* could be invoked from tools via a specialized hook into the VFS rather
* than via the standard vfs entry points
+ *
+ * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are
+ * below). Additional detail on less common ones can be found in MS-FSCC
+ * section 2.3.
*/
+#define FSCTL_DFS_GET_REFERRALS 0x00060194
+#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0
#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
@@ -71,14 +77,45 @@
#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
+#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
#define FSCTL_SIS_LINK_FILES 0x0009C104
#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
/* strange that the number for this op is not sequential with previous op */
#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
+/* Enumerate previous versions of a file */
+#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
+/* Retrieve an opaque file reference for server-side data movement ie copy */
+#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
+/* Perform server-side data movement */
+#define FSCTL_SRV_COPYCHUNK 0x001440F2
+#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
+#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
+#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */
+/* See FSCC 2.1.2.5 */
#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
#define IO_REPARSE_TAG_HSM 0xC0000004
#define IO_REPARSE_TAG_SIS 0x80000007
+#define IO_REPARSE_TAG_HSM2 0x80000006
+#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005
+/* Used by the DFS filter. See MS-DFSC */
+#define IO_REPARSE_TAG_DFS 0x8000000A
+/* Used by the DFS filter See MS-DFSC */
+#define IO_REPARSE_TAG_DFSR 0x80000012
+#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B
+/* See section MS-FSCC 2.1.2.4 */
+#define IO_REPARSE_TAG_SYMLINK 0xA000000C
+#define IO_REPARSE_TAG_DEDUP 0x80000013
+#define IO_REPARSE_APPXSTREAM 0xC0000014
+/* NFS symlinks, Win 8/SMB3 and later */
+#define IO_REPARSE_TAG_NFS 0x80000014
+
+/* fsctl flags */
+/* If Flags is set to this value, the request is an FSCTL not ioctl request */
+#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
+
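
The new FSCTL values follow the standard Windows CTL_CODE packing, DeviceType << 16 | Access << 14 | Function << 2 | Method, which is why FSCTL_SRV_COPYCHUNK and FSCTL_SRV_COPYCHUNK_WRITE differ only in the access bits (read vs. write). A throwaway decoder, for illustration only:

#include <stdio.h>

static void decode_fsctl(unsigned int code)
{
	printf("0x%08x: device=0x%04x access=%u function=0x%03x method=%u\n",
	       code, code >> 16, (code >> 14) & 0x3,
	       (code >> 2) & 0xfff, code & 0x3);
}

int main(void)
{
	decode_fsctl(0x001440F2);	/* FSCTL_SRV_COPYCHUNK: read access */
	decode_fsctl(0x001480F2);	/* FSCTL_SRV_COPYCHUNK_WRITE: write access */
	return 0;
}
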
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 46d8756f2b2..18cd5650a5f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -26,6 +26,9 @@
#include <linux/wait.h>
#include <linux/net.h>
#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/tcp.h>
+#include <linux/highmem.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <linux/mempool.h>
@@ -34,10 +37,8 @@
#include "cifsproto.h"
#include "cifs_debug.h"
-extern mempool_t *cifs_mid_poolp;
-
-static void
-wake_up_task(struct mid_q_entry *mid)
+void
+cifs_wake_up_task(struct mid_q_entry *mid)
{
wake_up_process(mid->callback_data);
}
@@ -48,7 +49,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
struct mid_q_entry *temp;
if (server == NULL) {
- cERROR(1, "Null TCP session in AllocMidQEntry");
+ cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n");
return NULL;
}
@@ -57,24 +58,25 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
return temp;
else {
memset(temp, 0, sizeof(struct mid_q_entry));
- temp->mid = smb_buffer->Mid; /* always LE */
+ temp->mid = get_mid(smb_buffer);
temp->pid = current->pid;
- temp->command = smb_buffer->Command;
- cFYI(1, "For smb_command %d", temp->command);
+ temp->command = cpu_to_le16(smb_buffer->Command);
+ cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
/* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
/* when mid allocated can be before when sent */
temp->when_alloc = jiffies;
+ temp->server = server;
/*
* The default is for the mid to be synchronous, so the
* default callback just wakes up the current task.
*/
- temp->callback = wake_up_task;
+ temp->callback = cifs_wake_up_task;
temp->callback_data = current;
}
atomic_inc(&midCount);
- temp->midState = MID_REQUEST_ALLOCATED;
+ temp->mid_state = MID_REQUEST_ALLOCATED;
return temp;
}
@@ -82,11 +84,12 @@ void
DeleteMidQEntry(struct mid_q_entry *midEntry)
{
#ifdef CONFIG_CIFS_STATS2
+ __le16 command = midEntry->server->vals->lock_cmd;
unsigned long now;
#endif
- midEntry->midState = MID_FREE;
+ midEntry->mid_state = MID_FREE;
atomic_dec(&midCount);
- if (midEntry->largeBuf)
+ if (midEntry->large_buf)
cifs_buf_release(midEntry->resp_buf);
else
cifs_small_buf_release(midEntry->resp_buf);
@@ -95,9 +98,8 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
/* commands taking longer than one second are indications that
something is wrong, unless it is quite a slow link or server */
if ((now - midEntry->when_alloc) > HZ) {
- if ((cifsFYI & CIFS_TIMER) &&
- (midEntry->command != SMB_COM_LOCKING_ANDX)) {
- printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %d",
+ if ((cifsFYI & CIFS_TIMER) && (midEntry->command != command)) {
+ printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %llu",
midEntry->command, midEntry->mid);
printk(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
now - midEntry->when_alloc,
@@ -109,8 +111,8 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
mempool_free(midEntry, cifs_mid_poolp);
}
-static void
-delete_mid(struct mid_q_entry *mid)
+void
+cifs_delete_mid(struct mid_q_entry *mid)
{
spin_lock(&GlobalMid_Lock);
list_del(&mid->qhead);
@@ -119,21 +121,28 @@ delete_mid(struct mid_q_entry *mid)
DeleteMidQEntry(mid);
}
+/*
+ * smb_send_kvec - send an array of kvecs to the server
+ * @server: Server to send the data to
+ * @iov: Pointer to array of kvecs
+ * @n_vec: length of kvec array
+ * @sent: amount of data sent on socket is stored here
+ *
+ * Our basic "send data to server" function. Should be called with srv_mutex
+ * held. The caller is responsible for handling the results.
+ */
static int
-smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
+smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
+ size_t *sent)
{
int rc = 0;
int i = 0;
struct msghdr smb_msg;
- struct smb_hdr *smb_buffer = iov[0].iov_base;
- unsigned int len = iov[0].iov_len;
- unsigned int total_len;
- int first_vec = 0;
- unsigned int smb_buf_length = smb_buffer->smb_buf_length;
+ unsigned int remaining;
+ size_t first_vec = 0;
struct socket *ssocket = server->ssocket;
- if (ssocket == NULL)
- return -ENOTSOCK; /* BB eventually add reconnect code here */
+ *sent = 0;
smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
smb_msg.msg_namelen = sizeof(struct sockaddr);
@@ -144,71 +153,70 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
else
smb_msg.msg_flags = MSG_NOSIGNAL;
- /* smb header is converted in header_assemble. bcc and rest of SMB word
- area, and byte area if necessary, is converted to littleendian in
- cifssmb.c and RFC1001 len is converted to bigendian in smb_send
- Flags2 is converted in SendReceive */
-
-
- total_len = 0;
+ remaining = 0;
for (i = 0; i < n_vec; i++)
- total_len += iov[i].iov_len;
-
- smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
- cFYI(1, "Sending smb: total_len %d", total_len);
- dump_smb(smb_buffer, len);
+ remaining += iov[i].iov_len;
i = 0;
- while (total_len) {
+ while (remaining) {
+ /*
+ * If blocking send, we try 3 times, since each can block
+ * for 5 seconds. For nonblocking we have to try more
+ * but wait increasing amounts of time allowing time for
+ * socket to clear. The overall time we wait in either
+ * case to send on the socket is about 15 seconds.
+ * Similarly we wait for 15 seconds for a response from
+ * the server in SendReceive[2] for the server to send
+ * a response back for most types of requests (except
+ * SMB Write past end of file which can be slow, and
+ * blocking lock operations). NFS waits slightly longer
+ * than CIFS, but this can make it take longer for
+ * nonresponsive servers to be detected and 15 seconds
+ * is more than enough time for modern networks to
+ * send a packet. In most cases if we fail to send
+ * after the retries we will kill the socket and
+ * reconnect which may clear the network problem.
+ */
rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
- n_vec - first_vec, total_len);
- if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
+ n_vec - first_vec, remaining);
+ if (rc == -EAGAIN) {
i++;
- /* if blocking send we try 3 times, since each can block
- for 5 seconds. For nonblocking we have to try more
- but wait increasing amounts of time allowing time for
- socket to clear. The overall time we wait in either
- case to send on the socket is about 15 seconds.
- Similarly we wait for 15 seconds for
- a response from the server in SendReceive[2]
- for the server to send a response back for
- most types of requests (except SMB Write
- past end of file which can be slow, and
- blocking lock operations). NFS waits slightly longer
- than CIFS, but this can make it take longer for
- nonresponsive servers to be detected and 15 seconds
- is more than enough time for modern networks to
- send a packet. In most cases if we fail to send
- after the retries we will kill the socket and
- reconnect which may clear the network problem.
- */
- if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
- cERROR(1, "sends on sock %p stuck for 15 seconds",
- ssocket);
+ if (i >= 14 || (!server->noblocksnd && (i > 2))) {
+ cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
+ ssocket);
rc = -EAGAIN;
break;
}
msleep(1 << i);
continue;
}
+
if (rc < 0)
break;
- if (rc == total_len) {
- total_len = 0;
+ /* send was at least partially successful */
+ *sent += rc;
+
+ if (rc == remaining) {
+ remaining = 0;
break;
- } else if (rc > total_len) {
- cERROR(1, "sent %d requested %d", rc, total_len);
+ }
+
+ if (rc > remaining) {
+ cifs_dbg(VFS, "sent %d requested %d\n", rc, remaining);
break;
}
+
if (rc == 0) {
/* should never happen, letting socket clear before
retrying is our only obvious option here */
- cERROR(1, "tcp sent no data");
+ cifs_dbg(VFS, "tcp sent no data\n");
msleep(500);
continue;
}
- total_len -= rc;
+
+ remaining -= rc;
+
/* the line below resets i */
for (i = first_vec; i < n_vec; i++) {
if (iov[i].iov_len) {
@@ -223,31 +231,150 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
}
}
}
+
i = 0; /* in case we get ENOSPC on the next send */
+ rc = 0;
+ }
+ return rc;
+}
+
+/**
+ * cifs_rqst_page_to_kvec - Turn a slot in the smb_rqst page array into a kvec
+ * @rqst: pointer to smb_rqst
+ * @idx: index into the array of the page
+ * @iov: pointer to struct kvec that will hold the result
+ *
+ * Helper function to convert a slot in the rqst->rq_pages array into a kvec.
+ * The page will be kmapped and the address placed into iov_base. The length
+ * The page will be kmapped and the address placed into iov_base. The length
+ * is then set from rq_pagesz, or from rq_tailsz for the last page.
+void
+cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
+ struct kvec *iov)
+{
+ /*
+ * FIXME: We could avoid this kmap altogether if we used
+ * kernel_sendpage instead of kernel_sendmsg. That will only
+ * work if signing is disabled though as sendpage inlines the
+ * page directly into the fraglist. If userspace modifies the
+ * page after we calculate the signature, then the server will
+ * reject it and may break the connection. kernel_sendmsg does
+ * an extra copy of the data and avoids that issue.
+ */
+ iov->iov_base = kmap(rqst->rq_pages[idx]);
+
+ /* if last page, don't send beyond this offset into page */
+ if (idx == (rqst->rq_npages - 1))
+ iov->iov_len = rqst->rq_tailsz;
+ else
+ iov->iov_len = rqst->rq_pagesz;
+}
+
+static unsigned long
+rqst_len(struct smb_rqst *rqst)
+{
+ unsigned int i;
+ struct kvec *iov = rqst->rq_iov;
+ unsigned long buflen = 0;
+
+ /* total up iov array first */
+ for (i = 0; i < rqst->rq_nvec; i++)
+ buflen += iov[i].iov_len;
+
+ /* add in the page array if there is one */
+ if (rqst->rq_npages) {
+ buflen += rqst->rq_pagesz * (rqst->rq_npages - 1);
+ buflen += rqst->rq_tailsz;
+ }
+
+ return buflen;
+}
+
+static int
+smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+ int rc;
+ struct kvec *iov = rqst->rq_iov;
+ int n_vec = rqst->rq_nvec;
+ unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base);
+ unsigned long send_length;
+ unsigned int i;
+ size_t total_len = 0, sent;
+ struct socket *ssocket = server->ssocket;
+ int val = 1;
+
+ if (ssocket == NULL)
+ return -ENOTSOCK;
+
+ /* sanity check send length */
+ send_length = rqst_len(rqst);
+ if (send_length != smb_buf_length + 4) {
+ WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n",
+ send_length, smb_buf_length);
+ return -EIO;
+ }
+
+ cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length);
+ dump_smb(iov[0].iov_base, iov[0].iov_len);
+
+ /* cork the socket */
+ kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
+ (char *)&val, sizeof(val));
+
+ rc = smb_send_kvec(server, iov, n_vec, &sent);
+ if (rc < 0)
+ goto uncork;
+
+ total_len += sent;
+
+ /* now walk the page array and send each page in it */
+ for (i = 0; i < rqst->rq_npages; i++) {
+ struct kvec p_iov;
+
+ cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+ rc = smb_send_kvec(server, &p_iov, 1, &sent);
+ kunmap(rqst->rq_pages[i]);
+ if (rc < 0)
+ break;
+
+ total_len += sent;
}
+uncork:
+ /* uncork it */
+ val = 0;
+ kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
+ (char *)&val, sizeof(val));
+
if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
- cFYI(1, "partial send (%d remaining), terminating session",
- total_len);
- /* If we have only sent part of an SMB then the next SMB
- could be taken as the remainder of this one. We need
- to kill the socket so the server throws away the partial
- SMB */
+ cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n",
+ smb_buf_length + 4, total_len);
+ /*
+ * If we have only sent part of an SMB then the next SMB could
+ * be taken as the remainder of this one. We need to kill the
+ * socket so the server throws away the partial SMB
+ */
server->tcpStatus = CifsNeedReconnect;
}
if (rc < 0 && rc != -EINTR)
- cERROR(1, "Error %d sending data on socket to server", rc);
+ cifs_dbg(VFS, "Error %d sending data on socket to server\n",
+ rc);
else
rc = 0;
- /* Don't want to modify the buffer as a
- side effect of this call. */
- smb_buffer->smb_buf_length = smb_buf_length;
-
return rc;
}
+static int
+smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
+{
+ struct smb_rqst rqst = { .rq_iov = iov,
+ .rq_nvec = n_vec };
+
+ return smb_send_rqst(server, &rqst);
+}
+
int
smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
unsigned int smb_buf_length)
@@ -260,49 +387,68 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
return smb_sendv(server, &iov, 1);
}
-static int wait_for_free_request(struct TCP_Server_Info *server,
- const int long_op)
+static int
+wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
+ int *credits)
{
- if (long_op == CIFS_ASYNC_OP) {
+ int rc;
+
+ spin_lock(&server->req_lock);
+ if (timeout == CIFS_ASYNC_OP) {
/* oplock breaks must not be held up */
- atomic_inc(&server->inFlight);
+ server->in_flight++;
+ *credits -= 1;
+ spin_unlock(&server->req_lock);
return 0;
}
- spin_lock(&GlobalMid_Lock);
while (1) {
- if (atomic_read(&server->inFlight) >= cifs_max_pending) {
- spin_unlock(&GlobalMid_Lock);
-#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&server->num_waiters);
-#endif
- wait_event(server->request_q,
- atomic_read(&server->inFlight)
- < cifs_max_pending);
-#ifdef CONFIG_CIFS_STATS2
- atomic_dec(&server->num_waiters);
-#endif
- spin_lock(&GlobalMid_Lock);
+ if (*credits <= 0) {
+ spin_unlock(&server->req_lock);
+ cifs_num_waiters_inc(server);
+ rc = wait_event_killable(server->request_q,
+ has_credits(server, credits));
+ cifs_num_waiters_dec(server);
+ if (rc)
+ return rc;
+ spin_lock(&server->req_lock);
} else {
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->req_lock);
return -ENOENT;
}
- /* can not count locking commands against total
- as they are allowed to block on server */
+ /*
+ * Can not count locking commands against total
+ * as they are allowed to block on server.
+ */
/* update # of requests on the wire to server */
- if (long_op != CIFS_BLOCKING_OP)
- atomic_inc(&server->inFlight);
- spin_unlock(&GlobalMid_Lock);
+ if (timeout != CIFS_BLOCKING_OP) {
+ *credits -= 1;
+ server->in_flight++;
+ }
+ spin_unlock(&server->req_lock);
break;
}
}
return 0;
}
-static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
+static int
+wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
+ const int optype)
+{
+ int *val;
+
+ val = server->ops->get_credits_field(server, optype);
+ /* Since an echo is already inflight, no need to wait to send another */
+ if (*val <= 0 && optype == CIFS_ECHO_OP)
+ return -EAGAIN;
+ return wait_for_free_credits(server, timeout, val);
+}
+
+static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
struct mid_q_entry **ppmidQ)
{
if (ses->server->tcpStatus == CifsExiting) {
@@ -310,17 +456,24 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
}
if (ses->server->tcpStatus == CifsNeedReconnect) {
- cFYI(1, "tcp session dead - return to caller to retry");
+ cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
return -EAGAIN;
}
- if (ses->status != CifsGood) {
- /* check if SMB session is bad because we are setting it up */
+ if (ses->status == CifsNew) {
if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
(in_buf->Command != SMB_COM_NEGOTIATE))
return -EAGAIN;
/* else ok - we are setting up session */
}
+
+ if (ses->status == CifsExiting) {
+		/* only the final logoff is allowed once the session is exiting */
+ if (in_buf->Command != SMB_COM_LOGOFF_ANDX)
+ return -EAGAIN;
+ /* else ok - we are shutting down session */
+ }
+
*ppmidQ = AllocMidQEntry(in_buf, ses->server);
if (*ppmidQ == NULL)
return -ENOMEM;
@@ -335,71 +488,91 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
{
int error;
- error = wait_event_killable(server->response_q,
- midQ->midState != MID_REQUEST_SUBMITTED);
+ error = wait_event_freezekillable_unsafe(server->response_q,
+ midQ->mid_state != MID_REQUEST_SUBMITTED);
if (error < 0)
return -ERESTARTSYS;
return 0;
}
+struct mid_q_entry *
+cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+ int rc;
+ struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
+ struct mid_q_entry *mid;
+
+ /* enable signing if server requires it */
+ if (server->sign)
+ hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
+ mid = AllocMidQEntry(hdr, server);
+ if (mid == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ rc = cifs_sign_rqst(rqst, server, &mid->sequence_number);
+ if (rc) {
+ DeleteMidQEntry(mid);
+ return ERR_PTR(rc);
+ }
+
+ return mid;
+}
/*
* Send a SMB request and set the callback function in the mid to handle
* the result. Caller is responsible for dealing with timeouts.
*/
int
-cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
- mid_callback_t *callback, void *cbdata)
+cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
+ mid_receive_t *receive, mid_callback_t *callback,
+ void *cbdata, const int flags)
{
- int rc;
+ int rc, timeout, optype;
struct mid_q_entry *mid;
- rc = wait_for_free_request(server, CIFS_ASYNC_OP);
+ timeout = flags & CIFS_TIMEOUT_MASK;
+ optype = flags & CIFS_OP_MASK;
+
+ rc = wait_for_free_request(server, timeout, optype);
if (rc)
return rc;
- /* enable signing if server requires it */
- if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
mutex_lock(&server->srv_mutex);
- mid = AllocMidQEntry(in_buf, server);
- if (mid == NULL) {
+ mid = server->ops->setup_async_request(server, rqst);
+ if (IS_ERR(mid)) {
mutex_unlock(&server->srv_mutex);
- return -ENOMEM;
+ add_credits(server, 1, optype);
+ wake_up(&server->request_q);
+ return PTR_ERR(mid);
}
+ mid->receive = receive;
+ mid->callback = callback;
+ mid->callback_data = cbdata;
+ mid->mid_state = MID_REQUEST_SUBMITTED;
+
/* put it on the pending_mid_q */
spin_lock(&GlobalMid_Lock);
list_add_tail(&mid->qhead, &server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
- rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
- if (rc) {
- mutex_unlock(&server->srv_mutex);
- goto out_err;
- }
- mid->callback = callback;
- mid->callback_data = cbdata;
- mid->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&server->inSend);
-#endif
- rc = smb_send(server, in_buf, in_buf->smb_buf_length);
-#ifdef CONFIG_CIFS_STATS2
- atomic_dec(&server->inSend);
- mid->when_sent = jiffies;
-#endif
+ cifs_in_send_inc(server);
+ rc = smb_send_rqst(server, rqst);
+ cifs_in_send_dec(server);
+ cifs_save_when_sent(mid);
+
+ if (rc < 0)
+ server->sequence_number -= 2;
mutex_unlock(&server->srv_mutex);
- if (rc)
- goto out_err;
- return rc;
-out_err:
- delete_mid(mid);
- atomic_dec(&server->inFlight);
+ if (rc == 0)
+ return 0;
+
+ cifs_delete_mid(mid);
+ add_credits(server, 1, optype);
wake_up(&server->request_q);
return rc;
}
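
The reworked cifs_call_async() takes both a receive handler and a completion callback, and every error path after wait_for_free_request() hands the consumed credit back through add_credits() before waking request_q. The user-space sketch below models that submit-with-callback, return-the-credit-on-failure pattern; every name in it (submit_async, take_credit, ...) is illustrative and is not part of fs/cifs.

/*
 * Minimal, self-contained model of the pattern used by cifs_call_async():
 * take a credit, allocate a request, register a completion callback, and
 * give the credit back if anything fails before the send.  All names are
 * illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>

struct request;
typedef void (completion_fn)(struct request *req, int result);

struct request {
	int id;
	void *cbdata;
	completion_fn *callback;
};

static int credits = 50;			/* stands in for the server credit count */

static int take_credit(void)
{
	if (credits <= 0)
		return -1;			/* a real caller would sleep here */
	credits--;
	return 0;
}

static void give_credit_back(void)
{
	credits++;				/* like add_credits(server, 1, optype) */
}

static int submit_async(int id, completion_fn *cb, void *cbdata)
{
	struct request *req;

	if (take_credit())
		return -1;

	req = malloc(sizeof(*req));
	if (!req) {
		give_credit_back();		/* mirrors the error path above */
		return -1;
	}
	req->id = id;
	req->callback = cb;
	req->cbdata = cbdata;

	/* a real implementation would queue req and transmit it on the socket;
	 * here the request is completed immediately for illustration */
	req->callback(req, 0);
	free(req);
	give_credit_back();
	return 0;
}

static void on_done(struct request *req, int result)
{
	printf("request %d finished with %d\n", req->id, result);
}

int main(void)
{
	return submit_async(1, on_done, NULL) ? 1 : 0;
}
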
@@ -414,55 +587,48 @@ out_err:
*
*/
int
-SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
- struct smb_hdr *in_buf, int flags)
+SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
+ char *in_buf, int flags)
{
int rc;
struct kvec iov[1];
int resp_buf_type;
- iov[0].iov_base = (char *)in_buf;
- iov[0].iov_len = in_buf->smb_buf_length + 4;
+ iov[0].iov_base = in_buf;
+ iov[0].iov_len = get_rfc1002_length(in_buf) + 4;
flags |= CIFS_NO_RESP;
rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
- cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
+ cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc);
return rc;
}
static int
-sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
+cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
{
int rc = 0;
- cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
- mid->mid, mid->midState);
+ cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n",
+ __func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state);
spin_lock(&GlobalMid_Lock);
- /* ensure that it's no longer on the pending_mid_q */
- list_del_init(&mid->qhead);
-
- switch (mid->midState) {
+ switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
spin_unlock(&GlobalMid_Lock);
return rc;
- case MID_REQUEST_SUBMITTED:
- /* socket is going down, reject all calls */
- if (server->tcpStatus == CifsExiting) {
- cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
- __func__, mid->mid, mid->command, mid->midState);
- rc = -EHOSTDOWN;
- break;
- }
case MID_RETRY_NEEDED:
rc = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
rc = -EIO;
break;
+ case MID_SHUTDOWN:
+ rc = -EHOSTDOWN;
+ break;
default:
- cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
- mid->mid, mid->midState);
+ list_del_init(&mid->qhead);
+ cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
+ __func__, mid->mid, mid->mid_state);
rc = -EIO;
}
spin_unlock(&GlobalMid_Lock);
@@ -471,223 +637,203 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
return rc;
}
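
cifs_sync_mid_result() reduces the final mid state to an errno and, except in the invalid-state case, leaves the mid on pending_mid_q for cifs_delete_mid() to unlink later. The standalone function below restates just that state-to-errno mapping; the enum mirrors the MID_* names used in the hunk, but the code is an illustration rather than the kernel implementation.

/* Stand-alone restatement of the state -> errno mapping performed by
 * cifs_sync_mid_result(); errno values match the switch above. */
#include <errno.h>

enum mid_state {
	MID_REQUEST_SUBMITTED,
	MID_RESPONSE_RECEIVED,
	MID_RETRY_NEEDED,
	MID_RESPONSE_MALFORMED,
	MID_SHUTDOWN,
};

static int mid_state_to_errno(enum mid_state state)
{
	switch (state) {
	case MID_RESPONSE_RECEIVED:
		return 0;		/* good reply, caller can parse it */
	case MID_RETRY_NEEDED:
		return -EAGAIN;		/* reconnect happened, caller should resend */
	case MID_RESPONSE_MALFORMED:
		return -EIO;		/* a reply arrived but could not be parsed */
	case MID_SHUTDOWN:
		return -EHOSTDOWN;	/* socket is being torn down */
	default:
		return -EIO;		/* invalid/unexpected state */
	}
}
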
-/*
- * An NT cancel request header looks just like the original request except:
- *
- * The Command is SMB_COM_NT_CANCEL
- * The WordCount is zeroed out
- * The ByteCount is zeroed out
- *
- * This function mangles an existing request buffer into a
- * SMB_COM_NT_CANCEL request and then sends it.
- */
-static int
-send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
- struct mid_q_entry *mid)
+static inline int
+send_cancel(struct TCP_Server_Info *server, void *buf, struct mid_q_entry *mid)
{
- int rc = 0;
+ return server->ops->send_cancel ?
+ server->ops->send_cancel(server, buf, mid) : 0;
+}
- /* -4 for RFC1001 length and +2 for BCC field */
- in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2;
- in_buf->Command = SMB_COM_NT_CANCEL;
- in_buf->WordCount = 0;
- put_bcc_le(0, in_buf);
+int
+cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+ bool log_error)
+{
+ unsigned int len = get_rfc1002_length(mid->resp_buf) + 4;
- mutex_lock(&server->srv_mutex);
- rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
- if (rc) {
- mutex_unlock(&server->srv_mutex);
- return rc;
+ dump_smb(mid->resp_buf, min_t(u32, 92, len));
+
+ /* convert the length into a more usable form */
+ if (server->sign) {
+ struct kvec iov;
+ int rc = 0;
+ struct smb_rqst rqst = { .rq_iov = &iov,
+ .rq_nvec = 1 };
+
+ iov.iov_base = mid->resp_buf;
+ iov.iov_len = len;
+ /* FIXME: add code to kill session */
+ rc = cifs_verify_signature(&rqst, server,
+ mid->sequence_number);
+ if (rc)
+ cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+ rc);
}
- rc = smb_send(server, in_buf, in_buf->smb_buf_length);
- mutex_unlock(&server->srv_mutex);
- cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
- in_buf->Mid, rc);
+ /* BB special case reconnect tid and uid here? */
+ return map_smb_to_linux_error(mid->resp_buf, log_error);
+}
+
+struct mid_q_entry *
+cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
+{
+ int rc;
+ struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
+ struct mid_q_entry *mid;
- return rc;
+ rc = allocate_mid(ses, hdr, &mid);
+ if (rc)
+ return ERR_PTR(rc);
+ rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number);
+ if (rc) {
+ cifs_delete_mid(mid);
+ return ERR_PTR(rc);
+ }
+ return mid;
}
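
With cifs_setup_request() and cifs_setup_async_request() reached through server->ops, the transport code no longer hard-codes SMB1 behaviour; a different dialect can supply its own setup and receive-check routines behind the same call sites. The sketch below shows that function-pointer dispatch in a self-contained form; the struct and function names here are illustrative placeholders, not the kernel definitions.

/*
 * Generic sketch of an ops-table dispatch like the server->ops calls in
 * this patch: the transport layer calls through function pointers so that
 * each dialect can plug in its own implementation.  Names are illustrative.
 */
#include <stdio.h>

struct request { int id; };

struct version_ops {
	struct request *(*setup_request)(int id);
	int (*check_receive)(struct request *req);
};

static struct request one_req;

static struct request *v1_setup_request(int id)
{
	one_req.id = id;
	return &one_req;
}

static int v1_check_receive(struct request *req)
{
	printf("checking reply for request %d\n", req->id);
	return 0;
}

static const struct version_ops v1_ops = {
	.setup_request = v1_setup_request,
	.check_receive = v1_check_receive,
};

int main(void)
{
	const struct version_ops *ops = &v1_ops;	/* chosen once, e.g. at mount time */
	struct request *req = ops->setup_request(1);

	return ops->check_receive(req);
}
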
int
-SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
- struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
+SendReceive2(const unsigned int xid, struct cifs_ses *ses,
+ struct kvec *iov, int n_vec, int *resp_buf_type /* ret */,
const int flags)
{
int rc = 0;
- int long_op;
- unsigned int receive_len;
+ int timeout, optype;
struct mid_q_entry *midQ;
- struct smb_hdr *in_buf = iov[0].iov_base;
+ char *buf = iov[0].iov_base;
+ unsigned int credits = 1;
+ struct smb_rqst rqst = { .rq_iov = iov,
+ .rq_nvec = n_vec };
- long_op = flags & CIFS_TIMEOUT_MASK;
+ timeout = flags & CIFS_TIMEOUT_MASK;
+ optype = flags & CIFS_OP_MASK;
- *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */
+ *resp_buf_type = CIFS_NO_BUFFER; /* no response buf yet */
if ((ses == NULL) || (ses->server == NULL)) {
- cifs_small_buf_release(in_buf);
- cERROR(1, "Null session");
+ cifs_small_buf_release(buf);
+ cifs_dbg(VFS, "Null session\n");
return -EIO;
}
if (ses->server->tcpStatus == CifsExiting) {
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
return -ENOENT;
}
- /* Ensure that we do not send more than 50 overlapping requests
- to the same server. We may make this configurable later or
- use ses->maxReq */
+ /*
+ * Ensure that we do not send more than 50 overlapping requests
+ * to the same server. We may make this configurable later or
+ * use ses->maxReq.
+ */
- rc = wait_for_free_request(ses->server, long_op);
+ rc = wait_for_free_request(ses->server, timeout, optype);
if (rc) {
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
return rc;
}
- /* make sure that we sign in the same order that we send on this socket
- and avoid races inside tcp sendmsg code that could cause corruption
- of smb data */
+ /*
+ * Make sure that we sign in the same order that we send on this socket
+ * and avoid races inside tcp sendmsg code that could cause corruption
+ * of smb data.
+ */
mutex_lock(&ses->server->srv_mutex);
- rc = allocate_mid(ses, in_buf, &midQ);
- if (rc) {
+ midQ = ses->server->ops->setup_request(ses, &rqst);
+ if (IS_ERR(midQ)) {
mutex_unlock(&ses->server->srv_mutex);
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
/* Update # of requests on wire to server */
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
- return rc;
- }
- rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number);
- if (rc) {
- mutex_unlock(&ses->server->srv_mutex);
- cifs_small_buf_release(in_buf);
- goto out;
+ add_credits(ses->server, 1, optype);
+ return PTR_ERR(midQ);
}
- midQ->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&ses->server->inSend);
-#endif
+ midQ->mid_state = MID_REQUEST_SUBMITTED;
+ cifs_in_send_inc(ses->server);
rc = smb_sendv(ses->server, iov, n_vec);
-#ifdef CONFIG_CIFS_STATS2
- atomic_dec(&ses->server->inSend);
- midQ->when_sent = jiffies;
-#endif
+ cifs_in_send_dec(ses->server);
+ cifs_save_when_sent(midQ);
+ if (rc < 0)
+ ses->server->sequence_number -= 2;
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0) {
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
goto out;
}
- if (long_op == CIFS_ASYNC_OP) {
- cifs_small_buf_release(in_buf);
+ if (timeout == CIFS_ASYNC_OP) {
+ cifs_small_buf_release(buf);
goto out;
}
rc = wait_for_response(ses->server, midQ);
if (rc != 0) {
- send_nt_cancel(ses->server, in_buf, midQ);
+ send_cancel(ses->server, buf, midQ);
spin_lock(&GlobalMid_Lock);
- if (midQ->midState == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
- cifs_small_buf_release(in_buf);
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_small_buf_release(buf);
+ add_credits(ses->server, 1, optype);
return rc;
}
spin_unlock(&GlobalMid_Lock);
}
- cifs_small_buf_release(in_buf);
+ cifs_small_buf_release(buf);
- rc = sync_mid_result(midQ, ses->server);
+ rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ add_credits(ses->server, 1, optype);
return rc;
}
- receive_len = midQ->resp_buf->smb_buf_length;
-
- if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
- cERROR(1, "Frame too large received. Length: %d Xid: %d",
- receive_len, xid);
+ if (!midQ->resp_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) {
rc = -EIO;
+ cifs_dbg(FYI, "Bad MID state?\n");
goto out;
}
- /* rcvd frame is ok */
+ buf = (char *)midQ->resp_buf;
+ iov[0].iov_base = buf;
+ iov[0].iov_len = get_rfc1002_length(buf) + 4;
+ if (midQ->large_buf)
+ *resp_buf_type = CIFS_LARGE_BUFFER;
+ else
+ *resp_buf_type = CIFS_SMALL_BUFFER;
- if (midQ->resp_buf &&
- (midQ->midState == MID_RESPONSE_RECEIVED)) {
-
- iov[0].iov_base = (char *)midQ->resp_buf;
- if (midQ->largeBuf)
- *pRespBufType = CIFS_LARGE_BUFFER;
- else
- *pRespBufType = CIFS_SMALL_BUFFER;
- iov[0].iov_len = receive_len + 4;
-
- dump_smb(midQ->resp_buf, 80);
- /* convert the length into a more usable form */
- if ((receive_len > 24) &&
- (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
- SECMODE_SIGN_ENABLED))) {
- rc = cifs_verify_signature(midQ->resp_buf,
- ses->server,
- midQ->sequence_number+1);
- if (rc) {
- cERROR(1, "Unexpected SMB signature");
- /* BB FIXME add code to kill session */
- }
- }
+ credits = ses->server->ops->get_credits(midQ);
- /* BB special case reconnect tid and uid here? */
- rc = map_smb_to_linux_error(midQ->resp_buf,
- flags & CIFS_LOG_ERROR);
-
- /* convert ByteCount if necessary */
- if (receive_len >= sizeof(struct smb_hdr) - 4
- /* do not count RFC1001 header */ +
- (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
- put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
- if ((flags & CIFS_NO_RESP) == 0)
- midQ->resp_buf = NULL; /* mark it so buf will
- not be freed by
- delete_mid */
- } else {
- rc = -EIO;
- cFYI(1, "Bad MID state?");
- }
+ rc = ses->server->ops->check_receive(midQ, ses->server,
+ flags & CIFS_LOG_ERROR);
+ /* mark it so buf will not be freed by cifs_delete_mid */
+ if ((flags & CIFS_NO_RESP) == 0)
+ midQ->resp_buf = NULL;
out:
- delete_mid(midQ);
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_delete_mid(midQ);
+ add_credits(ses->server, credits, optype);
return rc;
}
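
Instead of decrementing an inFlight counter, SendReceive2() now returns to the pool whatever credits the response granted (via ses->server->ops->get_credits(midQ)), and add_credits() both adjusts the count and wakes waiters on request_q. Below is a minimal model of that take-one, give-back-many scheme, with a pthread condition variable standing in for the kernel wait queue; the names are illustrative only.

/*
 * Minimal model of the credit accounting used by the transport code: a
 * sender takes one credit before transmitting and gives back however many
 * credits the reply granted; waiters block until a credit is available.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t request_q = PTHREAD_COND_INITIALIZER;
static int credits = 1;

static void wait_for_free_request(void)
{
	pthread_mutex_lock(&lock);
	while (credits <= 0)
		pthread_cond_wait(&request_q, &lock);
	credits--;				/* one credit per in-flight request */
	pthread_mutex_unlock(&lock);
}

static void add_credits(int add)
{
	pthread_mutex_lock(&lock);
	credits += add;				/* a reply may grant more than one */
	pthread_cond_broadcast(&request_q);	/* wake anyone waiting to send */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	wait_for_free_request();
	/* ... send the request and wait for the reply ... */
	add_credits(2);				/* pretend the server granted two back */
	printf("credits now %d\n", credits);
	return 0;
}
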
int
-SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
+SendReceive(const unsigned int xid, struct cifs_ses *ses,
struct smb_hdr *in_buf, struct smb_hdr *out_buf,
- int *pbytes_returned, const int long_op)
+ int *pbytes_returned, const int timeout)
{
int rc = 0;
- unsigned int receive_len;
struct mid_q_entry *midQ;
if (ses == NULL) {
- cERROR(1, "Null smb session");
+ cifs_dbg(VFS, "Null smb session\n");
return -EIO;
}
if (ses->server == NULL) {
- cERROR(1, "Null tcp session");
+ cifs_dbg(VFS, "Null tcp session\n");
return -EIO;
}
@@ -698,13 +844,14 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
to the same server. We may make this configurable later or
use ses->maxReq */
- if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
- cERROR(1, "Illegal length, greater than maximum frame, %d",
- in_buf->smb_buf_length);
+ if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
+ MAX_CIFS_HDR_SIZE - 4) {
+ cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+ be32_to_cpu(in_buf->smb_buf_length));
return -EIO;
}
- rc = wait_for_free_request(ses->server, long_op);
+ rc = wait_for_free_request(ses->server, timeout, 0);
if (rc)
return rc;
@@ -718,8 +865,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
if (rc) {
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ add_credits(ses->server, 1, 0);
return rc;
}
@@ -729,96 +875,57 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
goto out;
}
- midQ->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&ses->server->inSend);
-#endif
- rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
-#ifdef CONFIG_CIFS_STATS2
- atomic_dec(&ses->server->inSend);
- midQ->when_sent = jiffies;
-#endif
+ midQ->mid_state = MID_REQUEST_SUBMITTED;
+
+ cifs_in_send_inc(ses->server);
+ rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+ cifs_in_send_dec(ses->server);
+ cifs_save_when_sent(midQ);
+
+ if (rc < 0)
+ ses->server->sequence_number -= 2;
+
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0)
goto out;
- if (long_op == CIFS_ASYNC_OP)
+ if (timeout == CIFS_ASYNC_OP)
goto out;
rc = wait_for_response(ses->server, midQ);
if (rc != 0) {
- send_nt_cancel(ses->server, in_buf, midQ);
+ send_cancel(ses->server, in_buf, midQ);
spin_lock(&GlobalMid_Lock);
- if (midQ->midState == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ add_credits(ses->server, 1, 0);
return rc;
}
spin_unlock(&GlobalMid_Lock);
}
- rc = sync_mid_result(midQ, ses->server);
+ rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ add_credits(ses->server, 1, 0);
return rc;
}
- receive_len = midQ->resp_buf->smb_buf_length;
-
- if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
- cERROR(1, "Frame too large received. Length: %d Xid: %d",
- receive_len, xid);
+ if (!midQ->resp_buf || !out_buf ||
+ midQ->mid_state != MID_RESPONSE_RECEIVED) {
rc = -EIO;
+ cifs_dbg(VFS, "Bad MID state?\n");
goto out;
}
- /* rcvd frame is ok */
-
- if (midQ->resp_buf && out_buf
- && (midQ->midState == MID_RESPONSE_RECEIVED)) {
- out_buf->smb_buf_length = receive_len;
- memcpy((char *)out_buf + 4,
- (char *)midQ->resp_buf + 4,
- receive_len);
-
- dump_smb(out_buf, 92);
- /* convert the length into a more usable form */
- if ((receive_len > 24) &&
- (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
- SECMODE_SIGN_ENABLED))) {
- rc = cifs_verify_signature(out_buf,
- ses->server,
- midQ->sequence_number+1);
- if (rc) {
- cERROR(1, "Unexpected SMB signature");
- /* BB FIXME add code to kill session */
- }
- }
-
- *pbytes_returned = out_buf->smb_buf_length;
-
- /* BB special case reconnect tid and uid here? */
- rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
-
- /* convert ByteCount if necessary */
- if (receive_len >= sizeof(struct smb_hdr) - 4
- /* do not count RFC1001 header */ +
- (2 * out_buf->WordCount) + 2 /* bcc */ )
- put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
- } else {
- rc = -EIO;
- cERROR(1, "Bad MID state?");
- }
-
+ *pbytes_returned = get_rfc1002_length(midQ->resp_buf);
+ memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
+ rc = cifs_check_receive(midQ, ses->server, 0);
out:
- delete_mid(midQ);
- atomic_dec(&ses->server->inFlight);
- wake_up(&ses->server->request_q);
+ cifs_delete_mid(midQ);
+ add_credits(ses->server, 1, 0);
return rc;
}
@@ -827,12 +934,12 @@ out:
blocking lock to return. */
static int
-send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
+send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
struct smb_hdr *in_buf,
struct smb_hdr *out_buf)
{
int bytes_returned;
- struct cifsSesInfo *ses = tcon->ses;
+ struct cifs_ses *ses = tcon->ses;
LOCK_REQ *pSMB = (LOCK_REQ *)in_buf;
/* We just modify the current in_buf to change
@@ -842,31 +949,30 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES;
pSMB->Timeout = 0;
- pSMB->hdr.Mid = GetNextMid(ses->server);
+ pSMB->hdr.Mid = get_next_mid(ses->server);
return SendReceive(xid, ses, in_buf, out_buf,
&bytes_returned, 0);
}
int
-SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
+SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
struct smb_hdr *in_buf, struct smb_hdr *out_buf,
int *pbytes_returned)
{
int rc = 0;
int rstart = 0;
- unsigned int receive_len;
struct mid_q_entry *midQ;
- struct cifsSesInfo *ses;
+ struct cifs_ses *ses;
if (tcon == NULL || tcon->ses == NULL) {
- cERROR(1, "Null smb session");
+ cifs_dbg(VFS, "Null smb session\n");
return -EIO;
}
ses = tcon->ses;
if (ses->server == NULL) {
- cERROR(1, "Null tcp session");
+ cifs_dbg(VFS, "Null tcp session\n");
return -EIO;
}
@@ -877,13 +983,14 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
to the same server. We may make this configurable later or
use ses->maxReq */
- if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
- cERROR(1, "Illegal length, greater than maximum frame, %d",
- in_buf->smb_buf_length);
+ if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
+ MAX_CIFS_HDR_SIZE - 4) {
+ cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+ be32_to_cpu(in_buf->smb_buf_length));
return -EIO;
}
- rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
+ rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0);
if (rc)
return rc;
@@ -901,45 +1008,45 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
if (rc) {
- delete_mid(midQ);
+ cifs_delete_mid(midQ);
mutex_unlock(&ses->server->srv_mutex);
return rc;
}
- midQ->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&ses->server->inSend);
-#endif
- rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
-#ifdef CONFIG_CIFS_STATS2
- atomic_dec(&ses->server->inSend);
- midQ->when_sent = jiffies;
-#endif
+ midQ->mid_state = MID_REQUEST_SUBMITTED;
+ cifs_in_send_inc(ses->server);
+ rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+ cifs_in_send_dec(ses->server);
+ cifs_save_when_sent(midQ);
+
+ if (rc < 0)
+ ses->server->sequence_number -= 2;
+
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0) {
- delete_mid(midQ);
+ cifs_delete_mid(midQ);
return rc;
}
/* Wait for a reply - allow signals to interrupt. */
rc = wait_event_interruptible(ses->server->response_q,
- (!(midQ->midState == MID_REQUEST_SUBMITTED)) ||
+ (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) ||
((ses->server->tcpStatus != CifsGood) &&
(ses->server->tcpStatus != CifsNew)));
/* Were we interrupted by a signal ? */
if ((rc == -ERESTARTSYS) &&
- (midQ->midState == MID_REQUEST_SUBMITTED) &&
+ (midQ->mid_state == MID_REQUEST_SUBMITTED) &&
((ses->server->tcpStatus == CifsGood) ||
(ses->server->tcpStatus == CifsNew))) {
if (in_buf->Command == SMB_COM_TRANSACTION2) {
/* POSIX lock. We send a NT_CANCEL SMB to cause the
blocking lock to return. */
- rc = send_nt_cancel(ses->server, in_buf, midQ);
+ rc = send_cancel(ses->server, in_buf, midQ);
if (rc) {
- delete_mid(midQ);
+ cifs_delete_mid(midQ);
return rc;
}
} else {
@@ -951,16 +1058,16 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
/* If we get -ENOLCK back the lock may have
already been removed. Don't exit in this case. */
if (rc && rc != -ENOLCK) {
- delete_mid(midQ);
+ cifs_delete_mid(midQ);
return rc;
}
}
rc = wait_for_response(ses->server, midQ);
if (rc) {
- send_nt_cancel(ses->server, in_buf, midQ);
+ send_cancel(ses->server, in_buf, midQ);
spin_lock(&GlobalMid_Lock);
- if (midQ->midState == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
@@ -973,58 +1080,22 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
rstart = 1;
}
- rc = sync_mid_result(midQ, ses->server);
+ rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0)
return rc;
- receive_len = midQ->resp_buf->smb_buf_length;
- if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
- cERROR(1, "Frame too large received. Length: %d Xid: %d",
- receive_len, xid);
- rc = -EIO;
- goto out;
- }
-
/* rcvd frame is ok */
-
- if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
+ if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) {
rc = -EIO;
- cERROR(1, "Bad MID state?");
+ cifs_dbg(VFS, "Bad MID state?\n");
goto out;
}
- out_buf->smb_buf_length = receive_len;
- memcpy((char *)out_buf + 4,
- (char *)midQ->resp_buf + 4,
- receive_len);
-
- dump_smb(out_buf, 92);
- /* convert the length into a more usable form */
- if ((receive_len > 24) &&
- (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
- SECMODE_SIGN_ENABLED))) {
- rc = cifs_verify_signature(out_buf,
- ses->server,
- midQ->sequence_number+1);
- if (rc) {
- cERROR(1, "Unexpected SMB signature");
- /* BB FIXME add code to kill session */
- }
- }
-
- *pbytes_returned = out_buf->smb_buf_length;
-
- /* BB special case reconnect tid and uid here? */
- rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
-
- /* convert ByteCount if necessary */
- if (receive_len >= sizeof(struct smb_hdr) - 4
- /* do not count RFC1001 header */ +
- (2 * out_buf->WordCount) + 2 /* bcc */ )
- put_bcc(get_bcc_le(out_buf), out_buf);
-
+ *pbytes_returned = get_rfc1002_length(midQ->resp_buf);
+ memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
+ rc = cifs_check_receive(midQ, ses->server, 0);
out:
- delete_mid(midQ);
+ cifs_delete_mid(midQ);
if (rstart && rc == -EACCES)
return -ERESTARTSYS;
return rc;
diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c
new file mode 100644
index 00000000000..1506d4fddb2
--- /dev/null
+++ b/fs/cifs/winucase.c
@@ -0,0 +1,663 @@
+/*
+ * fs/cifs/winucase.c
+ *
+ * Copyright (c) Jeffrey Layton <jlayton@redhat.com>, 2013
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * The const tables in this file were converted from the following info
+ * provided by Microsoft:
+ *
+ * 3.1.5.3 Mapping UTF-16 Strings to Upper Case:
+ *
+ * http://msdn.microsoft.com/en-us/library/hh877830.aspx
+ * http://www.microsoft.com/en-us/download/details.aspx?displaylang=en&id=10921
+ *
+ * In particular, the table in "Windows 8 Upper Case Mapping Table.txt" was
+ * post-processed using the winucase_convert.pl script.
+ */
+
+#include <linux/nls.h>
+
+wchar_t cifs_toupper(wchar_t in); /* quiet sparse */
+
+static const wchar_t t2_00[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
+ 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
+ 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0000,
+ 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178,
+};
+
+static const wchar_t t2_01[256] = {
+ 0x0000, 0x0100, 0x0000, 0x0102, 0x0000, 0x0104, 0x0000, 0x0106,
+ 0x0000, 0x0108, 0x0000, 0x010a, 0x0000, 0x010c, 0x0000, 0x010e,
+ 0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0000, 0x0116,
+ 0x0000, 0x0118, 0x0000, 0x011a, 0x0000, 0x011c, 0x0000, 0x011e,
+ 0x0000, 0x0120, 0x0000, 0x0122, 0x0000, 0x0124, 0x0000, 0x0126,
+ 0x0000, 0x0128, 0x0000, 0x012a, 0x0000, 0x012c, 0x0000, 0x012e,
+ 0x0000, 0x0000, 0x0000, 0x0132, 0x0000, 0x0134, 0x0000, 0x0136,
+ 0x0000, 0x0000, 0x0139, 0x0000, 0x013b, 0x0000, 0x013d, 0x0000,
+ 0x013f, 0x0000, 0x0141, 0x0000, 0x0143, 0x0000, 0x0145, 0x0000,
+ 0x0147, 0x0000, 0x0000, 0x014a, 0x0000, 0x014c, 0x0000, 0x014e,
+ 0x0000, 0x0150, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156,
+ 0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x0000, 0x015e,
+ 0x0000, 0x0160, 0x0000, 0x0162, 0x0000, 0x0164, 0x0000, 0x0166,
+ 0x0000, 0x0168, 0x0000, 0x016a, 0x0000, 0x016c, 0x0000, 0x016e,
+ 0x0000, 0x0170, 0x0000, 0x0172, 0x0000, 0x0174, 0x0000, 0x0176,
+ 0x0000, 0x0000, 0x0179, 0x0000, 0x017b, 0x0000, 0x017d, 0x0000,
+ 0x0243, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0000, 0x0000,
+ 0x0187, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x01f6, 0x0000, 0x0000,
+ 0x0000, 0x0198, 0x023d, 0x0000, 0x0000, 0x0000, 0x0220, 0x0000,
+ 0x0000, 0x01a0, 0x0000, 0x01a2, 0x0000, 0x01a4, 0x0000, 0x0000,
+ 0x01a7, 0x0000, 0x0000, 0x0000, 0x0000, 0x01ac, 0x0000, 0x0000,
+ 0x01af, 0x0000, 0x0000, 0x0000, 0x01b3, 0x0000, 0x01b5, 0x0000,
+ 0x0000, 0x01b8, 0x0000, 0x0000, 0x0000, 0x01bc, 0x0000, 0x01f7,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x01c4, 0x0000,
+ 0x0000, 0x01c7, 0x0000, 0x0000, 0x01ca, 0x0000, 0x01cd, 0x0000,
+ 0x01cf, 0x0000, 0x01d1, 0x0000, 0x01d3, 0x0000, 0x01d5, 0x0000,
+ 0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x018e, 0x0000, 0x01de,
+ 0x0000, 0x01e0, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6,
+ 0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee,
+ 0x0000, 0x0000, 0x0000, 0x01f1, 0x0000, 0x01f4, 0x0000, 0x0000,
+ 0x0000, 0x01f8, 0x0000, 0x01fa, 0x0000, 0x01fc, 0x0000, 0x01fe,
+};
+
+static const wchar_t t2_02[256] = {
+ 0x0000, 0x0200, 0x0000, 0x0202, 0x0000, 0x0204, 0x0000, 0x0206,
+ 0x0000, 0x0208, 0x0000, 0x020a, 0x0000, 0x020c, 0x0000, 0x020e,
+ 0x0000, 0x0210, 0x0000, 0x0212, 0x0000, 0x0214, 0x0000, 0x0216,
+ 0x0000, 0x0218, 0x0000, 0x021a, 0x0000, 0x021c, 0x0000, 0x021e,
+ 0x0000, 0x0000, 0x0000, 0x0222, 0x0000, 0x0224, 0x0000, 0x0226,
+ 0x0000, 0x0228, 0x0000, 0x022a, 0x0000, 0x022c, 0x0000, 0x022e,
+ 0x0000, 0x0230, 0x0000, 0x0232, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x023b, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0241, 0x0000, 0x0000, 0x0000, 0x0000, 0x0246,
+ 0x0000, 0x0248, 0x0000, 0x024a, 0x0000, 0x024c, 0x0000, 0x024e,
+ 0x2c6f, 0x2c6d, 0x0000, 0x0181, 0x0186, 0x0000, 0x0189, 0x018a,
+ 0x0000, 0x018f, 0x0000, 0x0190, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0193, 0x0000, 0x0000, 0x0194, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0197, 0x0196, 0x0000, 0x2c62, 0x0000, 0x0000, 0x0000, 0x019c,
+ 0x0000, 0x2c6e, 0x019d, 0x0000, 0x0000, 0x019f, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c64, 0x0000, 0x0000,
+ 0x01a6, 0x0000, 0x0000, 0x01a9, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x01b7, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_03[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0370, 0x0000, 0x0372, 0x0000, 0x0000, 0x0000, 0x0376,
+ 0x0000, 0x0000, 0x0000, 0x03fd, 0x03fe, 0x03ff, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0388, 0x0389, 0x038a,
+ 0x0000, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
+ 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
+ 0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
+ 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x03cf,
+ 0x0000, 0x03d8, 0x0000, 0x03da, 0x0000, 0x03dc, 0x0000, 0x03de,
+ 0x0000, 0x03e0, 0x0000, 0x03e2, 0x0000, 0x03e4, 0x0000, 0x03e6,
+ 0x0000, 0x03e8, 0x0000, 0x03ea, 0x0000, 0x03ec, 0x0000, 0x03ee,
+ 0x0000, 0x0000, 0x03f9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x03f7, 0x0000, 0x0000, 0x03fa, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_04[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
+ 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
+ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
+ 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
+ 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
+ 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f,
+ 0x0000, 0x0460, 0x0000, 0x0462, 0x0000, 0x0464, 0x0000, 0x0466,
+ 0x0000, 0x0468, 0x0000, 0x046a, 0x0000, 0x046c, 0x0000, 0x046e,
+ 0x0000, 0x0470, 0x0000, 0x0472, 0x0000, 0x0474, 0x0000, 0x0476,
+ 0x0000, 0x0478, 0x0000, 0x047a, 0x0000, 0x047c, 0x0000, 0x047e,
+ 0x0000, 0x0480, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x048a, 0x0000, 0x048c, 0x0000, 0x048e,
+ 0x0000, 0x0490, 0x0000, 0x0492, 0x0000, 0x0494, 0x0000, 0x0496,
+ 0x0000, 0x0498, 0x0000, 0x049a, 0x0000, 0x049c, 0x0000, 0x049e,
+ 0x0000, 0x04a0, 0x0000, 0x04a2, 0x0000, 0x04a4, 0x0000, 0x04a6,
+ 0x0000, 0x04a8, 0x0000, 0x04aa, 0x0000, 0x04ac, 0x0000, 0x04ae,
+ 0x0000, 0x04b0, 0x0000, 0x04b2, 0x0000, 0x04b4, 0x0000, 0x04b6,
+ 0x0000, 0x04b8, 0x0000, 0x04ba, 0x0000, 0x04bc, 0x0000, 0x04be,
+ 0x0000, 0x0000, 0x04c1, 0x0000, 0x04c3, 0x0000, 0x04c5, 0x0000,
+ 0x04c7, 0x0000, 0x04c9, 0x0000, 0x04cb, 0x0000, 0x04cd, 0x04c0,
+ 0x0000, 0x04d0, 0x0000, 0x04d2, 0x0000, 0x04d4, 0x0000, 0x04d6,
+ 0x0000, 0x04d8, 0x0000, 0x04da, 0x0000, 0x04dc, 0x0000, 0x04de,
+ 0x0000, 0x04e0, 0x0000, 0x04e2, 0x0000, 0x04e4, 0x0000, 0x04e6,
+ 0x0000, 0x04e8, 0x0000, 0x04ea, 0x0000, 0x04ec, 0x0000, 0x04ee,
+ 0x0000, 0x04f0, 0x0000, 0x04f2, 0x0000, 0x04f4, 0x0000, 0x04f6,
+ 0x0000, 0x04f8, 0x0000, 0x04fa, 0x0000, 0x04fc, 0x0000, 0x04fe,
+};
+
+static const wchar_t t2_05[256] = {
+ 0x0000, 0x0500, 0x0000, 0x0502, 0x0000, 0x0504, 0x0000, 0x0506,
+ 0x0000, 0x0508, 0x0000, 0x050a, 0x0000, 0x050c, 0x0000, 0x050e,
+ 0x0000, 0x0510, 0x0000, 0x0512, 0x0000, 0x0514, 0x0000, 0x0516,
+ 0x0000, 0x0518, 0x0000, 0x051a, 0x0000, 0x051c, 0x0000, 0x051e,
+ 0x0000, 0x0520, 0x0000, 0x0522, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537,
+ 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f,
+ 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547,
+ 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f,
+ 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_1d[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0xa77d, 0x0000, 0x0000, 0x0000, 0x2c63, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_1e[256] = {
+ 0x0000, 0x1e00, 0x0000, 0x1e02, 0x0000, 0x1e04, 0x0000, 0x1e06,
+ 0x0000, 0x1e08, 0x0000, 0x1e0a, 0x0000, 0x1e0c, 0x0000, 0x1e0e,
+ 0x0000, 0x1e10, 0x0000, 0x1e12, 0x0000, 0x1e14, 0x0000, 0x1e16,
+ 0x0000, 0x1e18, 0x0000, 0x1e1a, 0x0000, 0x1e1c, 0x0000, 0x1e1e,
+ 0x0000, 0x1e20, 0x0000, 0x1e22, 0x0000, 0x1e24, 0x0000, 0x1e26,
+ 0x0000, 0x1e28, 0x0000, 0x1e2a, 0x0000, 0x1e2c, 0x0000, 0x1e2e,
+ 0x0000, 0x1e30, 0x0000, 0x1e32, 0x0000, 0x1e34, 0x0000, 0x1e36,
+ 0x0000, 0x1e38, 0x0000, 0x1e3a, 0x0000, 0x1e3c, 0x0000, 0x1e3e,
+ 0x0000, 0x1e40, 0x0000, 0x1e42, 0x0000, 0x1e44, 0x0000, 0x1e46,
+ 0x0000, 0x1e48, 0x0000, 0x1e4a, 0x0000, 0x1e4c, 0x0000, 0x1e4e,
+ 0x0000, 0x1e50, 0x0000, 0x1e52, 0x0000, 0x1e54, 0x0000, 0x1e56,
+ 0x0000, 0x1e58, 0x0000, 0x1e5a, 0x0000, 0x1e5c, 0x0000, 0x1e5e,
+ 0x0000, 0x1e60, 0x0000, 0x1e62, 0x0000, 0x1e64, 0x0000, 0x1e66,
+ 0x0000, 0x1e68, 0x0000, 0x1e6a, 0x0000, 0x1e6c, 0x0000, 0x1e6e,
+ 0x0000, 0x1e70, 0x0000, 0x1e72, 0x0000, 0x1e74, 0x0000, 0x1e76,
+ 0x0000, 0x1e78, 0x0000, 0x1e7a, 0x0000, 0x1e7c, 0x0000, 0x1e7e,
+ 0x0000, 0x1e80, 0x0000, 0x1e82, 0x0000, 0x1e84, 0x0000, 0x1e86,
+ 0x0000, 0x1e88, 0x0000, 0x1e8a, 0x0000, 0x1e8c, 0x0000, 0x1e8e,
+ 0x0000, 0x1e90, 0x0000, 0x1e92, 0x0000, 0x1e94, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x1ea0, 0x0000, 0x1ea2, 0x0000, 0x1ea4, 0x0000, 0x1ea6,
+ 0x0000, 0x1ea8, 0x0000, 0x1eaa, 0x0000, 0x1eac, 0x0000, 0x1eae,
+ 0x0000, 0x1eb0, 0x0000, 0x1eb2, 0x0000, 0x1eb4, 0x0000, 0x1eb6,
+ 0x0000, 0x1eb8, 0x0000, 0x1eba, 0x0000, 0x1ebc, 0x0000, 0x1ebe,
+ 0x0000, 0x1ec0, 0x0000, 0x1ec2, 0x0000, 0x1ec4, 0x0000, 0x1ec6,
+ 0x0000, 0x1ec8, 0x0000, 0x1eca, 0x0000, 0x1ecc, 0x0000, 0x1ece,
+ 0x0000, 0x1ed0, 0x0000, 0x1ed2, 0x0000, 0x1ed4, 0x0000, 0x1ed6,
+ 0x0000, 0x1ed8, 0x0000, 0x1eda, 0x0000, 0x1edc, 0x0000, 0x1ede,
+ 0x0000, 0x1ee0, 0x0000, 0x1ee2, 0x0000, 0x1ee4, 0x0000, 0x1ee6,
+ 0x0000, 0x1ee8, 0x0000, 0x1eea, 0x0000, 0x1eec, 0x0000, 0x1eee,
+ 0x0000, 0x1ef0, 0x0000, 0x1ef2, 0x0000, 0x1ef4, 0x0000, 0x1ef6,
+ 0x0000, 0x1ef8, 0x0000, 0x1efa, 0x0000, 0x1efc, 0x0000, 0x1efe,
+};
+
+static const wchar_t t2_1f[256] = {
+ 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, 0x1f4c, 0x1f4d, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x1f59, 0x0000, 0x1f5b, 0x0000, 0x1f5d, 0x0000, 0x1f5f,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, 0x1fda, 0x1fdb,
+ 0x1ff8, 0x1ff9, 0x1fea, 0x1feb, 0x1ffa, 0x1ffb, 0x0000, 0x0000,
+ 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1fb8, 0x1fb9, 0x0000, 0x1fbc, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x1fcc, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1fd8, 0x1fd9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x1fe8, 0x1fe9, 0x0000, 0x0000, 0x0000, 0x1fec, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x1ffc, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_21[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2132, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167,
+ 0x2168, 0x2169, 0x216a, 0x216b, 0x216c, 0x216d, 0x216e, 0x216f,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x2183, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_24[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, 0x24bb, 0x24bc, 0x24bd,
+ 0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, 0x24c3, 0x24c4, 0x24c5,
+ 0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, 0x24cb, 0x24cc, 0x24cd,
+ 0x24ce, 0x24cf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_2c[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x2c00, 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07,
+ 0x2c08, 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f,
+ 0x2c10, 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17,
+ 0x2c18, 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f,
+ 0x2c20, 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27,
+ 0x2c28, 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x0000,
+ 0x0000, 0x2c60, 0x0000, 0x0000, 0x0000, 0x023a, 0x023e, 0x0000,
+ 0x2c67, 0x0000, 0x2c69, 0x0000, 0x2c6b, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x2c72, 0x0000, 0x0000, 0x2c75, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x2c80, 0x0000, 0x2c82, 0x0000, 0x2c84, 0x0000, 0x2c86,
+ 0x0000, 0x2c88, 0x0000, 0x2c8a, 0x0000, 0x2c8c, 0x0000, 0x2c8e,
+ 0x0000, 0x2c90, 0x0000, 0x2c92, 0x0000, 0x2c94, 0x0000, 0x2c96,
+ 0x0000, 0x2c98, 0x0000, 0x2c9a, 0x0000, 0x2c9c, 0x0000, 0x2c9e,
+ 0x0000, 0x2ca0, 0x0000, 0x2ca2, 0x0000, 0x2ca4, 0x0000, 0x2ca6,
+ 0x0000, 0x2ca8, 0x0000, 0x2caa, 0x0000, 0x2cac, 0x0000, 0x2cae,
+ 0x0000, 0x2cb0, 0x0000, 0x2cb2, 0x0000, 0x2cb4, 0x0000, 0x2cb6,
+ 0x0000, 0x2cb8, 0x0000, 0x2cba, 0x0000, 0x2cbc, 0x0000, 0x2cbe,
+ 0x0000, 0x2cc0, 0x0000, 0x2cc2, 0x0000, 0x2cc4, 0x0000, 0x2cc6,
+ 0x0000, 0x2cc8, 0x0000, 0x2cca, 0x0000, 0x2ccc, 0x0000, 0x2cce,
+ 0x0000, 0x2cd0, 0x0000, 0x2cd2, 0x0000, 0x2cd4, 0x0000, 0x2cd6,
+ 0x0000, 0x2cd8, 0x0000, 0x2cda, 0x0000, 0x2cdc, 0x0000, 0x2cde,
+ 0x0000, 0x2ce0, 0x0000, 0x2ce2, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_2d[256] = {
+ 0x10a0, 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7,
+ 0x10a8, 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af,
+ 0x10b0, 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7,
+ 0x10b8, 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf,
+ 0x10c0, 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_a6[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0xa640, 0x0000, 0xa642, 0x0000, 0xa644, 0x0000, 0xa646,
+ 0x0000, 0xa648, 0x0000, 0xa64a, 0x0000, 0xa64c, 0x0000, 0xa64e,
+ 0x0000, 0xa650, 0x0000, 0xa652, 0x0000, 0xa654, 0x0000, 0xa656,
+ 0x0000, 0xa658, 0x0000, 0xa65a, 0x0000, 0xa65c, 0x0000, 0xa65e,
+ 0x0000, 0x0000, 0x0000, 0xa662, 0x0000, 0xa664, 0x0000, 0xa666,
+ 0x0000, 0xa668, 0x0000, 0xa66a, 0x0000, 0xa66c, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0xa680, 0x0000, 0xa682, 0x0000, 0xa684, 0x0000, 0xa686,
+ 0x0000, 0xa688, 0x0000, 0xa68a, 0x0000, 0xa68c, 0x0000, 0xa68e,
+ 0x0000, 0xa690, 0x0000, 0xa692, 0x0000, 0xa694, 0x0000, 0xa696,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_a7[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0xa722, 0x0000, 0xa724, 0x0000, 0xa726,
+ 0x0000, 0xa728, 0x0000, 0xa72a, 0x0000, 0xa72c, 0x0000, 0xa72e,
+ 0x0000, 0x0000, 0x0000, 0xa732, 0x0000, 0xa734, 0x0000, 0xa736,
+ 0x0000, 0xa738, 0x0000, 0xa73a, 0x0000, 0xa73c, 0x0000, 0xa73e,
+ 0x0000, 0xa740, 0x0000, 0xa742, 0x0000, 0xa744, 0x0000, 0xa746,
+ 0x0000, 0xa748, 0x0000, 0xa74a, 0x0000, 0xa74c, 0x0000, 0xa74e,
+ 0x0000, 0xa750, 0x0000, 0xa752, 0x0000, 0xa754, 0x0000, 0xa756,
+ 0x0000, 0xa758, 0x0000, 0xa75a, 0x0000, 0xa75c, 0x0000, 0xa75e,
+ 0x0000, 0xa760, 0x0000, 0xa762, 0x0000, 0xa764, 0x0000, 0xa766,
+ 0x0000, 0xa768, 0x0000, 0xa76a, 0x0000, 0xa76c, 0x0000, 0xa76e,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0xa779, 0x0000, 0xa77b, 0x0000, 0x0000, 0xa77e,
+ 0x0000, 0xa780, 0x0000, 0xa782, 0x0000, 0xa784, 0x0000, 0xa786,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0xa78b, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_ff[256] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27,
+ 0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f,
+ 0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37,
+ 0xff38, 0xff39, 0xff3a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t *const toplevel[256] = {
+ t2_00, t2_01, t2_02, t2_03, t2_04, t2_05, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, t2_1d, t2_1e, t2_1f,
+ NULL, t2_21, NULL, NULL, t2_24, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, t2_2c, t2_2d, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, t2_a6, t2_a7,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, t2_ff,
+};
+
+/**
+ * cifs_toupper - convert a wchar_t from lower to uppercase
+ * @in: character to convert from lower to uppercase
+ *
+ * This function consults the static tables above to convert a wchar_t from
+ * lower to uppercase. In the event that there is no mapping, the original
+ * "in" character is returned.
+ */
+wchar_t
+cifs_toupper(wchar_t in)
+{
+ unsigned char idx;
+ const wchar_t *tbl;
+ wchar_t out;
+
+ /* grab upper byte */
+ idx = (in & 0xff00) >> 8;
+
+ /* find pointer to 2nd layer table */
+ tbl = toplevel[idx];
+ if (!tbl)
+ return in;
+
+ /* grab lower byte */
+ idx = in & 0xff;
+
+ /* look up character in table */
+ out = tbl[idx];
+ if (out)
+ return out;
+
+ return in;
+}
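
The cifs_toupper() helper added above is a plain two-level lookup: the high byte of the character selects a 256-entry second-level page, the low byte indexes into that page, and a zero entry (or a missing page) means "no mapping, return the input unchanged". A minimal, self-contained sketch of the same scheme, reduced to a single populated page for the fullwidth Latin block and using invented names, is:

#include <stdio.h>
#include <wchar.h>

/* one second-level page (0xff00-0xffff); 0 means "no uppercase mapping" */
static const wchar_t demo_ff[256] = {
	[0x41] = 0xff21,	/* U+FF41 fullwidth 'a' -> U+FF21 fullwidth 'A' */
	[0x5a] = 0xff3a,	/* U+FF5A fullwidth 'z' -> U+FF3A fullwidth 'Z' */
};

/* top level, indexed by the high byte; unpopulated pages stay NULL */
static const wchar_t *const demo_toplevel[256] = {
	[0xff] = demo_ff,
};

static wchar_t demo_toupper(wchar_t in)
{
	const wchar_t *tbl = demo_toplevel[(in & 0xff00) >> 8];
	wchar_t out;

	if (!tbl)
		return in;		/* no page for this high byte */
	out = tbl[in & 0xff];
	return out ? out : in;		/* zero entry: leave unchanged */
}

int main(void)
{
	/* U+FF41 maps; U+0061 does not map here only because page 0x00 is
	 * left empty in this sketch (the real t2_00 page handles ASCII). */
	printf("0x%04x -> 0x%04x\n", 0xff41, (unsigned)demo_toupper(0xff41));
	printf("0x%04x -> 0x%04x\n", 0x0061, (unsigned)demo_toupper(0x0061));
	return 0;
}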
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index eae2a149160..5ac836a86b1 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>
+#include <linux/xattr.h>
#include "cifsfs.h"
#include "cifspdu.h"
#include "cifsglob.h"
@@ -31,25 +32,17 @@
#define MAX_EA_VALUE_SIZE 65535
#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
-#define CIFS_XATTR_USER_PREFIX "user."
-#define CIFS_XATTR_SYSTEM_PREFIX "system."
-#define CIFS_XATTR_OS2_PREFIX "os2."
-#define CIFS_XATTR_SECURITY_PREFIX "security."
-#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
-#define XATTR_TRUSTED_PREFIX_LEN 8
-#define XATTR_SECURITY_PREFIX_LEN 9
-/* BB need to add server (Samba e.g) support for security and trusted prefix */
-
+/* BB need to add server (Samba e.g) support for security and trusted prefix */
int cifs_removexattr(struct dentry *direntry, const char *ea_name)
{
int rc = -EOPNOTSUPP;
#ifdef CONFIG_CIFS_XATTR
- int xid;
+ unsigned int xid;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *pTcon;
struct super_block *sb;
char *full_path = NULL;
@@ -67,7 +60,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
return PTR_ERR(tlink);
pTcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
@@ -75,12 +68,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
goto remove_ea_exit;
}
if (ea_name == NULL) {
- cFYI(1, "Null xattr names not supported");
- } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
- && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
- cFYI(1,
- "illegal xattr request %s (only user namespace supported)",
- ea_name);
+ cifs_dbg(FYI, "Null xattr names not supported\n");
+ } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+ && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) {
+ cifs_dbg(FYI,
+ "illegal xattr request %s (only user namespace supported)\n",
+ ea_name);
/* BB what if no namespace prefix? */
/* Should we just pass them to server, except for
system and perhaps security prefixes? */
@@ -88,14 +81,16 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto remove_ea_exit;
- ea_name += 5; /* skip past user. prefix */
- rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL,
- (__u16)0, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
+ if (pTcon->ses->server->ops->set_EA)
+ rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+ full_path, ea_name, NULL, (__u16)0,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
}
remove_ea_exit:
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
#endif
return rc;
@@ -106,10 +101,10 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
{
int rc = -EOPNOTSUPP;
#ifdef CONFIG_CIFS_XATTR
- int xid;
+ unsigned int xid;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *pTcon;
struct super_block *sb;
char *full_path;
@@ -127,7 +122,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
return PTR_ERR(tlink);
pTcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
@@ -141,31 +136,59 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
search server for EAs or streams to
returns as xattrs */
if (value_size > MAX_EA_VALUE_SIZE) {
- cFYI(1, "size of EA value too large");
+ cifs_dbg(FYI, "size of EA value too large\n");
rc = -EOPNOTSUPP;
goto set_ea_exit;
}
if (ea_name == NULL) {
- cFYI(1, "Null xattr names not supported");
- } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+ cifs_dbg(FYI, "Null xattr names not supported\n");
+ } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+ == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto set_ea_exit;
if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
- cFYI(1, "attempt to set cifs inode metadata");
+ cifs_dbg(FYI, "attempt to set cifs inode metadata\n");
- ea_name += 5; /* skip past user. prefix */
- rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
- (__u16)value_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
+ if (pTcon->ses->server->ops->set_EA)
+ rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+ full_path, ea_name, ea_value, (__u16)value_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
+ == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto set_ea_exit;
- ea_name += 4; /* skip past os2. prefix */
- rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
- (__u16)value_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
+ if (pTcon->ses->server->ops->set_EA)
+ rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+ full_path, ea_name, ea_value, (__u16)value_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
+ strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+#ifdef CONFIG_CIFS_ACL
+ struct cifs_ntsd *pacl;
+ pacl = kmalloc(value_size, GFP_KERNEL);
+ if (!pacl) {
+ rc = -ENOMEM;
+ } else {
+ memcpy(pacl, ea_value, value_size);
+ if (pTcon->ses->server->ops->set_acl)
+ rc = pTcon->ses->server->ops->set_acl(pacl,
+ value_size, direntry->d_inode,
+ full_path, CIFS_ACL_DACL);
+ else
+ rc = -EOPNOTSUPP;
+ if (rc == 0) /* force revalidate of the inode */
+ CIFS_I(direntry->d_inode)->time = 0;
+ kfree(pacl);
+ }
+#else
+ cifs_dbg(FYI, "Set CIFS ACL not supported yet\n");
+#endif /* CONFIG_CIFS_ACL */
} else {
int temp;
temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -178,9 +201,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
ACL_TYPE_ACCESS, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- cFYI(1, "set POSIX ACL rc %d", rc);
+ cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc);
#else
- cFYI(1, "set POSIX ACL not supported");
+ cifs_dbg(FYI, "set POSIX ACL not supported\n");
#endif
} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -191,13 +214,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
ACL_TYPE_DEFAULT, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- cFYI(1, "set POSIX default ACL rc %d", rc);
+ cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc);
#else
- cFYI(1, "set default POSIX ACL not supported");
+ cifs_dbg(FYI, "set default POSIX ACL not supported\n");
#endif
} else {
- cFYI(1, "illegal xattr request %s (only user namespace"
- " supported)", ea_name);
+ cifs_dbg(FYI, "illegal xattr request %s (only user namespace supported)\n",
+ ea_name);
/* BB what if no namespace prefix? */
/* Should we just pass them to server, except for
system and perhaps security prefixes? */
@@ -206,7 +229,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
set_ea_exit:
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
#endif
return rc;
@@ -217,10 +240,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
{
ssize_t rc = -EOPNOTSUPP;
#ifdef CONFIG_CIFS_XATTR
- int xid;
+ unsigned int xid;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *pTcon;
struct super_block *sb;
char *full_path;
@@ -238,7 +261,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
return PTR_ERR(tlink);
pTcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
@@ -248,27 +271,32 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
/* return dos attributes as pseudo xattr */
/* return alt name if available as pseudo attr */
if (ea_name == NULL) {
- cFYI(1, "Null xattr names not supported");
- } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
+ cifs_dbg(FYI, "Null xattr names not supported\n");
+ } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
+ == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto get_ea_exit;
if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
- cFYI(1, "attempt to query cifs inode metadata");
+ cifs_dbg(FYI, "attempt to query cifs inode metadata\n");
/* revalidate/getattr then populate from inode */
} /* BB add else when above is implemented */
- ea_name += 5; /* skip past user. prefix */
- rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
- buf_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
+ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
+ if (pTcon->ses->server->ops->query_all_EAs)
+ rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+ full_path, ea_name, ea_value, buf_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto get_ea_exit;
- ea_name += 4; /* skip past os2. prefix */
- rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
- buf_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
+ if (pTcon->ses->server->ops->query_all_EAs)
+ rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+ full_path, ea_name, ea_value, buf_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
} else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
#ifdef CONFIG_CIFS_POSIX
@@ -279,7 +307,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
#else
- cFYI(1, "Query POSIX ACL not supported yet");
+ cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -291,7 +319,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
#else
- cFYI(1, "Query POSIX default ACL not supported yet");
+ cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
@@ -299,12 +327,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
u32 acllen;
struct cifs_ntsd *pacl;
- pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
- full_path, &acllen);
+ if (pTcon->ses->server->ops->get_acl == NULL)
+ goto get_ea_exit; /* rc already EOPNOTSUPP */
+
+ pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
+ direntry->d_inode, full_path, &acllen);
if (IS_ERR(pacl)) {
rc = PTR_ERR(pacl);
- cERROR(1, "%s: error %zd getting sec desc",
- __func__, rc);
+ cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
+ __func__, rc);
} else {
if (ea_value) {
if (acllen > buf_size)
@@ -316,18 +347,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
kfree(pacl);
}
#else
- cFYI(1, "Query CIFS ACL not supported yet");
+ cifs_dbg(FYI, "Query CIFS ACL not supported yet\n");
#endif /* CONFIG_CIFS_ACL */
} else if (strncmp(ea_name,
- CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
- cFYI(1, "Trusted xattr namespace not supported yet");
+ XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
+ cifs_dbg(FYI, "Trusted xattr namespace not supported yet\n");
} else if (strncmp(ea_name,
- CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
- cFYI(1, "Security xattr namespace not supported yet");
+ XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
+ cifs_dbg(FYI, "Security xattr namespace not supported yet\n");
} else
- cFYI(1,
- "illegal xattr request %s (only user namespace supported)",
- ea_name);
+ cifs_dbg(FYI,
+ "illegal xattr request %s (only user namespace supported)\n",
+ ea_name);
/* We could add an additional check for streams ie
if proc/fs/cifs/streamstoxattr is set then
@@ -339,7 +370,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
get_ea_exit:
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
#endif
return rc;
@@ -349,10 +380,10 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
{
ssize_t rc = -EOPNOTSUPP;
#ifdef CONFIG_CIFS_XATTR
- int xid;
+ unsigned int xid;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
- struct cifsTconInfo *pTcon;
+ struct cifs_tcon *pTcon;
struct super_block *sb;
char *full_path;
@@ -373,7 +404,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
return PTR_ERR(tlink);
pTcon = tlink_tcon(tlink);
- xid = GetXid();
+ xid = get_xid();
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
@@ -386,14 +417,15 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
/* if proc/fs/cifs/streamstoxattr is set then
search server for EAs or streams to
returns as xattrs */
- rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
- buf_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (pTcon->ses->server->ops->query_all_EAs)
+ rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+ full_path, NULL, data, buf_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
list_ea_exit:
kfree(full_path);
- FreeXid(xid);
+ free_xid(xid);
cifs_put_tlink(tlink);
#endif
return rc;
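
The xattr hunks above stop calling the SMB1 helpers (CIFSSMBSetEA, CIFSSMBQAllEAs, get_cifs_acl) directly and instead dispatch through the per-server operations table, falling back to "not supported" when a dialect does not provide the hook. A self-contained sketch of that dispatch pattern, with names invented for illustration (the real table is defined in the cifs headers, not here), might look like:

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct demo_server_ops {
	/* optional hook: NULL when the dialect cannot set EAs */
	int (*set_ea)(const char *name, const void *value, size_t len);
};

static int demo_smb1_set_ea(const char *name, const void *value, size_t len)
{
	printf("setting EA %s (%zu bytes)\n", name, len);
	return 0;
}

static int demo_set_ea(const struct demo_server_ops *ops,
		       const char *name, const void *value, size_t len)
{
	if (!ops->set_ea)
		return -EOPNOTSUPP;	/* degrade gracefully, as above */
	return ops->set_ea(name, value, len);
}

int main(void)
{
	struct demo_server_ops smb1 = { .set_ea = demo_smb1_set_ea };
	struct demo_server_ops other = { .set_ea = NULL };

	printf("smb1 rc=%d\n", demo_set_ea(&smb1, "user.test", "x", 1));
	printf("other rc=%d\n", demo_set_ea(&other, "user.test", "x", 1));
	return 0;
}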
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 69015787618..1da168c61d3 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -33,7 +33,7 @@ void coda_cache_enter(struct inode *inode, int mask)
spin_lock(&cii->c_lock);
cii->c_cached_epoch = atomic_read(&permission_epoch);
- if (cii->c_uid != current_fsuid()) {
+ if (!uid_eq(cii->c_uid, current_fsuid())) {
cii->c_uid = current_fsuid();
cii->c_cached_perm = mask;
} else
@@ -65,7 +65,7 @@ int coda_cache_check(struct inode *inode, int mask)
spin_lock(&cii->c_lock);
hit = (mask & cii->c_cached_perm) == mask &&
- cii->c_uid == current_fsuid() &&
+ uid_eq(cii->c_uid, current_fsuid()) &&
cii->c_cached_epoch == atomic_read(&permission_epoch);
spin_unlock(&cii->c_lock);
@@ -89,17 +89,13 @@ int coda_cache_check(struct inode *inode, int mask)
/* this won't do any harm: just flag all children */
static void coda_flag_children(struct dentry *parent, int flag)
{
- struct list_head *child;
struct dentry *de;
spin_lock(&parent->d_lock);
- list_for_each(child, &parent->d_subdirs)
- {
- de = list_entry(child, struct dentry, d_u.d_child);
+ list_for_each_entry(de, &parent->d_subdirs, d_u.d_child) {
/* don't know what to do with negative dentries */
- if ( ! de->d_inode )
- continue;
- coda_flag_inode(de->d_inode, flag);
+ if (de->d_inode )
+ coda_flag_inode(de->d_inode, flag);
}
spin_unlock(&parent->d_lock);
return;
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 6475877b076..7740b1c871c 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -88,24 +88,21 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
- link the two up if this is needed
- fill in the attributes
*/
-int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb)
+struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb)
{
struct coda_vattr attr;
+ struct inode *inode;
int error;
/* We get inode numbers from Venus -- see venus source */
error = venus_getattr(sb, fid, &attr);
- if ( error ) {
- *inode = NULL;
- return error;
- }
-
- *inode = coda_iget(sb, fid, &attr);
- if ( IS_ERR(*inode) ) {
- printk("coda_cnode_make: coda_iget failed\n");
- return PTR_ERR(*inode);
- }
- return 0;
+ if (error)
+ return ERR_PTR(error);
+
+ inode = coda_iget(sb, fid, &attr);
+ if (IS_ERR(inode))
+ pr_warn("%s: coda_iget failed\n", __func__);
+ return inode;
}
@@ -140,7 +137,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
unsigned long hash = coda_f2i(fid);
if ( !sb ) {
- printk("coda_fid_to_inode: no sb!\n");
+ pr_warn("%s: no sb!\n", __func__);
return NULL;
}
@@ -156,19 +153,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
}
/* the CONTROL inode is made without asking attributes from Venus */
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb)
+struct inode *coda_cnode_makectl(struct super_block *sb)
{
- int error = -ENOMEM;
-
- *inode = new_inode(sb);
- if (*inode) {
- (*inode)->i_ino = CTL_INO;
- (*inode)->i_op = &coda_ioctl_inode_operations;
- (*inode)->i_fop = &coda_ioctl_operations;
- (*inode)->i_mode = 0444;
- error = 0;
+ struct inode *inode = new_inode(sb);
+ if (inode) {
+ inode->i_ino = CTL_INO;
+ inode->i_op = &coda_ioctl_inode_operations;
+ inode->i_fop = &coda_ioctl_operations;
+ inode->i_mode = 0444;
+ return inode;
}
-
- return error;
+ return ERR_PTR(-ENOMEM);
}
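
coda_cnode_make() and coda_cnode_makectl() above move from the "int return plus struct inode ** out-parameter" style to the standard ERR_PTR() convention, where a single returned pointer encodes either the object or a negative errno. A minimal kernel-style sketch of the idiom, with illustrative names:

#include <linux/err.h>
#include <linux/fs.h>

/* producer: return the object, or an errno encoded in the pointer */
static struct inode *demo_make(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	return inode;
}

/* consumer: IS_ERR()/PTR_ERR() unwrap the errno again */
static int demo_use(struct super_block *sb)
{
	struct inode *inode = demo_make(sb);

	if (IS_ERR(inode))
		return PTR_ERR(inode);
	iput(inode);
	return 0;
}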
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index e35071b1de0..c6407521321 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -25,7 +25,7 @@ struct coda_inode_info {
u_short c_flags; /* flags (see below) */
unsigned int c_mapcount; /* nr of times this inode is mapped */
unsigned int c_cached_epoch; /* epoch for cached permissions */
- vuid_t c_uid; /* fsuid for cached permissions */
+ kuid_t c_uid; /* fsuid for cached permissions */
unsigned int c_cached_perm; /* cached access permissions */
spinlock_t c_lock;
struct inode vfs_inode;
@@ -49,9 +49,9 @@ struct coda_file_info {
#define C_DYING 0x4 /* from venus (which died) */
#define C_PURGE 0x8
-int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
+struct inode *coda_cnode_make(struct CodaFid *, struct super_block *);
struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
+struct inode *coda_cnode_makectl(struct super_block *sb);
struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 6b443ff43a1..381c993b142 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -10,8 +10,8 @@ extern int coda_hard;
extern int coda_fake_statfs;
void coda_destroy_inodecache(void);
-int coda_init_inodecache(void);
-int coda_fsync(struct file *coda_file, int datasync);
+int __init coda_init_inodecache(void);
+int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync);
void coda_sysctl_init(void);
void coda_sysctl_clean(void);
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2bdbcc11b37..2849f41e72a 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -100,11 +100,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
if (attr->va_mode != (u_short) -1)
inode->i_mode = attr->va_mode | inode_type;
if (attr->va_uid != -1)
- inode->i_uid = (uid_t) attr->va_uid;
+ inode->i_uid = make_kuid(&init_user_ns, (uid_t) attr->va_uid);
if (attr->va_gid != -1)
- inode->i_gid = (gid_t) attr->va_gid;
+ inode->i_gid = make_kgid(&init_user_ns, (gid_t) attr->va_gid);
if (attr->va_nlink != -1)
- inode->i_nlink = attr->va_nlink;
+ set_nlink(inode, attr->va_nlink);
if (attr->va_size != -1)
inode->i_size = attr->va_size;
if (attr->va_size != -1)
@@ -171,10 +171,10 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr)
vattr->va_mode = iattr->ia_mode;
}
if ( valid & ATTR_UID ) {
- vattr->va_uid = (vuid_t) iattr->ia_uid;
+ vattr->va_uid = (vuid_t) from_kuid(&init_user_ns, iattr->ia_uid);
}
if ( valid & ATTR_GID ) {
- vattr->va_gid = (vgid_t) iattr->ia_gid;
+ vattr->va_gid = (vgid_t) from_kgid(&init_user_ns, iattr->ia_gid);
}
if ( valid & ATTR_SIZE ) {
vattr->va_size = iattr->ia_size;
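
The Coda hunks above convert cached and on-the-wire uids to the kuid_t scheme: in-kernel state holds kuid_t, comparisons go through uid_eq(), and conversion happens only at the Venus/userspace boundary via make_kuid()/from_kuid(). A hedged kernel-style sketch of the idiom (a fragment, not a standalone module):

#include <linux/fs.h>
#include <linux/cred.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

/* kuid_t is opaque: compare with uid_eq(), never with == */
static bool demo_uid_matches(kuid_t cached)
{
	return uid_eq(cached, current_fsuid());
}

/* wire/userspace uid -> kernel-internal kuid_t, at the boundary only */
static void demo_uid_from_venus(struct inode *inode, uid_t wire_uid)
{
	inode->i_uid = make_kuid(&init_user_ns, wire_uid);
}

/* kernel-internal kuid_t -> value to hand back to Venus */
static uid_t demo_uid_to_venus(const struct inode *inode)
{
	return from_kuid(&init_user_ns, inode->i_uid);
}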
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index 9b0c5323890..d42b725b1d2 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -12,6 +12,12 @@
#ifndef _LINUX_CODA_FS
#define _LINUX_CODA_FS
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/mm.h>
@@ -39,8 +45,8 @@ extern const struct file_operations coda_ioctl_operations;
/* operations shared over more than one file */
int coda_open(struct inode *i, struct file *f);
int coda_release(struct inode *i, struct file *f);
-int coda_permission(struct inode *inode, int mask, unsigned int flags);
-int coda_revalidate_inode(struct dentry *);
+int coda_permission(struct inode *inode, int mask);
+int coda_revalidate_inode(struct inode *);
int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int coda_setattr(struct dentry *, struct iattr *);
@@ -59,12 +65,11 @@ void coda_sysctl_clean(void);
#define CODA_ALLOC(ptr, cast, size) do { \
if (size < PAGE_SIZE) \
- ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
+ ptr = kzalloc((unsigned long) size, GFP_KERNEL); \
else \
- ptr = (cast)vmalloc((unsigned long) size); \
+ ptr = (cast)vzalloc((unsigned long) size); \
if (!ptr) \
- printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
- else memset( ptr, 0, size ); \
+ pr_warn("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
} while (0)
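
The pr_fmt() definition added to coda_linux.h above is what lets the later hunks drop the explicit "Coda:"/"coda:" prefixes from their message strings: pr_fmt() is expanded at every pr_warn()/pr_info() call site, so each message picks up a "coda: " (KBUILD_MODNAME) prefix automatically. Sketch of the effect (kernel fragment; KBUILD_MODNAME is supplied by the build system):

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt	/* define before printk.h */
#include <linux/printk.h>

static void demo_msg(void)
{
	pr_warn("Bye bye.\n");	/* logged as "coda: Bye bye." */
}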
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2b8dae4d121..cd8a63238b1 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -30,28 +30,27 @@
#include "coda_int.h"
/* dir inode-ops */
-static int coda_create(struct inode *dir, struct dentry *new, int mode, struct nameidata *nd);
-static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd);
+static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, bool excl);
+static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags);
static int coda_link(struct dentry *old_dentry, struct inode *dir_inode,
struct dentry *entry);
static int coda_unlink(struct inode *dir_inode, struct dentry *entry);
static int coda_symlink(struct inode *dir_inode, struct dentry *entry,
const char *symname);
-static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, int mode);
+static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode);
static int coda_rmdir(struct inode *dir_inode, struct dentry *entry);
static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
struct inode *new_inode, struct dentry *new_dentry);
/* dir file-ops */
-static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
+static int coda_readdir(struct file *file, struct dir_context *ctx);
/* dentry ops */
-static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd);
+static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
static int coda_dentry_delete(const struct dentry *);
/* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
- filldir_t filldir);
+static int coda_venus_readdir(struct file *, struct dir_context *);
/* same as fs/bad_inode.c */
static int coda_return_EIO(void)
@@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations =
const struct file_operations coda_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = coda_readdir,
+ .iterate = coda_readdir,
.open = coda_open,
.release = coda_release,
.fsync = coda_fsync,
@@ -94,49 +93,46 @@ const struct file_operations coda_dir_operations = {
/* inode operations for directories */
/* access routines: lookup, readlink, permission */
-static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd)
+static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags)
{
- struct inode *inode = NULL;
- struct CodaFid resfid = { { 0, } };
- int type = 0;
- int error = 0;
+ struct super_block *sb = dir->i_sb;
const char *name = entry->d_name.name;
size_t length = entry->d_name.len;
+ struct inode *inode;
+ int type = 0;
if (length > CODA_MAXNAMLEN) {
- printk(KERN_ERR "name too long: lookup, %s (%*s)\n",
+ pr_err("name too long: lookup, %s (%*s)\n",
coda_i2s(dir), (int)length, name);
return ERR_PTR(-ENAMETOOLONG);
}
/* control object, create inode on the fly */
if (coda_isroot(dir) && coda_iscontrol(name, length)) {
- error = coda_cnode_makectl(&inode, dir->i_sb);
+ inode = coda_cnode_makectl(sb);
type = CODA_NOCACHE;
- goto exit;
+ } else {
+ struct CodaFid fid = { { 0, } };
+ int error = venus_lookup(sb, coda_i2f(dir), name, length,
+ &type, &fid);
+ inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error);
}
- error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
- &type, &resfid);
- if (!error)
- error = coda_cnode_make(&inode, &resfid, dir->i_sb);
-
- if (error && error != -ENOENT)
- return ERR_PTR(error);
-
-exit:
- if (inode && (type & CODA_NOCACHE))
+ if (!IS_ERR(inode) && (type & CODA_NOCACHE))
coda_flag_inode(inode, C_VATTR | C_PURGE);
+ if (inode == ERR_PTR(-ENOENT))
+ inode = NULL;
+
return d_splice_alias(inode, entry);
}
-int coda_permission(struct inode *inode, int mask, unsigned int flags)
+int coda_permission(struct inode *inode, int mask)
{
int error;
- if (flags & IPERM_FLAG_RCU)
+ if (mask & MAY_NOT_BLOCK)
return -ECHILD;
mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
@@ -191,7 +187,7 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
}
/* creation routines: create, mknod, mkdir, link, symlink */
-static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
+static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool excl)
{
int error;
const char *name=de->d_name.name;
@@ -223,7 +219,7 @@ err_out:
return error;
}
-static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
+static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode)
{
struct inode *inode;
struct coda_vattr attrs;
@@ -340,7 +336,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
if (!error) {
/* VFS may delete the child */
if (de->d_inode)
- de->d_inode->i_nlink = 0;
+ clear_nlink(de->d_inode);
/* fix the link count of the parent */
coda_dir_drop_nlink(dir);
@@ -381,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
/* file operations for directories */
-static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
+static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
{
struct coda_file_info *cfi;
struct file *host_file;
@@ -391,33 +387,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (!host_file->f_op)
- return -ENOTDIR;
-
- if (host_file->f_op->readdir)
- {
- /* potemkin case: we were handed a directory inode.
- * We can't use vfs_readdir because we have to keep the file
- * position in sync between the coda_file and the host_file.
- * and as such we need grab the inode mutex. */
- struct inode *host_inode = host_file->f_path.dentry->d_inode;
-
+ if (host_file->f_op->iterate) {
+ struct inode *host_inode = file_inode(host_file);
mutex_lock(&host_inode->i_mutex);
- host_file->f_pos = coda_file->f_pos;
-
ret = -ENOENT;
if (!IS_DEADDIR(host_inode)) {
- ret = host_file->f_op->readdir(host_file, buf, filldir);
+ ret = host_file->f_op->iterate(host_file, ctx);
file_accessed(host_file);
}
-
- coda_file->f_pos = host_file->f_pos;
mutex_unlock(&host_inode->i_mutex);
+ return ret;
}
- else /* Venus: we must read Venus dirents from a file */
- ret = coda_venus_readdir(coda_file, buf, filldir);
-
- return ret;
+ /* Venus: we must read Venus dirents from a file */
+ return coda_venus_readdir(coda_file, ctx);
}
static inline unsigned int CDT2DT(unsigned char cdt)
@@ -440,17 +422,14 @@ static inline unsigned int CDT2DT(unsigned char cdt)
}
/* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
- filldir_t filldir)
+static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
{
- int result = 0; /* # of entries returned */
struct coda_file_info *cfi;
struct coda_inode_info *cii;
struct file *host_file;
struct dentry *de;
struct venus_dirent *vdir;
- unsigned long vdir_size =
- (unsigned long)(&((struct venus_dirent *)0)->d_name);
+ unsigned long vdir_size = offsetof(struct venus_dirent, d_name);
unsigned int type;
struct qstr name;
ino_t ino;
@@ -466,42 +445,31 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
if (!vdir) return -ENOMEM;
- if (coda_file->f_pos == 0) {
- ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR);
- if (ret < 0)
- goto out;
- result++;
- coda_file->f_pos++;
- }
- if (coda_file->f_pos == 1) {
- ret = filldir(buf, "..", 2, 1, de->d_parent->d_inode->i_ino, DT_DIR);
- if (ret < 0)
- goto out;
- result++;
- coda_file->f_pos++;
- }
+ if (!dir_emit_dots(coda_file, ctx))
+ goto out;
+
while (1) {
/* read entries from the directory file */
- ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir,
+ ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
sizeof(*vdir));
if (ret < 0) {
- printk(KERN_ERR "coda readdir: read dir %s failed %d\n",
- coda_f2s(&cii->c_fid), ret);
+ pr_err("%s: read dir %s failed %d\n",
+ __func__, coda_f2s(&cii->c_fid), ret);
break;
}
if (ret == 0) break; /* end of directory file reached */
/* catch truncated reads */
if (ret < vdir_size || ret < vdir_size + vdir->d_namlen) {
- printk(KERN_ERR "coda readdir: short read on %s\n",
- coda_f2s(&cii->c_fid));
+ pr_err("%s: short read on %s\n",
+ __func__, coda_f2s(&cii->c_fid));
ret = -EBADF;
break;
}
/* validate whether the directory file actually makes sense */
if (vdir->d_reclen < vdir_size + vdir->d_namlen) {
- printk(KERN_ERR "coda readdir: invalid dir %s\n",
- coda_f2s(&cii->c_fid));
+ pr_err("%s: invalid dir %s\n",
+ __func__, coda_f2s(&cii->c_fid));
ret = -EBADF;
break;
}
@@ -511,41 +479,32 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
/* Make sure we skip '.' and '..', we already got those */
if (name.name[0] == '.' && (name.len == 1 ||
- (vdir->d_name[1] == '.' && name.len == 2)))
+ (name.name[1] == '.' && name.len == 2)))
vdir->d_fileno = name.len = 0;
/* skip null entries */
if (vdir->d_fileno && name.len) {
- /* try to look up this entry in the dcache, that way
- * userspace doesn't have to worry about breaking
- * getcwd by having mismatched inode numbers for
- * internal volume mountpoints. */
- ino = find_inode_number(de, &name);
- if (!ino) ino = vdir->d_fileno;
-
+ ino = vdir->d_fileno;
type = CDT2DT(vdir->d_type);
- ret = filldir(buf, name.name, name.len,
- coda_file->f_pos, ino, type);
- /* failure means no space for filling in this round */
- if (ret < 0) break;
- result++;
+ if (!dir_emit(ctx, name.name, name.len, ino, type))
+ break;
}
/* we'll always have progress because d_reclen is unsigned and
* we've already established it is non-zero. */
- coda_file->f_pos += vdir->d_reclen;
+ ctx->pos += vdir->d_reclen;
}
out:
kfree(vdir);
- return result ? result : ret;
+ return 0;
}
/* called when a cache lookup succeeds */
-static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
+static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
{
struct inode *inode;
struct coda_inode_info *cii;
- if (nd->flags & LOOKUP_RCU)
+ if (flags & LOOKUP_RCU)
return -ECHILD;
inode = de->d_inode;
@@ -564,7 +523,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
if (cii->c_flags & C_FLUSH)
coda_flag_inode_children(inode, C_FLUSH);
- if (de->d_count > 1)
+ if (d_count(de) > 1)
/* pretend it's valid, but don't change the flags */
goto out;
@@ -604,13 +563,12 @@ static int coda_dentry_delete(const struct dentry * dentry)
* cache manager Venus issues a downcall to the kernel when this
* happens
*/
-int coda_revalidate_inode(struct dentry *dentry)
+int coda_revalidate_inode(struct inode *inode)
{
struct coda_vattr attr;
int error;
int old_mode;
ino_t old_ino;
- struct inode *inode = dentry->d_inode;
struct coda_inode_info *cii = ITOC(inode);
if (!cii->c_flags)
@@ -631,8 +589,8 @@ int coda_revalidate_inode(struct dentry *dentry)
coda_vattr_to_iattr(inode, &attr);
if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) {
- printk("Coda: inode %ld, fid %s changed type!\n",
- inode->i_ino, coda_f2s(&(cii->c_fid)));
+ pr_warn("inode %ld, fid %s changed type!\n",
+ inode->i_ino, coda_f2s(&(cii->c_fid)));
}
/* the following can happen when a local fid is replaced
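
The readdir conversion above follows the newer VFS directory-iteration interface: the filesystem implements ->iterate() instead of ->readdir(), emits entries through dir_emit()/dir_emit_dots(), and keeps its position in ctx->pos rather than file->f_pos. A minimal sketch of the shape, as an invented one-entry directory:

#include <linux/fs.h>

static int demo_iterate(struct file *file, struct dir_context *ctx)
{
	/* "." and ".." occupy positions 0 and 1 */
	if (!dir_emit_dots(file, ctx))
		return 0;

	if (ctx->pos == 2) {
		/* dir_emit() returns false when the caller's buffer is full */
		if (!dir_emit(ctx, "hello", 5, 123, DT_REG))
			return 0;
		ctx->pos++;
	}
	return 0;
}

static const struct file_operations demo_dir_fops = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate	= demo_iterate,
};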
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 0433057be33..9e83b779021 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -36,7 +36,7 @@ coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *p
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (!host_file->f_op || !host_file->f_op->read)
+ if (!host_file->f_op->read)
return -EINVAL;
return host_file->f_op->read(host_file, buf, count, ppos);
@@ -66,7 +66,7 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
static ssize_t
coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos)
{
- struct inode *host_inode, *coda_inode = coda_file->f_path.dentry->d_inode;
+ struct inode *host_inode, *coda_inode = file_inode(coda_file);
struct coda_file_info *cfi;
struct file *host_file;
ssize_t ret;
@@ -75,10 +75,11 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (!host_file->f_op || !host_file->f_op->write)
+ if (!host_file->f_op->write)
return -EINVAL;
- host_inode = host_file->f_path.dentry->d_inode;
+ host_inode = file_inode(host_file);
+ file_start_write(host_file);
mutex_lock(&coda_inode->i_mutex);
ret = host_file->f_op->write(host_file, buf, count, ppos);
@@ -87,6 +88,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
mutex_unlock(&coda_inode->i_mutex);
+ file_end_write(host_file);
return ret;
}
@@ -103,11 +105,11 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (!host_file->f_op || !host_file->f_op->mmap)
+ if (!host_file->f_op->mmap)
return -ENODEV;
- coda_inode = coda_file->f_path.dentry->d_inode;
- host_inode = host_file->f_path.dentry->d_inode;
+ coda_inode = file_inode(coda_file);
+ host_inode = file_inode(host_file);
cii = ITOC(coda_inode);
spin_lock(&cii->c_lock);
@@ -178,7 +180,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
coda_flags, coda_file->f_cred->fsuid);
- host_inode = cfi->cfi_container->f_path.dentry->d_inode;
+ host_inode = file_inode(cfi->cfi_container);
cii = ITOC(coda_inode);
/* did we mmap this file? */
@@ -199,10 +201,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
return 0;
}
-int coda_fsync(struct file *coda_file, int datasync)
+int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
{
struct file *host_file;
- struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
+ struct inode *coda_inode = file_inode(coda_file);
struct coda_file_info *cfi;
int err;
@@ -210,6 +212,11 @@ int coda_fsync(struct file *coda_file, int datasync)
S_ISLNK(coda_inode->i_mode)))
return -EINVAL;
+ err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end);
+ if (err)
+ return err;
+ mutex_lock(&coda_inode->i_mutex);
+
cfi = CODA_FTOC(coda_file);
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
@@ -217,6 +224,7 @@ int coda_fsync(struct file *coda_file, int datasync)
err = vfs_fsync(host_file, datasync);
if (!err && !datasync)
err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
+ mutex_unlock(&coda_inode->i_mutex);
return err;
}
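
The coda_fsync() change above matches the newer ->fsync() prototype, which takes a byte range and leaves locking to the filesystem: the pagecache range is flushed first with filemap_write_and_wait_range(), then filesystem-private work runs under i_mutex. A hedged sketch of that shape (illustrative names):

#include <linux/fs.h>

static int demo_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file_inode(file);
	int err;

	/* flush dirty pagecache for the requested range first */
	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (err)
		return err;

	mutex_lock(&inode->i_mutex);
	/* ... filesystem-specific metadata flush would go here ... */
	mutex_unlock(&inode->i_mutex);
	return 0;
}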
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 871b2771546..fe3afb2de88 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -20,8 +20,8 @@
#include <linux/file.h>
#include <linux/vfs.h>
#include <linux/slab.h>
+#include <linux/pid_namespace.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/fs.h>
@@ -49,7 +49,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
return NULL;
memset(&ei->c_fid, 0, sizeof(struct CodaFid));
ei->c_flags = 0;
- ei->c_uid = 0;
+ ei->c_uid = GLOBAL_ROOT_UID;
ei->c_cached_perm = 0;
spin_lock_init(&ei->c_lock);
return &ei->vfs_inode;
@@ -58,7 +58,6 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
static void coda_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(coda_inode_cachep, ITOC(inode));
}
@@ -74,7 +73,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-int coda_init_inodecache(void)
+int __init coda_init_inodecache(void)
{
coda_inode_cachep = kmem_cache_create("coda_inode_cache",
sizeof(struct coda_inode_info),
@@ -87,11 +86,17 @@ int coda_init_inodecache(void)
void coda_destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(coda_inode_cachep);
}
static int coda_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_NOATIME;
return 0;
}
@@ -109,43 +114,41 @@ static const struct super_operations coda_super_operations =
static int get_device_index(struct coda_mount_data *data)
{
- struct file *file;
+ struct fd f;
struct inode *inode;
int idx;
- if(data == NULL) {
- printk("coda_read_super: Bad mount data\n");
+ if (data == NULL) {
+ pr_warn("%s: Bad mount data\n", __func__);
return -1;
}
- if(data->version != CODA_MOUNT_VERSION) {
- printk("coda_read_super: Bad mount version\n");
+ if (data->version != CODA_MOUNT_VERSION) {
+ pr_warn("%s: Bad mount version\n", __func__);
return -1;
}
- file = fget(data->fd);
- inode = NULL;
- if(file)
- inode = file->f_path.dentry->d_inode;
-
- if(!inode || !S_ISCHR(inode->i_mode) ||
- imajor(inode) != CODA_PSDEV_MAJOR) {
- if(file)
- fput(file);
-
- printk("coda_read_super: Bad file\n");
- return -1;
+ f = fdget(data->fd);
+ if (!f.file)
+ goto Ebadf;
+ inode = file_inode(f.file);
+ if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
+ fdput(f);
+ goto Ebadf;
}
idx = iminor(inode);
- fput(file);
+ fdput(f);
- if(idx < 0 || idx >= MAX_CODADEVS) {
- printk("coda_read_super: Bad minor number\n");
+ if (idx < 0 || idx >= MAX_CODADEVS) {
+ pr_warn("%s: Bad minor number\n", __func__);
return -1;
}
return idx;
+Ebadf:
+ pr_warn("%s: Bad file\n", __func__);
+ return -1;
}
static int coda_fill_super(struct super_block *sb, void *data, int silent)
@@ -156,25 +159,28 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
int error;
int idx;
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
+
idx = get_device_index((struct coda_mount_data *) data);
/* Ignore errors in data, for backward compatibility */
if(idx == -1)
idx = 0;
- printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
+ pr_info("%s: device index: %i\n", __func__, idx);
vc = &coda_comms[idx];
mutex_lock(&vc->vc_mutex);
if (!vc->vc_inuse) {
- printk("coda_read_super: No pseudo device\n");
+ pr_warn("%s: No pseudo device\n", __func__);
error = -EINVAL;
goto unlock_out;
}
if (vc->vc_sb) {
- printk("coda_read_super: Device already mounted\n");
+ pr_warn("%s: Device already mounted\n", __func__);
error = -EBUSY;
goto unlock_out;
}
@@ -198,22 +204,24 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
/* get root fid from Venus: this needs the root inode */
error = venus_rootfid(sb, &fid);
if ( error ) {
- printk("coda_read_super: coda_get_rootfid failed with %d\n",
- error);
+ pr_warn("%s: coda_get_rootfid failed with %d\n",
+ __func__, error);
goto error;
}
- printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid));
+ pr_info("%s: rootfid is %s\n", __func__, coda_f2s(&fid));
/* make root inode */
- error = coda_cnode_make(&root, &fid, sb);
- if ( error || !root ) {
- printk("Failure of coda_cnode_make for root: error %d\n", error);
- goto error;
+ root = coda_cnode_make(&fid, sb);
+ if (IS_ERR(root)) {
+ error = PTR_ERR(root);
+ pr_warn("Failure of coda_cnode_make for root: error %d\n",
+ error);
+ goto error;
}
- printk("coda_read_super: rootinode is %ld dev %s\n",
- root->i_ino, root->i_sb->s_id);
- sb->s_root = d_alloc_root(root);
+ pr_info("%s: rootinode is %ld dev %s\n",
+ __func__, root->i_ino, root->i_sb->s_id);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
error = -EINVAL;
goto error;
@@ -221,9 +229,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
return 0;
error:
- if (root)
- iput(root);
-
mutex_lock(&vc->vc_mutex);
bdi_destroy(&vc->bdi);
vc->vc_sb = NULL;
@@ -242,19 +247,19 @@ static void coda_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
mutex_unlock(&vcp->vc_mutex);
- printk("Coda: Bye bye.\n");
+ pr_info("Bye bye.\n");
}
static void coda_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
- end_writeback(inode);
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
coda_cache_clear_inode(inode);
}
int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- int err = coda_revalidate_inode(dentry);
+ int err = coda_revalidate_inode(dentry->d_inode);
if (!err)
generic_fillattr(dentry->d_inode, stat);
return err;
@@ -326,4 +331,5 @@ struct file_system_type coda_fs_type = {
.kill_sb = kill_anon_super,
.fs_flags = FS_BINARY_MOUNTDATA,
};
+MODULE_ALIAS_FS("coda");
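
get_device_index() above now uses the lightweight fdget()/fdput() helpers in place of fget()/fput(); when the fd table is not shared they can avoid taking a full reference on the struct file. A small sketch of the access pattern (illustrative name):

#include <linux/file.h>
#include <linux/fs.h>

static int demo_minor_from_fd(int ufd)
{
	struct fd f = fdget(ufd);
	int minor;

	if (!f.file)
		return -EBADF;		/* bad or closed descriptor */
	minor = iminor(file_inode(f.file));
	fdput(f);			/* drops the ref only if one was taken */
	return minor;
}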
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 6cbb3afb36d..3f5de96bbb5 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -24,7 +24,7 @@
#include "coda_linux.h"
/* pioctl ops */
-static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
+static int coda_ioctl_permission(struct inode *inode, int mask);
static long coda_pioctl(struct file *filp, unsigned int cmd,
unsigned long user_data);
@@ -41,10 +41,8 @@ const struct file_operations coda_ioctl_operations = {
};
/* the coda pioctl inode ops */
-static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
+static int coda_ioctl_permission(struct inode *inode, int mask)
{
- if (flags & IPERM_FLAG_RCU)
- return -ECHILD;
return (mask & MAY_EXEC) ? -EACCES : 0;
}
@@ -54,7 +52,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
struct path path;
int error;
struct PioctlData data;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct inode *target_inode = NULL;
struct coda_inode_info *cnp;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 8f616e0e252..5c1e4242368 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -37,8 +37,8 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/device.h>
+#include <linux/pid_namespace.h>
#include <asm/io.h>
-#include <asm/system.h>
#include <asm/poll.h>
#include <asm/uaccess.h>
@@ -114,14 +114,14 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
int size = sizeof(*dcbuf);
if ( nbytes < sizeof(struct coda_out_hdr) ) {
- printk("coda_downcall opc %d uniq %d, not enough!\n",
- hdr.opcode, hdr.unique);
+ pr_warn("coda_downcall opc %d uniq %d, not enough!\n",
+ hdr.opcode, hdr.unique);
count = nbytes;
goto out;
}
if ( nbytes > size ) {
- printk("Coda: downcall opc %d, uniq %d, too much!",
- hdr.opcode, hdr.unique);
+ pr_warn("downcall opc %d, uniq %d, too much!",
+ hdr.opcode, hdr.unique);
nbytes = size;
}
CODA_ALLOC(dcbuf, union outputArgs *, nbytes);
@@ -136,7 +136,8 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
CODA_FREE(dcbuf, nbytes);
if (error) {
- printk("psdev_write: coda_downcall error: %d\n", error);
+ pr_warn("%s: coda_downcall error: %d\n",
+ __func__, error);
retval = error;
goto out;
}
@@ -157,16 +158,17 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
mutex_unlock(&vcp->vc_mutex);
if (!req) {
- printk("psdev_write: msg (%d, %d) not found\n",
- hdr.opcode, hdr.unique);
+ pr_warn("%s: msg (%d, %d) not found\n",
+ __func__, hdr.opcode, hdr.unique);
retval = -ESRCH;
goto out;
}
/* move data into response buffer. */
if (req->uc_outSize < nbytes) {
- printk("psdev_write: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n",
- req->uc_outSize, (long)nbytes, hdr.opcode, hdr.unique);
+ pr_warn("%s: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n",
+ __func__, req->uc_outSize, (long)nbytes,
+ hdr.opcode, hdr.unique);
nbytes = req->uc_outSize; /* don't have more space! */
}
if (copy_from_user(req->uc_data, buf, nbytes)) {
@@ -240,8 +242,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
/* Move the input args into userspace */
count = req->uc_inSize;
if (nbytes < req->uc_inSize) {
- printk ("psdev_read: Venus read %ld bytes of %d in message\n",
- (long)nbytes, req->uc_inSize);
+ pr_warn("%s: Venus read %ld bytes of %d in message\n",
+ __func__, (long)nbytes, req->uc_inSize);
count = nbytes;
}
@@ -267,6 +269,12 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
struct venus_comm *vcp;
int idx, err;
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
+
+ if (current_user_ns() != &init_user_ns)
+ return -EINVAL;
+
idx = iminor(inode);
if (idx < 0 || idx >= MAX_CODADEVS)
return -ENODEV;
@@ -299,7 +307,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
struct upc_req *req, *tmp;
if (!vcp || !vcp->vc_inuse ) {
- printk("psdev_release: Not open.\n");
+ pr_warn("%s: Not open.\n", __func__);
return -1;
}
@@ -348,8 +356,8 @@ static int init_coda_psdev(void)
{
int i, err = 0;
if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) {
- printk(KERN_ERR "coda_psdev: unable to get major %d\n",
- CODA_PSDEV_MAJOR);
+ pr_err("%s: unable to get major %d\n",
+ __func__, CODA_PSDEV_MAJOR);
return -EIO;
}
coda_psdev_class = class_create(THIS_MODULE, "coda");
@@ -387,13 +395,13 @@ static int __init init_coda(void)
goto out2;
status = init_coda_psdev();
if ( status ) {
- printk("Problem (%d) in init_coda_psdev\n", status);
+ pr_warn("Problem (%d) in init_coda_psdev\n", status);
goto out1;
}
status = register_filesystem(&coda_fs_type);
if (status) {
- printk("coda: failed to register filesystem!\n");
+ pr_warn("failed to register filesystem!\n");
goto out;
}
return 0;
@@ -414,9 +422,8 @@ static void __exit exit_coda(void)
int err, i;
err = unregister_filesystem(&coda_fs_type);
- if ( err != 0 ) {
- printk("coda: failed to unregister filesystem\n");
- }
+ if (err != 0)
+ pr_warn("failed to unregister filesystem\n");
for (i = 0; i < MAX_CODADEVS; i++)
device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
class_destroy(coda_psdev_class);
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 06d27a41807..34218a8a28c 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -14,7 +14,7 @@
#ifdef CONFIG_SYSCTL
static struct ctl_table_header *fs_table_header;
-static ctl_table coda_table[] = {
+static struct ctl_table coda_table[] = {
{
.procname = "timeout",
.data = &coda_timeout,
@@ -39,7 +39,7 @@ static ctl_table coda_table[] = {
{}
};
-static ctl_table fs_table[] = {
+static struct ctl_table fs_table[] = {
{
.procname = "coda",
.mode = 0555,
@@ -61,4 +61,13 @@ void coda_sysctl_clean(void)
fs_table_header = NULL;
}
}
+
+#else
+void coda_sysctl_init(void)
+{
+}
+
+void coda_sysctl_clean(void)
+{
+}
#endif
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 9727e0c5257..21fcf8dcb9c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -14,7 +14,6 @@
* improvements to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>.
*/
-#include <asm/system.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/types.h>
@@ -51,9 +50,9 @@ static void *alloc_upcall(int opcode, int size)
return ERR_PTR(-ENOMEM);
inp->ih.opcode = opcode;
- inp->ih.pid = current->pid;
- inp->ih.pgid = task_pgrp_nr(current);
- inp->ih.uid = current_fsuid();
+ inp->ih.pid = task_pid_nr_ns(current, &init_pid_ns);
+ inp->ih.pgid = task_pgrp_nr_ns(current, &init_pid_ns);
+ inp->ih.uid = from_kuid(&init_user_ns, current_fsuid());
return (void*)inp;
}
@@ -158,7 +157,7 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid,
}
int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
- vuid_t uid)
+ kuid_t uid)
{
union inputArgs *inp;
union outputArgs *outp;
@@ -167,7 +166,7 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
insize = SIZE(release);
UPARG(CODA_CLOSE);
- inp->ih.uid = uid;
+ inp->ih.uid = from_kuid(&init_user_ns, uid);
inp->coda_close.VFid = *fid;
inp->coda_close.flags = flags;
@@ -509,8 +508,8 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
inp->coda_ioctl.data = (char *)(INSIZE(ioctl));
/* get the data out of user space */
- if ( copy_from_user((char*)inp + (long)inp->coda_ioctl.data,
- data->vi.in, data->vi.in_size) ) {
+ if (copy_from_user((char *)inp + (long)inp->coda_ioctl.data,
+ data->vi.in, data->vi.in_size)) {
error = -EINVAL;
goto exit;
}
@@ -519,8 +518,8 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
&outsize, inp);
if (error) {
- printk("coda_pioctl: Venus returns: %d for %s\n",
- error, coda_f2s(fid));
+ pr_warn("%s: Venus returns: %d for %s\n",
+ __func__, error, coda_f2s(fid));
goto exit;
}
@@ -676,7 +675,7 @@ static int coda_upcall(struct venus_comm *vcp,
mutex_lock(&vcp->vc_mutex);
if (!vcp->vc_inuse) {
- printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
+ pr_notice("Venus dead, not sending upcall\n");
error = -ENXIO;
goto exit;
}
@@ -726,7 +725,7 @@ static int coda_upcall(struct venus_comm *vcp,
error = -EINTR;
if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) {
- printk(KERN_WARNING "coda: Unexpected interruption.\n");
+ pr_warn("Unexpected interruption.\n");
goto exit;
}
@@ -736,7 +735,7 @@ static int coda_upcall(struct venus_comm *vcp,
/* Venus saw the upcall, make sure we can send interrupt signal */
if (!vcp->vc_inuse) {
- printk(KERN_INFO "coda: Venus dead, not sending signal.\n");
+ pr_info("Venus dead, not sending signal.\n");
goto exit;
}
diff --git a/fs/compat.c b/fs/compat.c
index 72fe6cda910..66d3d3c6b4b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -33,11 +33,9 @@
#include <linux/nfs4_mount.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
-#include <linux/module.h>
#include <linux/dirent.h>
#include <linux/fsnotify.h>
#include <linux/highuid.h>
-#include <linux/nfsd/syscall.h>
#include <linux/personality.h>
#include <linux/rwsem.h>
#include <linux/tsacct_kern.h>
@@ -46,10 +44,10 @@
#include <linux/signal.h>
#include <linux/poll.h>
#include <linux/mm.h>
-#include <linux/eventpoll.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -70,14 +68,12 @@ int compat_printk(const char *fmt, ...)
return ret;
}
-#include "read_write.h"
-
/*
* Not all architectures have sys_utime, so implement this in terms
* of sys_utimes.
*/
-asmlinkage long compat_sys_utime(const char __user *filename,
- struct compat_utimbuf __user *t)
+COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
+ struct compat_utimbuf __user *, t)
{
struct timespec tv[2];
@@ -91,13 +87,13 @@ asmlinkage long compat_sys_utime(const char __user *filename,
return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
}
-asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
+COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
{
struct timespec tv[2];
if (t) {
- if (get_compat_timespec(&tv[0], &t[0]) ||
- get_compat_timespec(&tv[1], &t[1]))
+ if (compat_get_timespec(&tv[0], &t[0]) ||
+ compat_get_timespec(&tv[1], &t[1]))
return -EFAULT;
if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
@@ -106,7 +102,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filena
return do_utimes(dfd, filename, t ? tv : NULL, flags);
}
-asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
{
struct timespec tv[2];
@@ -125,52 +121,46 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filena
return do_utimes(dfd, filename, t ? tv : NULL, 0);
}
-asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
{
return compat_sys_futimesat(AT_FDCWD, filename, t);
}
static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
- compat_ino_t ino = stat->ino;
- typeof(ubuf->st_uid) uid = 0;
- typeof(ubuf->st_gid) gid = 0;
- int err;
+ struct compat_stat tmp;
- SET_UID(uid, stat->uid);
- SET_GID(gid, stat->gid);
+ if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ return -EOVERFLOW;
- if ((u64) stat->size > MAX_NON_LFS ||
- !old_valid_dev(stat->dev) ||
- !old_valid_dev(stat->rdev))
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_ino = stat->ino;
+ if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
- if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
+ tmp.st_mode = stat->mode;
+ tmp.st_nlink = stat->nlink;
+ if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
-
- if (clear_user(ubuf, sizeof(*ubuf)))
- return -EFAULT;
-
- err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
- err |= __put_user(ino, &ubuf->st_ino);
- err |= __put_user(stat->mode, &ubuf->st_mode);
- err |= __put_user(stat->nlink, &ubuf->st_nlink);
- err |= __put_user(uid, &ubuf->st_uid);
- err |= __put_user(gid, &ubuf->st_gid);
- err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
- err |= __put_user(stat->size, &ubuf->st_size);
- err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
- err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
- err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
- err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
- err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
- err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
- err |= __put_user(stat->blksize, &ubuf->st_blksize);
- err |= __put_user(stat->blocks, &ubuf->st_blocks);
- return err;
+ SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+ tmp.st_rdev = old_encode_dev(stat->rdev);
+ if ((u64) stat->size > MAX_NON_LFS)
+ return -EOVERFLOW;
+ tmp.st_size = stat->size;
+ tmp.st_atime = stat->atime.tv_sec;
+ tmp.st_atime_nsec = stat->atime.tv_nsec;
+ tmp.st_mtime = stat->mtime.tv_sec;
+ tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+ tmp.st_ctime = stat->ctime.tv_sec;
+ tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+ tmp.st_blocks = stat->blocks;
+ tmp.st_blksize = stat->blksize;
+ return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
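
The rewritten cp_compat_stat() above replaces a chain of __put_user() calls with a single pattern: fill a zeroed kernel-side copy, range-check fields that may not fit the 32-bit layout, then push it out with one copy_to_user(). A minimal sketch of that pattern with hypothetical struct names:

	/* Sketch with invented types; the memset matters because padding in
	 * the compat struct would otherwise leak kernel stack to userspace. */
	#include <linux/types.h>
	#include <linux/string.h>
	#include <linux/errno.h>
	#include <linux/uaccess.h>

	struct foo        { unsigned long value; };
	struct compat_foo { u32 value; };

	static int cp_compat_foo(const struct foo *src,
				 struct compat_foo __user *ubuf)
	{
		struct compat_foo tmp;

		memset(&tmp, 0, sizeof(tmp));
		tmp.value = src->value;
		if (tmp.value != src->value)	/* truncated by the narrower field */
			return -EOVERFLOW;
		return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
	}
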
-asmlinkage long compat_sys_newstat(const char __user * filename,
- struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
+ struct compat_stat __user *, statbuf)
{
struct kstat stat;
int error;
@@ -181,8 +171,8 @@ asmlinkage long compat_sys_newstat(const char __user * filename,
return cp_compat_stat(&stat, statbuf);
}
-asmlinkage long compat_sys_newlstat(const char __user * filename,
- struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+ struct compat_stat __user *, statbuf)
{
struct kstat stat;
int error;
@@ -194,9 +184,9 @@ asmlinkage long compat_sys_newlstat(const char __user * filename,
}
#ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd,
- const char __user *filename,
- struct compat_stat __user *statbuf, int flag)
+COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
+ const char __user *, filename,
+ struct compat_stat __user *, statbuf, int, flag)
{
struct kstat stat;
int error;
@@ -208,8 +198,8 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd,
}
#endif
-asmlinkage long compat_sys_newfstat(unsigned int fd,
- struct compat_stat __user * statbuf)
+COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
+ struct compat_stat __user *, statbuf)
{
struct kstat stat;
int error = vfs_fstat(fd, &stat);
@@ -247,11 +237,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
__put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
__put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
__put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
- __put_user(0, &ubuf->f_spare[0]) ||
- __put_user(0, &ubuf->f_spare[1]) ||
- __put_user(0, &ubuf->f_spare[2]) ||
- __put_user(0, &ubuf->f_spare[3]) ||
- __put_user(0, &ubuf->f_spare[4]))
+ __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+ __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
return -EFAULT;
return 0;
}
@@ -260,7 +247,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
* The following statfs calls are copies of code from fs/statfs.c and
* should be checked against those from time to time
*/
-asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
{
struct kstatfs tmp;
int error = user_statfs(pathname, &tmp);
@@ -269,7 +256,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
return error;
}
-asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
{
struct kstatfs tmp;
int error = fd_statfs(fd, &tmp);
@@ -311,7 +298,7 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
return 0;
}
-asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
{
struct kstatfs tmp;
int error;
@@ -325,7 +312,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
return error;
}
-asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
{
struct kstatfs tmp;
int error;
@@ -344,18 +331,11 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
 * Given how simple this syscall is that approach is more maintainable
* than the various conversion hacks.
*/
-asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
+COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
{
- struct super_block *sb;
struct compat_ustat tmp;
struct kstatfs sbuf;
- int err;
-
- sb = user_get_super(new_decode_dev(dev));
- if (!sb)
- return -EINVAL;
- err = statfs_by_dentry(sb->s_root, &sbuf);
- drop_super(sb);
+ int err = vfs_ustat(new_decode_dev(dev), &sbuf);
if (err)
return err;
@@ -419,12 +399,28 @@ static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *u
}
#endif
-asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
- unsigned long arg)
+static unsigned int
+convert_fcntl_cmd(unsigned int cmd)
+{
+ switch (cmd) {
+ case F_GETLK64:
+ return F_GETLK;
+ case F_SETLK64:
+ return F_SETLK;
+ case F_SETLKW64:
+ return F_SETLKW;
+ }
+
+ return cmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
{
mm_segment_t old_fs;
struct flock f;
long ret;
+ unsigned int conv_cmd;
switch (cmd) {
case F_GETLK:
@@ -461,16 +457,18 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
case F_GETLK64:
case F_SETLK64:
case F_SETLKW64:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
ret = get_compat_flock64(&f, compat_ptr(arg));
if (ret != 0)
break;
old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK :
- ((cmd == F_SETLK64) ? F_SETLK : F_SETLKW),
- (unsigned long)&f);
+ conv_cmd = convert_fcntl_cmd(cmd);
+ ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
set_fs(old_fs);
- if (cmd == F_GETLK64 && ret == 0) {
+ if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
/* need to return lock information - see above for commentary */
if (f.l_start > COMPAT_LOFF_T_MAX)
ret = -EOVERFLOW;
@@ -488,16 +486,22 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
return ret;
}
-asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
- unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
{
- if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64))
+ switch (cmd) {
+ case F_GETLK64:
+ case F_SETLK64:
+ case F_SETLKW64:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
return -EINVAL;
+ }
return compat_sys_fcntl64(fd, cmd, arg);
}
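
The two fcntl hunks above route F_OFD_GETLK/F_OFD_SETLK/F_OFD_SETLKW through the same struct flock64 path as the old *64 commands, and reject them in compat_sys_fcntl(), which only handles the small flock layout. For context, a hedged userspace illustration of requesting an open-file-description lock (requires a libc that exposes F_OFD_SETLK):

	/* Userspace illustration, not kernel code: OFD locks are requested
	 * with fcntl() and a struct flock whose l_pid must be zero. */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	static int lock_whole_file(int fd)
	{
		struct flock fl;

		memset(&fl, 0, sizeof(fl));
		fl.l_type = F_WRLCK;
		fl.l_whence = SEEK_SET;	/* l_start = l_len = 0: whole file */
		fl.l_pid = 0;		/* mandatory for OFD locks */

		return fcntl(fd, F_OFD_SETLK, &fl);
	}
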
-asmlinkage long
-compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
+COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p)
{
long ret;
aio_context_t ctx64;
@@ -516,32 +520,24 @@ compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
return ret;
}
-asmlinkage long
-compat_sys_io_getevents(aio_context_t ctx_id,
- unsigned long min_nr,
- unsigned long nr,
- struct io_event __user *events,
- struct compat_timespec __user *timeout)
+COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
+ compat_long_t, min_nr,
+ compat_long_t, nr,
+ struct io_event __user *, events,
+ struct compat_timespec __user *, timeout)
{
- long ret;
struct timespec t;
struct timespec __user *ut = NULL;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_WRITE, events,
- nr * sizeof(struct io_event))))
- goto out;
if (timeout) {
- if (get_compat_timespec(&t, timeout))
- goto out;
+ if (compat_get_timespec(&t, timeout))
+ return -EFAULT;
ut = compat_alloc_user_space(sizeof(*ut));
if (copy_to_user(ut, &t, sizeof(t)) )
- goto out;
+ return -EFAULT;
}
- ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut);
-out:
- return ret;
+ return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
}
/* A write operation does a read from user space and vice versa */
@@ -576,6 +572,10 @@ ssize_t compat_rw_copy_check_uvector(int type,
}
*ret_pointer = iov;
+ ret = -EFAULT;
+ if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+ goto out;
+
/*
* Single unix specification:
* We should -EINVAL if an element length is not >= 0 and fitting an
@@ -597,7 +597,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
}
if (len < 0) /* size_t not fitting in compat_ssize_t .. */
goto out;
- if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+ if (type >= 0 &&
+ !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
ret = -EFAULT;
goto out;
}
@@ -632,8 +633,8 @@ copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *))
-asmlinkage long
-compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+ int, nr, u32 __user *, iocb)
{
struct iocb __user * __user *iocb64;
long ret;
@@ -785,24 +786,24 @@ static int do_nfs4_super_data_conv(void *raw_data)
#define NCPFS_NAME "ncpfs"
#define NFS4_NAME "nfs4"
-asmlinkage long compat_sys_mount(const char __user * dev_name,
- const char __user * dir_name,
- const char __user * type, unsigned long flags,
- const void __user * data)
+COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
+ const char __user *, dir_name,
+ const char __user *, type, compat_ulong_t, flags,
+ const void __user *, data)
{
char *kernel_type;
unsigned long data_page;
char *kernel_dev;
- char *dir_page;
+ struct filename *dir;
int retval;
retval = copy_mount_string(type, &kernel_type);
if (retval < 0)
goto out;
- dir_page = getname(dir_name);
- retval = PTR_ERR(dir_page);
- if (IS_ERR(dir_page))
+ dir = getname(dir_name);
+ retval = PTR_ERR(dir);
+ if (IS_ERR(dir))
goto out1;
retval = copy_mount_string(dev_name, &kernel_dev);
@@ -824,7 +825,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
}
}
- retval = do_mount(kernel_dev, dir_page, kernel_type,
+ retval = do_mount(kernel_dev, dir->name, kernel_type,
flags, (void*)data_page);
out4:
@@ -832,7 +833,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
out3:
kfree(kernel_dev);
out2:
- putname(dir_page);
+ putname(dir);
out1:
kfree(kernel_type);
out:
@@ -847,6 +848,7 @@ struct compat_old_linux_dirent {
};
struct compat_readdir_callback {
+ struct dir_context ctx;
struct compat_old_linux_dirent __user *dirent;
int result;
};
@@ -883,27 +885,24 @@ efault:
return -EFAULT;
}
-asmlinkage long compat_sys_old_readdir(unsigned int fd,
- struct compat_old_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+ struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
- struct file *file;
- struct compat_readdir_callback buf;
-
- error = -EBADF;
- file = fget(fd);
- if (!file)
- goto out;
+ struct fd f = fdget(fd);
+ struct compat_readdir_callback buf = {
+ .ctx.actor = compat_fillonedir,
+ .dirent = dirent
+ };
- buf.result = 0;
- buf.dirent = dirent;
+ if (!f.file)
+ return -EBADF;
- error = vfs_readdir(file, compat_fillonedir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (buf.result)
error = buf.result;
- fput(file);
-out:
+ fdput(f);
return error;
}
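
This and the following getdents hunks share one pattern: the per-call buffer now embeds a struct dir_context whose ->actor points at the filldir callback, and iterate_dir() drives it in place of the removed vfs_readdir(). A minimal sketch of the callback side, with invented names:

	/* Sketch with invented names; the dir_context member must come first
	 * so the pointer handed to the actor can be cast back to the buffer. */
	#include <linux/fs.h>

	struct count_buf {
		struct dir_context ctx;
		int entries;
	};

	static int count_actor(void *__buf, const char *name, int namlen,
			       loff_t offset, u64 ino, unsigned int d_type)
	{
		struct count_buf *buf = __buf;

		buf->entries++;
		return 0;		/* non-zero stops the walk */
	}

	static int count_dir_entries(struct file *dir)
	{
		struct count_buf buf = { .ctx.actor = count_actor };
		int error = iterate_dir(dir, &buf.ctx);

		return error < 0 ? error : buf.entries;
	}
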
@@ -915,6 +914,7 @@ struct compat_linux_dirent {
};
struct compat_getdents_callback {
+ struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
struct compat_linux_dirent __user *previous;
int count;
@@ -964,46 +964,43 @@ efault:
return -EFAULT;
}
-asmlinkage long compat_sys_getdents(unsigned int fd,
- struct compat_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
+ struct compat_linux_dirent __user *, dirent, unsigned int, count)
{
- struct file * file;
+ struct fd f;
struct compat_linux_dirent __user * lastdirent;
- struct compat_getdents_callback buf;
+ struct compat_getdents_callback buf = {
+ .ctx.actor = compat_filldir,
+ .current_dir = dirent,
+ .count = count
+ };
int error;
- error = -EFAULT;
if (!access_ok(VERIFY_WRITE, dirent, count))
- goto out;
-
- error = -EBADF;
- file = fget(fd);
- if (!file)
- goto out;
+ return -EFAULT;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
- error = vfs_readdir(file, compat_filldir, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- if (put_user(file->f_pos, &lastdirent->d_off))
+ if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
}
- fput(file);
-out:
+ fdput(f);
return error;
}
-#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
+#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64
struct compat_getdents_callback64 {
+ struct dir_context ctx;
struct linux_dirent64 __user *current_dir;
struct linux_dirent64 __user *previous;
int count;
@@ -1052,246 +1049,46 @@ efault:
return -EFAULT;
}
-asmlinkage long compat_sys_getdents64(unsigned int fd,
- struct linux_dirent64 __user * dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+ struct linux_dirent64 __user *, dirent, unsigned int, count)
{
- struct file * file;
+ struct fd f;
struct linux_dirent64 __user * lastdirent;
- struct compat_getdents_callback64 buf;
+ struct compat_getdents_callback64 buf = {
+ .ctx.actor = compat_filldir64,
+ .current_dir = dirent,
+ .count = count
+ };
int error;
- error = -EFAULT;
if (!access_ok(VERIFY_WRITE, dirent, count))
- goto out;
-
- error = -EBADF;
- file = fget(fd);
- if (!file)
- goto out;
+ return -EFAULT;
- buf.current_dir = dirent;
- buf.previous = NULL;
- buf.count = count;
- buf.error = 0;
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
- error = vfs_readdir(file, compat_filldir64, &buf);
+ error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
- typeof(lastdirent->d_off) d_off = file->f_pos;
+ typeof(lastdirent->d_off) d_off = buf.ctx.pos;
if (__put_user_unaligned(d_off, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
}
- fput(file);
-out:
+ fdput(f);
return error;
}
-#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
-
-static ssize_t compat_do_readv_writev(int type, struct file *file,
- const struct compat_iovec __user *uvector,
- unsigned long nr_segs, loff_t *pos)
-{
- compat_ssize_t tot_len;
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- ssize_t ret;
- io_fn_t fn;
- iov_fn_t fnv;
-
- ret = -EINVAL;
- if (!file->f_op)
- goto out;
-
- ret = -EFAULT;
- if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
- goto out;
-
- tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
- UIO_FASTIOV, iovstack, &iov);
- if (tot_len == 0) {
- ret = 0;
- goto out;
- }
-
- ret = rw_verify_area(type, file, pos, tot_len);
- if (ret < 0)
- goto out;
-
- fnv = NULL;
- if (type == READ) {
- fn = file->f_op->read;
- fnv = file->f_op->aio_read;
- } else {
- fn = (io_fn_t)file->f_op->write;
- fnv = file->f_op->aio_write;
- }
-
- if (fnv)
- ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
- pos, fnv);
- else
- ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
-
-out:
- if (iov != iovstack)
- kfree(iov);
- if ((ret + (type == READ)) > 0) {
- if (type == READ)
- fsnotify_access(file);
- else
- fsnotify_modify(file);
- }
- return ret;
-}
-
-static size_t compat_readv(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos)
-{
- ssize_t ret = -EBADF;
-
- if (!(file->f_mode & FMODE_READ))
- goto out;
-
- ret = -EINVAL;
- if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
- goto out;
-
- ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
-
-out:
- if (ret > 0)
- add_rchar(current, ret);
- inc_syscr(current);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen)
-{
- struct file *file;
- int fput_needed;
- ssize_t ret;
-
- file = fget_light(fd, &fput_needed);
- if (!file)
- return -EBADF;
- ret = compat_readv(file, vec, vlen, &file->f_pos);
- fput_light(file, fput_needed);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen, u32 pos_low, u32 pos_high)
-{
- loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- struct file *file;
- int fput_needed;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- file = fget_light(fd, &fput_needed);
- if (!file)
- return -EBADF;
- ret = -ESPIPE;
- if (file->f_mode & FMODE_PREAD)
- ret = compat_readv(file, vec, vlen, &pos);
- fput_light(file, fput_needed);
- return ret;
-}
-
-static size_t compat_writev(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos)
-{
- ssize_t ret = -EBADF;
-
- if (!(file->f_mode & FMODE_WRITE))
- goto out;
-
- ret = -EINVAL;
- if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
- goto out;
-
- ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
-
-out:
- if (ret > 0)
- add_wchar(current, ret);
- inc_syscw(current);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen)
-{
- struct file *file;
- int fput_needed;
- ssize_t ret;
-
- file = fget_light(fd, &fput_needed);
- if (!file)
- return -EBADF;
- ret = compat_writev(file, vec, vlen, &file->f_pos);
- fput_light(file, fput_needed);
- return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
- unsigned long vlen, u32 pos_low, u32 pos_high)
-{
- loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- struct file *file;
- int fput_needed;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- file = fget_light(fd, &fput_needed);
- if (!file)
- return -EBADF;
- ret = -ESPIPE;
- if (file->f_mode & FMODE_PWRITE)
- ret = compat_writev(file, vec, vlen, &pos);
- fput_light(file, fput_needed);
- return ret;
-}
-
-asmlinkage long
-compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
- unsigned int nr_segs, unsigned int flags)
-{
- unsigned i;
- struct iovec __user *iov;
- if (nr_segs > UIO_MAXIOV)
- return -EINVAL;
- iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
- for (i = 0; i < nr_segs; i++) {
- struct compat_iovec v;
- if (get_user(v.iov_base, &iov32[i].iov_base) ||
- get_user(v.iov_len, &iov32[i].iov_len) ||
- put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
- put_user(v.iov_len, &iov[i].iov_len))
- return -EFAULT;
- }
- return sys_vmsplice(fd, iov, nr_segs, flags);
-}
+#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
/*
* Exactly like fs/open.c:sys_open(), except that it doesn't set the
* O_LARGEFILE flag.
*/
-asmlinkage long
-compat_sys_open(const char __user *filename, int flags, int mode)
+COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
@@ -1300,247 +1097,11 @@ compat_sys_open(const char __user *filename, int flags, int mode)
* Exactly like fs/open.c:sys_openat(), except that it doesn't set the
* O_LARGEFILE flag.
*/
-asmlinkage long
-compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode)
+COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
return do_sys_open(dfd, filename, flags, mode);
}
-/*
- * compat_count() counts the number of arguments/envelopes. It is basically
- * a copy of count() from fs/exec.c, except that it works with 32 bit argv
- * and envp pointers.
- */
-static int compat_count(compat_uptr_t __user *argv, int max)
-{
- int i = 0;
-
- if (argv != NULL) {
- for (;;) {
- compat_uptr_t p;
-
- if (get_user(p, argv))
- return -EFAULT;
- if (!p)
- break;
- argv++;
- if (i++ >= max)
- return -E2BIG;
-
- if (fatal_signal_pending(current))
- return -ERESTARTNOHAND;
- cond_resched();
- }
- }
- return i;
-}
-
-/*
- * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
- * except that it works with 32 bit argv and envp pointers.
- */
-static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
- struct linux_binprm *bprm)
-{
- struct page *kmapped_page = NULL;
- char *kaddr = NULL;
- unsigned long kpos = 0;
- int ret;
-
- while (argc-- > 0) {
- compat_uptr_t str;
- int len;
- unsigned long pos;
-
- if (get_user(str, argv+argc) ||
- !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
- ret = -EFAULT;
- goto out;
- }
-
- if (len > MAX_ARG_STRLEN) {
- ret = -E2BIG;
- goto out;
- }
-
- /* We're going to work our way backwords. */
- pos = bprm->p;
- str += len;
- bprm->p -= len;
-
- while (len > 0) {
- int offset, bytes_to_copy;
-
- if (fatal_signal_pending(current)) {
- ret = -ERESTARTNOHAND;
- goto out;
- }
- cond_resched();
-
- offset = pos % PAGE_SIZE;
- if (offset == 0)
- offset = PAGE_SIZE;
-
- bytes_to_copy = offset;
- if (bytes_to_copy > len)
- bytes_to_copy = len;
-
- offset -= bytes_to_copy;
- pos -= bytes_to_copy;
- str -= bytes_to_copy;
- len -= bytes_to_copy;
-
- if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
- struct page *page;
-
- page = get_arg_page(bprm, pos, 1);
- if (!page) {
- ret = -E2BIG;
- goto out;
- }
-
- if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
- kunmap(kmapped_page);
- put_page(kmapped_page);
- }
- kmapped_page = page;
- kaddr = kmap(kmapped_page);
- kpos = pos & PAGE_MASK;
- flush_cache_page(bprm->vma, kpos,
- page_to_pfn(kmapped_page));
- }
- if (copy_from_user(kaddr+offset, compat_ptr(str),
- bytes_to_copy)) {
- ret = -EFAULT;
- goto out;
- }
- }
- }
- ret = 0;
-out:
- if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
- kunmap(kmapped_page);
- put_page(kmapped_page);
- }
- return ret;
-}
-
-/*
- * compat_do_execve() is mostly a copy of do_execve(), with the exception
- * that it processes 32 bit argv and envp pointers.
- */
-int compat_do_execve(char * filename,
- compat_uptr_t __user *argv,
- compat_uptr_t __user *envp,
- struct pt_regs * regs)
-{
- struct linux_binprm *bprm;
- struct file *file;
- struct files_struct *displaced;
- bool clear_in_exec;
- int retval;
-
- retval = unshare_files(&displaced);
- if (retval)
- goto out_ret;
-
- retval = -ENOMEM;
- bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
- if (!bprm)
- goto out_files;
-
- retval = prepare_bprm_creds(bprm);
- if (retval)
- goto out_free;
-
- retval = check_unsafe_exec(bprm);
- if (retval < 0)
- goto out_free;
- clear_in_exec = retval;
- current->in_execve = 1;
-
- file = open_exec(filename);
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_unmark;
-
- sched_exec();
-
- bprm->file = file;
- bprm->filename = filename;
- bprm->interp = filename;
-
- retval = bprm_mm_init(bprm);
- if (retval)
- goto out_file;
-
- bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
- if ((retval = bprm->argc) < 0)
- goto out;
-
- bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
- if ((retval = bprm->envc) < 0)
- goto out;
-
- retval = prepare_binprm(bprm);
- if (retval < 0)
- goto out;
-
- retval = copy_strings_kernel(1, &bprm->filename, bprm);
- if (retval < 0)
- goto out;
-
- bprm->exec = bprm->p;
- retval = compat_copy_strings(bprm->envc, envp, bprm);
- if (retval < 0)
- goto out;
-
- retval = compat_copy_strings(bprm->argc, argv, bprm);
- if (retval < 0)
- goto out;
-
- retval = search_binary_handler(bprm, regs);
- if (retval < 0)
- goto out;
-
- /* execve succeeded */
- current->fs->in_exec = 0;
- current->in_execve = 0;
- acct_update_integrals(current);
- free_bprm(bprm);
- if (displaced)
- put_files_struct(displaced);
- return retval;
-
-out:
- if (bprm->mm) {
- acct_arg_size(bprm, 0);
- mmput(bprm->mm);
- }
-
-out_file:
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- }
-
-out_unmark:
- if (clear_in_exec)
- current->fs->in_exec = 0;
- current->in_execve = 0;
-
-out_free:
- free_bprm(bprm);
-
-out_files:
- if (displaced)
- reset_files_struct(displaced);
-out_ret:
- return retval;
-}
-
#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
@@ -1742,9 +1303,9 @@ out_nofds:
return ret;
}
-asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
- compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct compat_timeval __user *tvp)
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct compat_timeval __user *, tvp)
{
struct timespec end_time, *to = NULL;
struct compat_timeval tv;
@@ -1775,7 +1336,7 @@ struct compat_sel_arg_struct {
compat_uptr_t tvp;
};
-asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
+COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
{
struct compat_sel_arg_struct a;
@@ -1785,7 +1346,6 @@ asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
compat_ptr(a.exp), compat_ptr(a.tvp));
}
-#ifdef HAVE_SET_RESTORE_SIGMASK
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
compat_ulong_t __user *outp, compat_ulong_t __user *exp,
struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
@@ -1837,9 +1397,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
return ret;
}
-asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
- compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct compat_timespec __user *tsp, void __user *sig)
+COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct compat_timespec __user *, tsp, void __user *, sig)
{
compat_size_t sigsetsize = 0;
compat_uptr_t up = 0;
@@ -1856,9 +1416,9 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
sigsetsize);
}
-asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
- unsigned int nfds, struct compat_timespec __user *tsp,
- const compat_sigset_t __user *sigmask, compat_size_t sigsetsize)
+COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+ unsigned int, nfds, struct compat_timespec __user *, tsp,
+ const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
compat_sigset_t ss32;
sigset_t ksigmask, sigsaved;
@@ -1908,388 +1468,14 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
return ret;
}
-#endif /* HAVE_SET_RESTORE_SIGMASK */
-
-#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
-/* Stuff for NFS server syscalls... */
-struct compat_nfsctl_svc {
- u16 svc32_port;
- s32 svc32_nthreads;
-};
-
-struct compat_nfsctl_client {
- s8 cl32_ident[NFSCLNT_IDMAX+1];
- s32 cl32_naddr;
- struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX];
- s32 cl32_fhkeytype;
- s32 cl32_fhkeylen;
- u8 cl32_fhkey[NFSCLNT_KEYMAX];
-};
-
-struct compat_nfsctl_export {
- char ex32_client[NFSCLNT_IDMAX+1];
- char ex32_path[NFS_MAXPATHLEN+1];
- compat_dev_t ex32_dev;
- compat_ino_t ex32_ino;
- compat_int_t ex32_flags;
- __compat_uid_t ex32_anon_uid;
- __compat_gid_t ex32_anon_gid;
-};
-
-struct compat_nfsctl_fdparm {
- struct sockaddr gd32_addr;
- s8 gd32_path[NFS_MAXPATHLEN+1];
- compat_int_t gd32_version;
-};
-
-struct compat_nfsctl_fsparm {
- struct sockaddr gd32_addr;
- s8 gd32_path[NFS_MAXPATHLEN+1];
- compat_int_t gd32_maxlen;
-};
-
-struct compat_nfsctl_arg {
- compat_int_t ca32_version; /* safeguard */
- union {
- struct compat_nfsctl_svc u32_svc;
- struct compat_nfsctl_client u32_client;
- struct compat_nfsctl_export u32_export;
- struct compat_nfsctl_fdparm u32_getfd;
- struct compat_nfsctl_fsparm u32_getfs;
- } u;
-#define ca32_svc u.u32_svc
-#define ca32_client u.u32_client
-#define ca32_export u.u32_export
-#define ca32_getfd u.u32_getfd
-#define ca32_getfs u.u32_getfs
-};
-
-union compat_nfsctl_res {
- __u8 cr32_getfh[NFS_FHSIZE];
- struct knfsd_fh cr32_getfs;
-};
-
-static int compat_nfs_svc_trans(struct nfsctl_arg *karg,
- struct compat_nfsctl_arg __user *arg)
-{
- if (!access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)) ||
- get_user(karg->ca_version, &arg->ca32_version) ||
- __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port) ||
- __get_user(karg->ca_svc.svc_nthreads,
- &arg->ca32_svc.svc32_nthreads))
- return -EFAULT;
- return 0;
-}
-
-static int compat_nfs_clnt_trans(struct nfsctl_arg *karg,
- struct compat_nfsctl_arg __user *arg)
-{
- if (!access_ok(VERIFY_READ, &arg->ca32_client,
- sizeof(arg->ca32_client)) ||
- get_user(karg->ca_version, &arg->ca32_version) ||
- __copy_from_user(&karg->ca_client.cl_ident[0],
- &arg->ca32_client.cl32_ident[0],
- NFSCLNT_IDMAX) ||
- __get_user(karg->ca_client.cl_naddr,
- &arg->ca32_client.cl32_naddr) ||
- __copy_from_user(&karg->ca_client.cl_addrlist[0],
- &arg->ca32_client.cl32_addrlist[0],
- (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)) ||
- __get_user(karg->ca_client.cl_fhkeytype,
- &arg->ca32_client.cl32_fhkeytype) ||
- __get_user(karg->ca_client.cl_fhkeylen,
- &arg->ca32_client.cl32_fhkeylen) ||
- __copy_from_user(&karg->ca_client.cl_fhkey[0],
- &arg->ca32_client.cl32_fhkey[0],
- NFSCLNT_KEYMAX))
- return -EFAULT;
-
- return 0;
-}
-
-static int compat_nfs_exp_trans(struct nfsctl_arg *karg,
- struct compat_nfsctl_arg __user *arg)
-{
- if (!access_ok(VERIFY_READ, &arg->ca32_export,
- sizeof(arg->ca32_export)) ||
- get_user(karg->ca_version, &arg->ca32_version) ||
- __copy_from_user(&karg->ca_export.ex_client[0],
- &arg->ca32_export.ex32_client[0],
- NFSCLNT_IDMAX) ||
- __copy_from_user(&karg->ca_export.ex_path[0],
- &arg->ca32_export.ex32_path[0],
- NFS_MAXPATHLEN) ||
- __get_user(karg->ca_export.ex_dev,
- &arg->ca32_export.ex32_dev) ||
- __get_user(karg->ca_export.ex_ino,
- &arg->ca32_export.ex32_ino) ||
- __get_user(karg->ca_export.ex_flags,
- &arg->ca32_export.ex32_flags) ||
- __get_user(karg->ca_export.ex_anon_uid,
- &arg->ca32_export.ex32_anon_uid) ||
- __get_user(karg->ca_export.ex_anon_gid,
- &arg->ca32_export.ex32_anon_gid))
- return -EFAULT;
- SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid);
- SET_GID(karg->ca_export.ex_anon_gid, karg->ca_export.ex_anon_gid);
-
- return 0;
-}
-
-static int compat_nfs_getfd_trans(struct nfsctl_arg *karg,
- struct compat_nfsctl_arg __user *arg)
-{
- if (!access_ok(VERIFY_READ, &arg->ca32_getfd,
- sizeof(arg->ca32_getfd)) ||
- get_user(karg->ca_version, &arg->ca32_version) ||
- __copy_from_user(&karg->ca_getfd.gd_addr,
- &arg->ca32_getfd.gd32_addr,
- (sizeof(struct sockaddr))) ||
- __copy_from_user(&karg->ca_getfd.gd_path,
- &arg->ca32_getfd.gd32_path,
- (NFS_MAXPATHLEN+1)) ||
- __get_user(karg->ca_getfd.gd_version,
- &arg->ca32_getfd.gd32_version))
- return -EFAULT;
-
- return 0;
-}
-
-static int compat_nfs_getfs_trans(struct nfsctl_arg *karg,
- struct compat_nfsctl_arg __user *arg)
-{
- if (!access_ok(VERIFY_READ,&arg->ca32_getfs,sizeof(arg->ca32_getfs)) ||
- get_user(karg->ca_version, &arg->ca32_version) ||
- __copy_from_user(&karg->ca_getfs.gd_addr,
- &arg->ca32_getfs.gd32_addr,
- (sizeof(struct sockaddr))) ||
- __copy_from_user(&karg->ca_getfs.gd_path,
- &arg->ca32_getfs.gd32_path,
- (NFS_MAXPATHLEN+1)) ||
- __get_user(karg->ca_getfs.gd_maxlen,
- &arg->ca32_getfs.gd32_maxlen))
- return -EFAULT;
-
- return 0;
-}
-
-/* This really doesn't need translations, we are only passing
- * back a union which contains opaque nfs file handle data.
- */
-static int compat_nfs_getfh_res_trans(union nfsctl_res *kres,
- union compat_nfsctl_res __user *res)
-{
- int err;
-
- err = copy_to_user(res, kres, sizeof(*res));
-
- return (err) ? -EFAULT : 0;
-}
-
-asmlinkage long compat_sys_nfsservctl(int cmd,
- struct compat_nfsctl_arg __user *arg,
- union compat_nfsctl_res __user *res)
-{
- struct nfsctl_arg *karg;
- union nfsctl_res *kres;
- mm_segment_t oldfs;
- int err;
-
- karg = kmalloc(sizeof(*karg), GFP_USER);
- kres = kmalloc(sizeof(*kres), GFP_USER);
- if(!karg || !kres) {
- err = -ENOMEM;
- goto done;
- }
-
- switch(cmd) {
- case NFSCTL_SVC:
- err = compat_nfs_svc_trans(karg, arg);
- break;
-
- case NFSCTL_ADDCLIENT:
- err = compat_nfs_clnt_trans(karg, arg);
- break;
-
- case NFSCTL_DELCLIENT:
- err = compat_nfs_clnt_trans(karg, arg);
- break;
-
- case NFSCTL_EXPORT:
- case NFSCTL_UNEXPORT:
- err = compat_nfs_exp_trans(karg, arg);
- break;
-
- case NFSCTL_GETFD:
- err = compat_nfs_getfd_trans(karg, arg);
- break;
-
- case NFSCTL_GETFS:
- err = compat_nfs_getfs_trans(karg, arg);
- break;
-
- default:
- err = -EINVAL;
- break;
- }
-
- if (err)
- goto done;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- /* The __user pointer casts are valid because of the set_fs() */
- err = sys_nfsservctl(cmd, (void __user *) karg, (void __user *) kres);
- set_fs(oldfs);
-
- if (err)
- goto done;
-
- if((cmd == NFSCTL_GETFD) ||
- (cmd == NFSCTL_GETFS))
- err = compat_nfs_getfh_res_trans(kres, res);
-
-done:
- kfree(karg);
- kfree(kres);
- return err;
-}
-#else /* !NFSD */
-long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2)
-{
- return sys_ni_syscall();
-}
-#endif
-
-#ifdef CONFIG_EPOLL
-
-#ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long compat_sys_epoll_pwait(int epfd,
- struct compat_epoll_event __user *events,
- int maxevents, int timeout,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
-{
- long err;
- compat_sigset_t csigmask;
- sigset_t ksigmask, sigsaved;
-
- /*
- * If the caller wants a certain signal mask to be set during the wait,
- * we apply it here.
- */
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
- return -EFAULT;
- sigset_from_compat(&ksigmask, &csigmask);
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
-
- err = sys_epoll_wait(epfd, events, maxevents, timeout);
-
- /*
- * If we changed the signal mask, we need to restore the original one.
- * In case we've got a signal while waiting, we do not restore the
- * signal mask yet, and we allow do_signal() to deliver the signal on
- * the way back to userspace, before the signal mask is restored.
- */
- if (sigmask) {
- if (err == -EINTR) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- } else
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- }
-
- return err;
-}
-#endif /* HAVE_SET_RESTORE_SIGMASK */
-
-#endif /* CONFIG_EPOLL */
-
-#ifdef CONFIG_SIGNALFD
-
-asmlinkage long compat_sys_signalfd4(int ufd,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize, int flags)
-{
- compat_sigset_t ss32;
- sigset_t tmp;
- sigset_t __user *ksigmask;
-
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
- return -EFAULT;
- sigset_from_compat(&tmp, &ss32);
- ksigmask = compat_alloc_user_space(sizeof(sigset_t));
- if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
- return -EFAULT;
-
- return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
-}
-
-asmlinkage long compat_sys_signalfd(int ufd,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
-{
- return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
-}
-#endif /* CONFIG_SIGNALFD */
-
-#ifdef CONFIG_TIMERFD
-
-asmlinkage long compat_sys_timerfd_settime(int ufd, int flags,
- const struct compat_itimerspec __user *utmr,
- struct compat_itimerspec __user *otmr)
-{
- int error;
- struct itimerspec t;
- struct itimerspec __user *ut;
-
- if (get_compat_itimerspec(&t, utmr))
- return -EFAULT;
- ut = compat_alloc_user_space(2 * sizeof(struct itimerspec));
- if (copy_to_user(&ut[0], &t, sizeof(t)))
- return -EFAULT;
- error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]);
- if (!error && otmr)
- error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) ||
- put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
-
- return error;
-}
-
-asmlinkage long compat_sys_timerfd_gettime(int ufd,
- struct compat_itimerspec __user *otmr)
-{
- int error;
- struct itimerspec t;
- struct itimerspec __user *ut;
-
- ut = compat_alloc_user_space(sizeof(struct itimerspec));
- error = sys_timerfd_gettime(ufd, ut);
- if (!error)
- error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) ||
- put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
-
- return error;
-}
-
-#endif /* CONFIG_TIMERFD */
#ifdef CONFIG_FHANDLE
/*
* Exactly like fs/open.c:sys_open_by_handle_at(), except that it
* doesn't set the O_LARGEFILE flag.
*/
-asmlinkage long
-compat_sys_open_by_handle_at(int mountdirfd,
- struct file_handle __user *handle, int flags)
+COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+ struct file_handle __user *, handle, int, flags)
{
return do_handle_open(mountdirfd, handle, flags);
}
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 112e45a17e9..4d24d17bcfc 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -38,6 +38,13 @@
#define elf_addr_t Elf32_Addr
/*
+ * Some data types as stored in coredump.
+ */
+#define user_long_t compat_long_t
+#define user_siginfo_t compat_siginfo_t
+#define copy_siginfo_to_user copy_siginfo_to_user32
+
+/*
* The machine-dependent core note format types are defined in elfcore-compat.h,
* which requires asm/elf.h to define compat_elf_gregset_t et al.
*/
@@ -81,6 +88,11 @@ static void cputime_to_compat_timeval(const cputime_t cputime,
#define ELF_HWCAP COMPAT_ELF_HWCAP
#endif
+#ifdef COMPAT_ELF_HWCAP2
+#undef ELF_HWCAP2
+#define ELF_HWCAP2 COMPAT_ELF_HWCAP2
+#endif
+
#ifdef COMPAT_ARCH_DLINFO
#undef ARCH_DLINFO
#define ARCH_DLINFO COMPAT_ARCH_DLINFO
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 61abb638b4b..e8228904727 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -34,7 +34,7 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/ppp_defs.h>
-#include <linux/if_ppp.h>
+#include <linux/ppp-ioctl.h>
#include <linux/if_pppox.h>
#include <linux/mtio.h>
#include <linux/auto_fs.h>
@@ -49,7 +49,6 @@
#include <linux/elevator.h>
#include <linux/rtc.h>
#include <linux/pci.h>
-#include <linux/module.h>
#include <linux/serial.h>
#include <linux/if_tun.h>
#include <linux/ctype.h>
@@ -67,7 +66,8 @@
#include <linux/gigaset_dev.h>
#ifdef CONFIG_BLOCK
-#include <linux/loop.h>
+#include <linux/cdrom.h>
+#include <linux/fd.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/sg.h>
@@ -103,6 +103,7 @@
#include <linux/hiddev.h>
+#define __DVB_CORE__
#include <linux/dvb/audio.h>
#include <linux/dvb/dmx.h>
#include <linux/dvb/frontend.h>
@@ -208,6 +209,8 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
err = get_user(palp, &up->palette);
err |= get_user(length, &up->length);
+ if (err)
+ return -EFAULT;
up_native = compat_alloc_user_space(sizeof(struct video_spu_palette));
err = put_user(compat_ptr(palp), &up_native->palette);
@@ -604,7 +607,6 @@ struct serial_struct32 {
static int serial_struct_ioctl(unsigned fd, unsigned cmd,
struct serial_struct32 __user *ss32)
{
- typedef struct serial_struct SS;
typedef struct serial_struct32 SS32;
int err;
struct serial_struct ss;
@@ -678,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
struct i2c_msg __user *tmsgs;
struct i2c_msg32 __user *umsgs;
compat_caddr_t datap;
- int nmsgs, i;
+ u32 nmsgs;
+ int i;
if (get_user(nmsgs, &udata->nmsgs))
return -EFAULT;
@@ -840,6 +843,9 @@ COMPATIBLE_IOCTL(TIOCGDEV)
COMPATIBLE_IOCTL(TIOCCBRK)
COMPATIBLE_IOCTL(TIOCGSID)
COMPATIBLE_IOCTL(TIOCGICOUNT)
+COMPATIBLE_IOCTL(TIOCGPKT)
+COMPATIBLE_IOCTL(TIOCGPTLCK)
+COMPATIBLE_IOCTL(TIOCGEXCL)
/* Little t */
COMPATIBLE_IOCTL(TIOCGETD)
COMPATIBLE_IOCTL(TIOCSETD)
@@ -864,6 +870,12 @@ COMPATIBLE_IOCTL(TIOCGPTN)
COMPATIBLE_IOCTL(TIOCSPTLCK)
COMPATIBLE_IOCTL(TIOCSERGETLSR)
COMPATIBLE_IOCTL(TIOCSIG)
+#ifdef TIOCSRS485
+COMPATIBLE_IOCTL(TIOCSRS485)
+#endif
+#ifdef TIOCGRS485
+COMPATIBLE_IOCTL(TIOCGRS485)
+#endif
#ifdef TCGETS2
COMPATIBLE_IOCTL(TCGETS2)
COMPATIBLE_IOCTL(TCSETS2)
@@ -895,6 +907,8 @@ COMPATIBLE_IOCTL(KDGKBSENT)
COMPATIBLE_IOCTL(KDSKBSENT)
COMPATIBLE_IOCTL(KDGKBDIACR)
COMPATIBLE_IOCTL(KDSKBDIACR)
+COMPATIBLE_IOCTL(KDGKBDIACRUC)
+COMPATIBLE_IOCTL(KDSKBDIACRUC)
COMPATIBLE_IOCTL(KDKBDREP)
COMPATIBLE_IOCTL(KDGKBLED)
COMPATIBLE_IOCTL(KDGETLED)
@@ -940,10 +954,11 @@ COMPATIBLE_IOCTL(MTIOCTOP)
/* Socket level stuff */
COMPATIBLE_IOCTL(FIOQSIZE)
#ifdef CONFIG_BLOCK
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
/* md calls this on random blockdevs */
IGNORE_IOCTL(RAID_VERSION)
+/* qemu/qemu-img might call these two on plain files for probing */
+IGNORE_IOCTL(CDROM_DRIVE_STATUS)
+IGNORE_IOCTL(FDGETPRM32)
/* SG stuff */
COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
@@ -998,6 +1013,7 @@ COMPATIBLE_IOCTL(PPPIOCCONNECT)
COMPATIBLE_IOCTL(PPPIOCDISCONN)
COMPATIBLE_IOCTL(PPPIOCATTCHAN)
COMPATIBLE_IOCTL(PPPIOCGCHAN)
+COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
/* PPPOX */
COMPATIBLE_IOCTL(PPPOEIOCSFWD)
COMPATIBLE_IOCTL(PPPOEIOCDFWD)
@@ -1500,35 +1516,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
return -ENOIOCTLCMD;
}
-static void compat_ioctl_error(struct file *filp, unsigned int fd,
- unsigned int cmd, unsigned long arg)
-{
- char buf[10];
- char *fn = "?";
- char *path;
-
- /* find the name of the device. */
- path = (char *)__get_free_page(GFP_KERNEL);
- if (path) {
- fn = d_path(&filp->f_path, path, PAGE_SIZE);
- if (IS_ERR(fn))
- fn = "?";
- }
-
- sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
- if (!isprint(buf[1]))
- sprintf(buf, "%02x", buf[1]);
- compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
- "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
- current->comm, current->pid,
- (int)fd, (unsigned int)cmd, buf,
- (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
- (unsigned int)arg, fn);
-
- if (path)
- free_page((unsigned long)path);
-}
-
static int compat_ioctl_check_table(unsigned int xcmd)
{
int i;
@@ -1551,19 +1538,17 @@ static int compat_ioctl_check_table(unsigned int xcmd)
return ioctl_pointer[i] == xcmd;
}
-asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
- unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg32)
{
- struct file *filp;
+ unsigned long arg = arg32;
+ struct fd f = fdget(fd);
int error = -EBADF;
- int fput_needed;
-
- filp = fget_light(fd, &fput_needed);
- if (!filp)
+ if (!f.file)
goto out;
/* RED-PEN how should LSM module know it's handling 32bit? */
- error = security_file_ioctl(filp, cmd, arg);
+ error = security_file_ioctl(f.file, cmd, arg);
if (error)
goto out_fput;
@@ -1583,30 +1568,30 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
case FS_IOC_RESVSP_32:
case FS_IOC_RESVSP64_32:
- error = compat_ioctl_preallocate(filp, compat_ptr(arg));
+ error = compat_ioctl_preallocate(f.file, compat_ptr(arg));
goto out_fput;
#else
case FS_IOC_RESVSP:
case FS_IOC_RESVSP64:
- error = ioctl_preallocate(filp, compat_ptr(arg));
+ error = ioctl_preallocate(f.file, compat_ptr(arg));
goto out_fput;
#endif
case FIBMAP:
case FIGETBSZ:
case FIONREAD:
- if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
+ if (S_ISREG(file_inode(f.file)->i_mode))
break;
/*FALL THROUGH*/
default:
- if (filp->f_op && filp->f_op->compat_ioctl) {
- error = filp->f_op->compat_ioctl(filp, cmd, arg);
+ if (f.file->f_op->compat_ioctl) {
+ error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
if (error != -ENOIOCTLCMD)
goto out_fput;
}
- if (!filp->f_op || !filp->f_op->unlocked_ioctl)
+ if (!f.file->f_op->unlocked_ioctl)
goto do_ioctl;
break;
}
@@ -1614,23 +1599,18 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
if (compat_ioctl_check_table(XFORM(cmd)))
goto found_handler;
- error = do_ioctl_trans(fd, cmd, arg, filp);
- if (error == -ENOIOCTLCMD) {
- static int count;
-
- if (++count <= 50)
- compat_ioctl_error(filp, fd, cmd, arg);
- error = -EINVAL;
- }
+ error = do_ioctl_trans(fd, cmd, arg, f.file);
+ if (error == -ENOIOCTLCMD)
+ error = -ENOTTY;
goto out_fput;
found_handler:
arg = (unsigned long)compat_ptr(arg);
do_ioctl:
- error = do_vfs_ioctl(filp, fd, cmd, arg);
+ error = do_vfs_ioctl(f.file, fd, cmd, arg);
out_fput:
- fput_light(filp, fput_needed);
+ fdput(f);
out:
return error;
}
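
The ioctl hunk above is also part of the fget_light()/fput_light() to fdget()/fdput() migration: struct fd carries the file pointer and the "needs fput" flag together, so the pairing cannot drift apart. A short sketch of the idiom with a made-up helper:

	/* Sketch; the helper name is invented. */
	#include <linux/file.h>
	#include <linux/fs.h>

	static long sync_one_fd(unsigned int fd)
	{
		struct fd f = fdget(fd);
		long ret;

		if (!f.file)
			return -EBADF;

		ret = vfs_fsync(f.file, 0);	/* any operation on f.file */

		fdput(f);	/* drops the reference only if fdget() took one */
		return ret;
	}
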
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 82bda8fdfc1..bd4a3c16709 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -24,6 +24,12 @@
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*/
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
@@ -58,13 +64,12 @@ struct configfs_dirent {
extern struct mutex configfs_symlink_mutex;
extern spinlock_t configfs_dirent_lock;
-extern struct vfsmount * configfs_mount;
extern struct kmem_cache *configfs_dir_cachep;
extern int configfs_is_root(struct config_item *item);
-extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
-extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
+extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
extern int configfs_inode_init(void);
extern void configfs_inode_exit(void);
@@ -80,15 +85,15 @@ extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr);
-extern int configfs_pin_fs(void);
+extern struct dentry *configfs_pin_fs(void);
extern void configfs_release_fs(void);
extern struct rw_semaphore configfs_rename_sem;
-extern struct super_block * configfs_sb;
extern const struct file_operations configfs_dir_operations;
extern const struct file_operations configfs_file_operations;
extern const struct file_operations bin_fops;
extern const struct inode_operations configfs_dir_inode_operations;
+extern const struct inode_operations configfs_root_inode_operations;
extern const struct inode_operations configfs_symlink_inode_operations;
extern const struct dentry_operations configfs_dentry_ops;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 90ff3cb10de..668dcabc569 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -53,29 +53,31 @@ DEFINE_SPINLOCK(configfs_dirent_lock);
static void configfs_d_iput(struct dentry * dentry,
struct inode * inode)
{
- struct configfs_dirent * sd = dentry->d_fsdata;
+ struct configfs_dirent *sd = dentry->d_fsdata;
if (sd) {
- BUG_ON(sd->s_dentry != dentry);
- sd->s_dentry = NULL;
+ /* Coordinate with configfs_readdir */
+ spin_lock(&configfs_dirent_lock);
+ /* Coordinate with configfs_attach_attr, which will increase
+ * sd->s_count and point sd->s_dentry at the newly allocated
+ * dentry.  Only clear sd->s_dentry when this dentry is the
+ * sole owner of sd; otherwise configfs_d_iput may run just
+ * after configfs_attach_attr and set sd->s_dentry to NULL
+ * while it is still in use.
+ */
+ if (atomic_read(&sd->s_count) <= 2)
+ sd->s_dentry = NULL;
+
+ spin_unlock(&configfs_dirent_lock);
configfs_put(sd);
}
iput(inode);
}
-/*
- * We _must_ delete our dentries on last dput, as the chain-to-parent
- * behavior is required to clear the parents of default_groups.
- */
-static int configfs_d_delete(const struct dentry *dentry)
-{
- return 1;
-}
-
const struct dentry_operations configfs_dentry_ops = {
.d_iput = configfs_d_iput,
- /* simple_delete_dentry() isn't exported */
- .d_delete = configfs_d_delete,
+ .d_delete = always_delete_dentry,
};
#ifdef CONFIG_LOCKDEP
@@ -261,11 +263,13 @@ static int init_symlink(struct inode * inode)
return 0;
}
-static int create_dir(struct config_item * k, struct dentry * p,
- struct dentry * d)
+static int create_dir(struct config_item *k, struct dentry *d)
{
int error;
umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+ struct dentry *p = d->d_parent;
+
+ BUG_ON(!k);
error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
if (!error)
@@ -301,19 +305,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
{
- struct dentry * parent;
- int error = 0;
-
- BUG_ON(!item);
-
- if (item->ci_parent)
- parent = item->ci_parent->ci_dentry;
- else if (configfs_mount && configfs_mount->mnt_sb)
- parent = configfs_mount->mnt_sb->s_root;
- else
- return -EFAULT;
-
- error = create_dir(item,parent,dentry);
+ int error = create_dir(item, dentry);
if (!error)
item->ci_dentry = dentry;
return error;
@@ -394,7 +386,7 @@ static void remove_dir(struct dentry * d)
if (d->d_inode)
simple_rmdir(parent->d_inode,d);
- pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
+ pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
dput(parent);
}
@@ -433,8 +425,11 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
struct configfs_attribute * attr = sd->s_element;
int error;
+ spin_lock(&configfs_dirent_lock);
dentry->d_fsdata = configfs_get(sd);
sd->s_dentry = dentry;
+ spin_unlock(&configfs_dirent_lock);
+
error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
configfs_init_file);
if (error) {
@@ -449,7 +444,7 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
static struct dentry * configfs_lookup(struct inode *dir,
struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
struct configfs_dirent * sd;
@@ -667,19 +662,15 @@ static int create_default_group(struct config_group *parent_group,
struct config_group *group)
{
int ret;
- struct qstr name;
struct configfs_dirent *sd;
/* We trust the caller holds a reference to parent */
struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
if (!group->cg_item.ci_name)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
- name.name = group->cg_item.ci_name;
- name.len = strlen(name.name);
- name.hash = full_name_hash(name.name, name.len);
ret = -ENOMEM;
- child = d_alloc(parent, &name);
+ child = d_alloc_name(parent, group->cg_item.ci_name);
if (child) {
d_add(child, NULL);
@@ -689,7 +680,8 @@ static int create_default_group(struct config_group *parent_group,
sd = child->d_fsdata;
sd->s_type |= CONFIGFS_USET_DEFAULT;
} else {
- d_delete(child);
+ BUG_ON(child->d_inode);
+ d_drop(child);
dput(child);
}
}
@@ -948,9 +940,9 @@ static void client_drop_item(struct config_item *parent_item,
#ifdef DEBUG
static void configfs_dump_one(struct configfs_dirent *sd, int level)
{
- printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
+ pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd));
-#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
+#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type);
type_print(CONFIGFS_ROOT);
type_print(CONFIGFS_DIR);
type_print(CONFIGFS_ITEM_ATTR);
@@ -990,7 +982,7 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
* This describes these functions and their helpers.
*
* Allow another kernel system to depend on a config_item. If this
- * happens, the item cannot go away until the dependant can live without
+ * happens, the item cannot go away until the dependent can live without
* it. The idea is to give client modules as simple an interface as
* possible. When a system asks them to depend on an item, they just
* call configfs_depend_item(). If the item is live and the client
@@ -1043,10 +1035,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
static int configfs_depend_prep(struct dentry *origin,
struct config_item *target)
{
- struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
+ struct configfs_dirent *child_sd, *sd;
int ret = 0;
- BUG_ON(!origin || !sd);
+ BUG_ON(!origin || !origin->d_fsdata);
+ sd = origin->d_fsdata;
if (sd->s_element == target) /* Boo-yah */
goto out;
@@ -1075,23 +1068,24 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
int ret;
struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
struct config_item *s_item = &subsys->su_group.cg_item;
+ struct dentry *root;
/*
* Pin the configfs filesystem. This means we can safely access
* the root of the configfs filesystem.
*/
- ret = configfs_pin_fs();
- if (ret)
- return ret;
+ root = configfs_pin_fs();
+ if (IS_ERR(root))
+ return PTR_ERR(root);
/*
* Next, lock the root directory. We're going to check that the
* subsystem is really registered, and so we need to lock out
* configfs_[un]register_subsystem().
*/
- mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+ mutex_lock(&root->d_inode->i_mutex);
- root_sd = configfs_sb->s_root->d_fsdata;
+ root_sd = root->d_fsdata;
list_for_each_entry(p, &root_sd->s_children, s_sibling) {
if (p->s_type & CONFIGFS_DIR) {
@@ -1125,7 +1119,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
out_unlock_dirent_lock:
spin_unlock(&configfs_dirent_lock);
out_unlock_fs:
- mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+ mutex_unlock(&root->d_inode->i_mutex);
/*
* If we succeeded, the fs is pinned via other methods. If not,
@@ -1166,7 +1160,7 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
}
EXPORT_SYMBOL(configfs_undepend_item);
-static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int ret = 0;
int module_got = 0;
@@ -1179,11 +1173,6 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct module *subsys_owner = NULL, *new_item_owner = NULL;
char *name;
- if (dentry->d_parent == configfs_sb->s_root) {
- ret = -EPERM;
- goto out;
- }
-
sd = dentry->d_parent->d_fsdata;
/*
@@ -1355,9 +1344,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
- if (dentry->d_parent == configfs_sb->s_root)
- return -EPERM;
-
sd = dentry->d_fsdata;
if (sd->s_type & CONFIGFS_USET_DEFAULT)
return -EPERM;
@@ -1455,6 +1441,11 @@ const struct inode_operations configfs_dir_inode_operations = {
.setattr = configfs_setattr,
};
+const struct inode_operations configfs_root_inode_operations = {
+ .lookup = configfs_lookup,
+ .setattr = configfs_setattr,
+};
+
#if 0
int configfs_rename_dir(struct config_item * item, const char *new_name)
{
@@ -1539,80 +1530,83 @@ static inline unsigned char dt_type(struct configfs_dirent *sd)
return (sd->s_mode >> 12) & 15;
}
-static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int configfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file->f_path.dentry;
+ struct super_block *sb = dentry->d_sb;
struct configfs_dirent * parent_sd = dentry->d_fsdata;
- struct configfs_dirent *cursor = filp->private_data;
+ struct configfs_dirent *cursor = file->private_data;
struct list_head *p, *q = &cursor->s_sibling;
- ino_t ino;
- int i = filp->f_pos;
+ ino_t ino = 0;
- switch (i) {
- case 0:
- ino = dentry->d_inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- break;
- filp->f_pos++;
- i++;
- /* fallthrough */
- default:
- if (filp->f_pos == 2) {
- spin_lock(&configfs_dirent_lock);
- list_move(q, &parent_sd->s_children);
- spin_unlock(&configfs_dirent_lock);
- }
- for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
- struct configfs_dirent *next;
- const char * name;
- int len;
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+ if (ctx->pos == 2) {
+ spin_lock(&configfs_dirent_lock);
+ list_move(q, &parent_sd->s_children);
+ spin_unlock(&configfs_dirent_lock);
+ }
+ for (p = q->next; p != &parent_sd->s_children; p = p->next) {
+ struct configfs_dirent *next;
+ const char *name;
+ int len;
+ struct inode *inode = NULL;
+
+ next = list_entry(p, struct configfs_dirent, s_sibling);
+ if (!next->s_element)
+ continue;
- next = list_entry(p, struct configfs_dirent,
- s_sibling);
- if (!next->s_element)
- continue;
+ name = configfs_get_name(next);
+ len = strlen(name);
- name = configfs_get_name(next);
- len = strlen(name);
- if (next->s_dentry)
- ino = next->s_dentry->d_inode->i_ino;
- else
- ino = iunique(configfs_sb, 2);
+ /*
+ * We'll have a dentry and an inode for
+ * PINNED items and for open attribute
+ * files. We lock here to prevent a race
+ * with configfs_d_iput() clearing
+ * s_dentry before calling iput().
+ *
+ * Why do we go to the trouble? If
+ * someone has an attribute file open,
+ * the inode number should match until
+ * they close it. Beyond that, we don't
+ * care.
+ */
+ spin_lock(&configfs_dirent_lock);
+ dentry = next->s_dentry;
+ if (dentry)
+ inode = dentry->d_inode;
+ if (inode)
+ ino = inode->i_ino;
+ spin_unlock(&configfs_dirent_lock);
+ if (!inode)
+ ino = iunique(sb, 2);
- if (filldir(dirent, name, len, filp->f_pos, ino,
- dt_type(next)) < 0)
- return 0;
+ if (!dir_emit(ctx, name, len, ino, dt_type(next)))
+ return 0;
- spin_lock(&configfs_dirent_lock);
- list_move(q, p);
- spin_unlock(&configfs_dirent_lock);
- p = q;
- filp->f_pos++;
- }
+ spin_lock(&configfs_dirent_lock);
+ list_move(q, p);
+ spin_unlock(&configfs_dirent_lock);
+ p = q;
+ ctx->pos++;
}
return 0;
}
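The rewritten configfs_readdir() is part of the tree-wide switch from the filldir callback style to the dir_context API: dir_emit_dots() takes care of "." and "..", dir_emit() emits one entry at a time, and the filesystem advances ctx->pos itself so a later call can resume where the buffer filled up. A kernel-style sketch of a minimal ->iterate() method over a fixed name table (a hypothetical filesystem, not configfs, and not compilable on its own):

/* Hypothetical toyfs: enumerate a static list of names in the new
 * dir_context style.  ARRAY_SIZE, dir_emit_dots, dir_emit and DT_REG
 * are the regular kernel helpers. */
static int toyfs_readdir(struct file *file, struct dir_context *ctx)
{
	static const char *names[] = { "alpha", "beta", "gamma" };

	if (!dir_emit_dots(file, ctx))		/* emits "." and ".." as needed */
		return 0;
	while (ctx->pos - 2 < ARRAY_SIZE(names)) {
		const char *name = names[ctx->pos - 2];

		if (!dir_emit(ctx, name, strlen(name),
			      ctx->pos /* toy inode number */, DT_REG))
			return 0;	/* buffer full; ctx->pos marks the resume point */
		ctx->pos++;
	}
	return 0;
}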
-static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
+static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry * dentry = file->f_path.dentry;
mutex_lock(&dentry->d_inode->i_mutex);
- switch (origin) {
+ switch (whence) {
case 1:
offset += file->f_pos;
case 0:
if (offset >= 0)
break;
default:
- mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&file_inode(file)->i_mutex);
return -EINVAL;
}
if (offset != file->f_pos) {
@@ -1647,43 +1641,39 @@ const struct file_operations configfs_dir_operations = {
.release = configfs_dir_close,
.llseek = configfs_dir_lseek,
.read = generic_read_dir,
- .readdir = configfs_readdir,
+ .iterate = configfs_readdir,
};
int configfs_register_subsystem(struct configfs_subsystem *subsys)
{
int err;
struct config_group *group = &subsys->su_group;
- struct qstr name;
struct dentry *dentry;
+ struct dentry *root;
struct configfs_dirent *sd;
- err = configfs_pin_fs();
- if (err)
- return err;
+ root = configfs_pin_fs();
+ if (IS_ERR(root))
+ return PTR_ERR(root);
if (!group->cg_item.ci_name)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
- sd = configfs_sb->s_root->d_fsdata;
+ sd = root->d_fsdata;
link_group(to_config_group(sd->s_element), group);
- mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
- I_MUTEX_PARENT);
-
- name.name = group->cg_item.ci_name;
- name.len = strlen(name.name);
- name.hash = full_name_hash(name.name, name.len);
+ mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
err = -ENOMEM;
- dentry = d_alloc(configfs_sb->s_root, &name);
+ dentry = d_alloc_name(root, group->cg_item.ci_name);
if (dentry) {
d_add(dentry, NULL);
err = configfs_attach_group(sd->s_element, &group->cg_item,
dentry);
if (err) {
- d_delete(dentry);
+ BUG_ON(dentry->d_inode);
+ d_drop(dentry);
dput(dentry);
} else {
spin_lock(&configfs_dirent_lock);
@@ -1692,7 +1682,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
}
}
- mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+ mutex_unlock(&root->d_inode->i_mutex);
if (err) {
unlink_group(group);
@@ -1706,19 +1696,20 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
{
struct config_group *group = &subsys->su_group;
struct dentry *dentry = group->cg_item.ci_dentry;
+ struct dentry *root = dentry->d_sb->s_root;
- if (dentry->d_parent != configfs_sb->s_root) {
- printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
+ if (dentry->d_parent != root) {
+ pr_err("Tried to unregister non-subsystem!\n");
return;
}
- mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
+ mutex_lock_nested(&root->d_inode->i_mutex,
I_MUTEX_PARENT);
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
mutex_lock(&configfs_symlink_mutex);
spin_lock(&configfs_dirent_lock);
if (configfs_detach_prep(dentry, NULL)) {
- printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
+ pr_err("Tried to unregister non-empty subsystem!\n");
}
spin_unlock(&configfs_dirent_lock);
mutex_unlock(&configfs_symlink_mutex);
@@ -1729,7 +1720,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
d_delete(dentry);
- mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+ mutex_unlock(&root->d_inode->i_mutex);
dput(dentry);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14..1d1c41f1014 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
mutex_lock(&buffer->mutex);
len = fill_write_buffer(buffer, buf, count);
if (len > 0)
- len = flush_write_buffer(file->f_path.dentry, buffer, count);
+ len = flush_write_buffer(file->f_path.dentry, buffer, len);
if (len > 0)
*ppos += len;
mutex_unlock(&buffer->mutex);
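The one-line change to configfs_write_file() above is a correctness fix rather than a cleanup: fill_write_buffer() may copy fewer than count bytes into the buffer, and the old code then flushed count bytes anyway, handing the downstream store callback a length larger than what had actually been copied in. Flushing len keeps the two in step.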
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c83f4768eea..5946ad98053 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -23,7 +23,8 @@
*
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*
- * Please see Documentation/filesystems/configfs.txt for more information.
+ * Please see Documentation/filesystems/configfs/configfs.txt for more
+ * information.
*/
#undef DEBUG
@@ -43,8 +44,6 @@
static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
#endif
-extern struct super_block * configfs_sb;
-
static const struct address_space_operations configfs_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
@@ -80,8 +79,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
return -ENOMEM;
/* assign default attributes */
sd_iattr->ia_mode = sd->s_mode;
- sd_iattr->ia_uid = 0;
- sd_iattr->ia_gid = 0;
+ sd_iattr->ia_uid = GLOBAL_ROOT_UID;
+ sd_iattr->ia_gid = GLOBAL_ROOT_GID;
sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
sd->s_iattr = sd_iattr;
}
@@ -115,7 +114,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
return error;
}
-static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
+static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -131,9 +130,10 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
inode->i_ctime = iattr->ia_ctime;
}
-struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
+struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
+ struct super_block *s)
{
- struct inode * inode = new_inode(configfs_sb);
+ struct inode * inode = new_inode(s);
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mapping->a_ops = &configfs_aops;
@@ -168,9 +168,8 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
* In practice the maximum level of locking depth is
* already reached. Just inform about possible reasons.
*/
- printk(KERN_INFO "configfs: Too many levels of inodes"
- " for the locking correctness validator.\n");
- printk(KERN_INFO "Spurious warnings may appear.\n");
+ pr_info("Too many levels of inodes for the locking correctness validator.\n");
+ pr_info("Spurious warnings may appear.\n");
}
}
}
@@ -184,39 +183,38 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
#endif /* CONFIG_LOCKDEP */
-int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
+int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *))
{
int error = 0;
- struct inode * inode = NULL;
- if (dentry) {
- if (!dentry->d_inode) {
- struct configfs_dirent *sd = dentry->d_fsdata;
- if ((inode = configfs_new_inode(mode, sd))) {
- if (dentry->d_parent && dentry->d_parent->d_inode) {
- struct inode *p_inode = dentry->d_parent->d_inode;
- p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
- }
- configfs_set_inode_lock_class(sd, inode);
- goto Proceed;
- }
- else
- error = -ENOMEM;
- } else
- error = -EEXIST;
- } else
- error = -ENOENT;
- goto Done;
+ struct inode *inode = NULL;
+ struct configfs_dirent *sd;
+ struct inode *p_inode;
+
+ if (!dentry)
+ return -ENOENT;
+
+ if (dentry->d_inode)
+ return -EEXIST;
- Proceed:
- if (init)
+ sd = dentry->d_fsdata;
+ inode = configfs_new_inode(mode, sd, dentry->d_sb);
+ if (!inode)
+ return -ENOMEM;
+
+ p_inode = dentry->d_parent->d_inode;
+ p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+ configfs_set_inode_lock_class(sd, inode);
+
+ if (init) {
error = init(inode);
- if (!error) {
- d_instantiate(dentry, inode);
- if (S_ISDIR(mode) || S_ISLNK(mode))
- dget(dentry); /* pin link and directory dentries in core */
- } else
- iput(inode);
- Done:
+ if (error) {
+ iput(inode);
+ return error;
+ }
+ }
+ d_instantiate(dentry, inode);
+ if (S_ISDIR(mode) || S_ISLNK(mode))
+ dget(dentry); /* pin link and directory dentries in core */
return error;
}
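Restructured this way, configfs_create() follows the usual validate / allocate / initialise / publish order: argument checks return early, the inode is created against the caller-supplied super_block instead of the removed configfs_sb global, an init() failure drops the inode again with iput(), and only a fully set-up inode is attached to the dentry via d_instantiate() (with directories and symlinks additionally pinned by dget()).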
@@ -291,7 +289,7 @@ int __init configfs_inode_init(void)
return bdi_init(&configfs_backing_dev_info);
}
-void __exit configfs_inode_exit(void)
+void configfs_inode_exit(void)
{
bdi_destroy(&configfs_backing_dev_info);
}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 76dc4c3e5d5..e65f9ffbb99 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -19,11 +19,11 @@
* Boston, MA 021110-1307, USA.
*
* Based on kobject:
- * kobject is Copyright (c) 2002-2003 Patrick Mochel
+ * kobject is Copyright (c) 2002-2003 Patrick Mochel
*
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*
- * Please see the file Documentation/filesystems/configfs.txt for
+ * Please see the file Documentation/filesystems/configfs/configfs.txt for
* critical information about using the config_item interface.
*/
@@ -35,9 +35,9 @@
#include <linux/configfs.h>
-static inline struct config_item * to_item(struct list_head * entry)
+static inline struct config_item *to_item(struct list_head *entry)
{
- return container_of(entry,struct config_item,ci_entry);
+ return container_of(entry, struct config_item, ci_entry);
}
/* Evil kernel */
@@ -47,34 +47,35 @@ static void config_item_release(struct kref *kref);
* config_item_init - initialize item.
* @item: item in question.
*/
-void config_item_init(struct config_item * item)
+void config_item_init(struct config_item *item)
{
kref_init(&item->ci_kref);
INIT_LIST_HEAD(&item->ci_entry);
}
+EXPORT_SYMBOL(config_item_init);
/**
* config_item_set_name - Set the name of an item
* @item: item.
- * @name: name.
+ * @fmt: The vsnprintf()'s format string.
*
* If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
* dynamically allocated string that @item->ci_name points to.
* Otherwise, use the static @item->ci_namebuf array.
*/
-int config_item_set_name(struct config_item * item, const char * fmt, ...)
+int config_item_set_name(struct config_item *item, const char *fmt, ...)
{
int error = 0;
int limit = CONFIGFS_ITEM_NAME_LEN;
int need;
va_list args;
- char * name;
+ char *name;
/*
* First, try the static array
*/
- va_start(args,fmt);
- need = vsnprintf(item->ci_namebuf,limit,fmt,args);
+ va_start(args, fmt);
+ need = vsnprintf(item->ci_namebuf, limit, fmt, args);
va_end(args);
if (need < limit)
name = item->ci_namebuf;
@@ -83,13 +84,13 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...)
* Need more space? Allocate it and try again
*/
limit = need + 1;
- name = kmalloc(limit,GFP_KERNEL);
+ name = kmalloc(limit, GFP_KERNEL);
if (!name) {
error = -ENOMEM;
goto Done;
}
- va_start(args,fmt);
- need = vsnprintf(name,limit,fmt,args);
+ va_start(args, fmt);
+ need = vsnprintf(name, limit, fmt, args);
va_end(args);
/* Still? Give up. */
@@ -109,7 +110,6 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...)
Done:
return error;
}
-
EXPORT_SYMBOL(config_item_set_name);
void config_item_init_type_name(struct config_item *item,
@@ -131,20 +131,21 @@ void config_group_init_type_name(struct config_group *group, const char *name,
}
EXPORT_SYMBOL(config_group_init_type_name);
-struct config_item * config_item_get(struct config_item * item)
+struct config_item *config_item_get(struct config_item *item)
{
if (item)
kref_get(&item->ci_kref);
return item;
}
+EXPORT_SYMBOL(config_item_get);
-static void config_item_cleanup(struct config_item * item)
+static void config_item_cleanup(struct config_item *item)
{
- struct config_item_type * t = item->ci_type;
- struct config_group * s = item->ci_group;
- struct config_item * parent = item->ci_parent;
+ struct config_item_type *t = item->ci_type;
+ struct config_group *s = item->ci_group;
+ struct config_item *parent = item->ci_parent;
- pr_debug("config_item %s: cleaning up\n",config_item_name(item));
+ pr_debug("config_item %s: cleaning up\n", config_item_name(item));
if (item->ci_name != item->ci_namebuf)
kfree(item->ci_name);
item->ci_name = NULL;
@@ -167,21 +168,23 @@ static void config_item_release(struct kref *kref)
*
* Decrement the refcount, and if 0, call config_item_cleanup().
*/
-void config_item_put(struct config_item * item)
+void config_item_put(struct config_item *item)
{
if (item)
kref_put(&item->ci_kref, config_item_release);
}
+EXPORT_SYMBOL(config_item_put);
/**
* config_group_init - initialize a group for use
- * @k: group
+ * @group: config_group
*/
void config_group_init(struct config_group *group)
{
config_item_init(&group->cg_item);
INIT_LIST_HEAD(&group->cg_children);
}
+EXPORT_SYMBOL(config_group_init);
/**
* config_group_find_item - search for item in group.
@@ -195,11 +198,11 @@ void config_group_init(struct config_group *group)
struct config_item *config_group_find_item(struct config_group *group,
const char *name)
{
- struct list_head * entry;
- struct config_item * ret = NULL;
+ struct list_head *entry;
+ struct config_item *ret = NULL;
- list_for_each(entry,&group->cg_children) {
- struct config_item * item = to_item(entry);
+ list_for_each(entry, &group->cg_children) {
+ struct config_item *item = to_item(entry);
if (config_item_name(item) &&
!strcmp(config_item_name(item), name)) {
ret = config_item_get(item);
@@ -208,9 +211,4 @@ struct config_item *config_group_find_item(struct config_group *group,
}
return ret;
}
-
-EXPORT_SYMBOL(config_item_init);
-EXPORT_SYMBOL(config_group_init);
-EXPORT_SYMBOL(config_item_get);
-EXPORT_SYMBOL(config_item_put);
EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index ecc62178bed..f6c28583339 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -37,8 +37,7 @@
/* Random magic number */
#define CONFIGFS_MAGIC 0x62656570
-struct vfsmount * configfs_mount = NULL;
-struct super_block * configfs_sb = NULL;
+static struct vfsmount *configfs_mount = NULL;
struct kmem_cache *configfs_dir_cachep;
static int configfs_mnt_count = 0;
@@ -77,24 +76,22 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = CONFIGFS_MAGIC;
sb->s_op = &configfs_ops;
sb->s_time_gran = 1;
- configfs_sb = sb;
inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
- &configfs_root);
+ &configfs_root, sb);
if (inode) {
- inode->i_op = &configfs_dir_inode_operations;
+ inode->i_op = &configfs_root_inode_operations;
inode->i_fop = &configfs_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
} else {
- pr_debug("configfs: could not get root inode\n");
+ pr_debug("could not get root inode\n");
return -ENOMEM;
}
- root = d_alloc_root(inode);
+ root = d_make_root(inode);
if (!root) {
pr_debug("%s: could not get root dentry!\n",__func__);
- iput(inode);
return -ENOMEM;
}
config_group_init(&configfs_root_group);
@@ -117,11 +114,13 @@ static struct file_system_type configfs_fs_type = {
.mount = configfs_do_mount,
.kill_sb = kill_litter_super,
};
+MODULE_ALIAS_FS("configfs");
-int configfs_pin_fs(void)
+struct dentry *configfs_pin_fs(void)
{
- return simple_pin_fs(&configfs_fs_type, &configfs_mount,
+ int err = simple_pin_fs(&configfs_fs_type, &configfs_mount,
&configfs_mnt_count);
+ return err ? ERR_PTR(err) : configfs_mount->mnt_root;
}
void configfs_release_fs(void)
@@ -143,28 +142,26 @@ static int __init configfs_init(void)
goto out;
config_kobj = kobject_create_and_add("config", kernel_kobj);
- if (!config_kobj) {
- kmem_cache_destroy(configfs_dir_cachep);
- configfs_dir_cachep = NULL;
- goto out;
- }
+ if (!config_kobj)
+ goto out2;
+
+ err = configfs_inode_init();
+ if (err)
+ goto out3;
err = register_filesystem(&configfs_fs_type);
- if (err) {
- printk(KERN_ERR "configfs: Unable to register filesystem!\n");
- kobject_put(config_kobj);
- kmem_cache_destroy(configfs_dir_cachep);
- configfs_dir_cachep = NULL;
- goto out;
- }
+ if (err)
+ goto out4;
- err = configfs_inode_init();
- if (err) {
- unregister_filesystem(&configfs_fs_type);
- kobject_put(config_kobj);
- kmem_cache_destroy(configfs_dir_cachep);
- configfs_dir_cachep = NULL;
- }
+ return 0;
+out4:
+ pr_err("Unable to register filesystem!\n");
+ configfs_inode_exit();
+out3:
+ kobject_put(config_kobj);
+out2:
+ kmem_cache_destroy(configfs_dir_cachep);
+ configfs_dir_cachep = NULL;
out:
return err;
}
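The reworked configfs_init() replaces the duplicated cleanup in every failure branch with the usual kernel goto ladder: each failure jumps to a label that undoes only the steps that have already succeeded, in reverse order of setup. A small userspace analogue of the same shape (all names hypothetical):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Each label frees exactly what was set up before the failure point,
 * so no branch has to repeat the full cleanup sequence. */
static int init_example(int fail_at_register)
{
	void *cache, *kobj, *inode_state;
	int err = -ENOMEM;

	cache = malloc(32);
	if (!cache)
		goto out;
	kobj = malloc(32);
	if (!kobj)
		goto out_cache;
	inode_state = malloc(32);
	if (!inode_state)
		goto out_kobj;

	if (fail_at_register) {		/* stand-in for register_filesystem() */
		err = -EINVAL;
		goto out_inode;
	}
	return 0;

out_inode:
	free(inode_state);
out_kobj:
	free(kobj);
out_cache:
	free(cache);
out:
	return err;
}

int main(void)
{
	printf("%d\n", init_example(1));	/* prints a negative errno */
	return 0;
}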
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 0f3eb41d920..cc9f2546ea4 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -110,13 +110,13 @@ out:
static int get_target(const char *symname, struct path *path,
- struct config_item **target)
+ struct config_item **target, struct super_block *sb)
{
int ret;
ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path);
if (!ret) {
- if (path->dentry->d_sb == configfs_sb) {
+ if (path->dentry->d_sb == sb) {
*target = configfs_get_config_item(path->dentry);
if (!*target) {
ret = -ENOENT;
@@ -141,10 +141,6 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
struct config_item *target_item = NULL;
struct config_item_type *type;
- ret = -EPERM; /* What lack-of-symlink returns */
- if (dentry->d_parent == configfs_sb->s_root)
- goto out;
-
sd = dentry->d_parent->d_fsdata;
/*
* Fake invisibility if dir belongs to a group/default groups hierarchy
@@ -162,7 +158,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
!type->ct_item_ops->allow_link)
goto out_put;
- ret = get_target(symname, &path, &target_item);
+ ret = get_target(symname, &path, &target_item, dentry->d_sb);
if (ret)
goto out_put;
@@ -198,8 +194,6 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
if (!(sd->s_type & CONFIGFS_ITEM_LINK))
goto out;
- BUG_ON(dentry->d_parent == configfs_sb->s_root);
-
sl = sd->s_element;
parent_item = configfs_get_config_item(dentry->d_parent);
diff --git a/fs/coredump.c b/fs/coredump.c
new file mode 100644
index 00000000000..a93f7e6ea4c
--- /dev/null
+++ b/fs/coredump.c
@@ -0,0 +1,743 @@
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/swap.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/perf_event.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/key.h>
+#include <linux/personality.h>
+#include <linux/binfmts.h>
+#include <linux/coredump.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/tsacct_kern.h>
+#include <linux/cn_proc.h>
+#include <linux/audit.h>
+#include <linux/tracehook.h>
+#include <linux/kmod.h>
+#include <linux/fsnotify.h>
+#include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
+#include <linux/compat.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/tlb.h>
+#include <asm/exec.h>
+
+#include <trace/events/task.h>
+#include "internal.h"
+
+#include <trace/events/sched.h>
+
+int core_uses_pid;
+unsigned int core_pipe_limit;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+static int core_name_size = CORENAME_MAX_SIZE;
+
+struct core_name {
+ char *corename;
+ int used, size;
+};
+
+/* The maximal length of core_pattern is also specified in sysctl.c */
+
+static int expand_corename(struct core_name *cn, int size)
+{
+ char *corename = krealloc(cn->corename, size, GFP_KERNEL);
+
+ if (!corename)
+ return -ENOMEM;
+
+ if (size > core_name_size) /* racy but harmless */
+ core_name_size = size;
+
+ cn->size = ksize(corename);
+ cn->corename = corename;
+ return 0;
+}
+
+static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+{
+ int free, need;
+ va_list arg_copy;
+
+again:
+ free = cn->size - cn->used;
+
+ va_copy(arg_copy, arg);
+ need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
+ va_end(arg_copy);
+
+ if (need < free) {
+ cn->used += need;
+ return 0;
+ }
+
+ if (!expand_corename(cn, cn->size + need - free + 1))
+ goto again;
+
+ return -ENOMEM;
+}
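cn_vprintf() and expand_corename() implement a grow-and-retry formatter: print into whatever space is left, and if vsnprintf() reports truncation, enlarge the buffer by the missing amount and try again. va_copy() is needed because a va_list cannot be reused once vsnprintf() has consumed it. A self-contained userspace analogue of the same pattern (the struct and function names are made up, and realloc() stands in for krealloc()/ksize()):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct name_buf { char *buf; size_t used, size; };

static int buf_printf(struct name_buf *nb, const char *fmt, ...)
{
	va_list ap, ap_copy;
	int need;

	va_start(ap, fmt);
	for (;;) {
		size_t free_space = nb->size - nb->used;

		va_copy(ap_copy, ap);
		need = vsnprintf(nb->buf + nb->used, free_space, fmt, ap_copy);
		va_end(ap_copy);

		if (need < (int)free_space) {	/* it fit, including the NUL */
			nb->used += need;
			va_end(ap);
			return 0;
		}
		/* grow by exactly what was missing, plus the terminator */
		char *bigger = realloc(nb->buf, nb->size + need - free_space + 1);
		if (!bigger) {
			va_end(ap);
			return -1;
		}
		nb->buf = bigger;
		nb->size += need - free_space + 1;
	}
}

int main(void)
{
	struct name_buf nb = { .buf = malloc(8), .used = 0, .size = 8 };

	buf_printf(&nb, "core.%s.%d", "program-with-a-long-name", 12345);
	printf("%s\n", nb.buf);
	free(nb.buf);
	return 0;
}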
+
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+ va_list arg;
+ int ret;
+
+ va_start(arg, fmt);
+ ret = cn_vprintf(cn, fmt, arg);
+ va_end(arg);
+
+ return ret;
+}
+
+static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+{
+ int cur = cn->used;
+ va_list arg;
+ int ret;
+
+ va_start(arg, fmt);
+ ret = cn_vprintf(cn, fmt, arg);
+ va_end(arg);
+
+ for (; cur < cn->used; ++cur) {
+ if (cn->corename[cur] == '/')
+ cn->corename[cur] = '!';
+ }
+ return ret;
+}
+
+static int cn_print_exe_file(struct core_name *cn)
+{
+ struct file *exe_file;
+ char *pathbuf, *path;
+ int ret;
+
+ exe_file = get_mm_exe_file(current->mm);
+ if (!exe_file)
+ return cn_esc_printf(cn, "%s (path unknown)", current->comm);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+ if (!pathbuf) {
+ ret = -ENOMEM;
+ goto put_exe_file;
+ }
+
+ path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
+ goto free_buf;
+ }
+
+ ret = cn_esc_printf(cn, "%s", path);
+
+free_buf:
+ kfree(pathbuf);
+put_exe_file:
+ fput(exe_file);
+ return ret;
+}
+
+/* format_corename will inspect the pattern parameter, and output a
+ * name into corename, which must have space for at least
+ * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
+ */
+static int format_corename(struct core_name *cn, struct coredump_params *cprm)
+{
+ const struct cred *cred = current_cred();
+ const char *pat_ptr = core_pattern;
+ int ispipe = (*pat_ptr == '|');
+ int pid_in_pattern = 0;
+ int err = 0;
+
+ cn->used = 0;
+ cn->corename = NULL;
+ if (expand_corename(cn, core_name_size))
+ return -ENOMEM;
+ cn->corename[0] = '\0';
+
+ if (ispipe)
+ ++pat_ptr;
+
+ /* Repeat as long as we have more pattern to process and more output
+ space */
+ while (*pat_ptr) {
+ if (*pat_ptr != '%') {
+ err = cn_printf(cn, "%c", *pat_ptr++);
+ } else {
+ switch (*++pat_ptr) {
+ /* single % at the end, drop that */
+ case 0:
+ goto out;
+ /* Double percent, output one percent */
+ case '%':
+ err = cn_printf(cn, "%c", '%');
+ break;
+ /* pid */
+ case 'p':
+ pid_in_pattern = 1;
+ err = cn_printf(cn, "%d",
+ task_tgid_vnr(current));
+ break;
+ /* global pid */
+ case 'P':
+ err = cn_printf(cn, "%d",
+ task_tgid_nr(current));
+ break;
+ /* uid */
+ case 'u':
+ err = cn_printf(cn, "%d", cred->uid);
+ break;
+ /* gid */
+ case 'g':
+ err = cn_printf(cn, "%d", cred->gid);
+ break;
+ case 'd':
+ err = cn_printf(cn, "%d",
+ __get_dumpable(cprm->mm_flags));
+ break;
+ /* signal that caused the coredump */
+ case 's':
+ err = cn_printf(cn, "%ld", cprm->siginfo->si_signo);
+ break;
+ /* UNIX time of coredump */
+ case 't': {
+ struct timeval tv;
+ do_gettimeofday(&tv);
+ err = cn_printf(cn, "%lu", tv.tv_sec);
+ break;
+ }
+ /* hostname */
+ case 'h':
+ down_read(&uts_sem);
+ err = cn_esc_printf(cn, "%s",
+ utsname()->nodename);
+ up_read(&uts_sem);
+ break;
+ /* executable */
+ case 'e':
+ err = cn_esc_printf(cn, "%s", current->comm);
+ break;
+ case 'E':
+ err = cn_print_exe_file(cn);
+ break;
+ /* core limit size */
+ case 'c':
+ err = cn_printf(cn, "%lu",
+ rlimit(RLIMIT_CORE));
+ break;
+ default:
+ break;
+ }
+ ++pat_ptr;
+ }
+
+ if (err)
+ return err;
+ }
+
+out:
+ /* Backward compatibility with core_uses_pid:
+ *
+ * If core_pattern does not include a %p (as is the default)
+ * and core_uses_pid is set, then .%pid will be appended to
+ * the filename. Do not do this for piped commands. */
+ if (!ispipe && !pid_in_pattern && core_uses_pid) {
+ err = cn_printf(cn, ".%d", task_tgid_vnr(current));
+ if (err)
+ return err;
+ }
+ return ispipe;
+}
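Taken together with the trailing core_uses_pid fallback, format_corename() turns the core_pattern sysctl into the final core file name: the default pattern "core" with core_uses_pid set yields something like core.4321, a pattern such as core.%e.%p.%t expands to the executable name, the dumping thread group's pid and the time of the dump, and a leading '|' marks the rest of the string as a user-space helper command, which do_coredump() below handles via the pipe path instead of opening a file.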
+
+static int zap_process(struct task_struct *start, int exit_code)
+{
+ struct task_struct *t;
+ int nr = 0;
+
+ start->signal->group_exit_code = exit_code;
+ start->signal->group_stop_count = 0;
+
+ t = start;
+ do {
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
+ if (t != current && t->mm) {
+ sigaddset(&t->pending.signal, SIGKILL);
+ signal_wake_up(t, 1);
+ nr++;
+ }
+ } while_each_thread(start, t);
+
+ return nr;
+}
+
+static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+ struct core_state *core_state, int exit_code)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+ int nr = -EAGAIN;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (!signal_group_exit(tsk->signal)) {
+ mm->core_state = core_state;
+ nr = zap_process(tsk, exit_code);
+ tsk->signal->group_exit_task = tsk;
+ /* ignore all signals except SIGKILL, see prepare_signal() */
+ tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+ clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (unlikely(nr < 0))
+ return nr;
+
+ tsk->flags |= PF_DUMPCORE;
+ if (atomic_read(&mm->mm_users) == nr + 1)
+ goto done;
+ /*
+ * We should find and kill all tasks which use this mm, and we should
+ * count them correctly into ->nr_threads. We don't take tasklist
+ * lock, but this is safe wrt:
+ *
+ * fork:
+ * None of sub-threads can fork after zap_process(leader). All
+ * processes which were created before this point should be
+ * visible to zap_threads() because copy_process() adds the new
+ * process to the tail of init_task.tasks list, and lock/unlock
+ * of ->siglock provides a memory barrier.
+ *
+ * do_exit:
+ * The caller holds mm->mmap_sem. This means that the task which
+ * uses this mm can't pass exit_mm(), so it can't exit or clear
+ * its ->mm.
+ *
+ * de_thread:
+ * It does list_replace_rcu(&leader->tasks, &current->tasks),
+ * we must see either old or new leader, this does not matter.
+ * However, it can change p->sighand, so lock_task_sighand(p)
+ * must be used. Since p->mm != NULL and we hold ->mmap_sem
+ * it can't fail.
+ *
+ * Note also that "g" can be the old leader with ->mm == NULL
+ * and already unhashed and thus removed from ->thread_group.
+ * This is OK, __unhash_process()->list_del_rcu() does not
+ * clear the ->next pointer, we will find the new leader via
+ * next_thread().
+ */
+ rcu_read_lock();
+ for_each_process(g) {
+ if (g == tsk->group_leader)
+ continue;
+ if (g->flags & PF_KTHREAD)
+ continue;
+ p = g;
+ do {
+ if (p->mm) {
+ if (unlikely(p->mm == mm)) {
+ lock_task_sighand(p, &flags);
+ nr += zap_process(p, exit_code);
+ p->signal->flags = SIGNAL_GROUP_EXIT;
+ unlock_task_sighand(p, &flags);
+ }
+ break;
+ }
+ } while_each_thread(g, p);
+ }
+ rcu_read_unlock();
+done:
+ atomic_set(&core_state->nr_threads, nr);
+ return nr;
+}
+
+static int coredump_wait(int exit_code, struct core_state *core_state)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *mm = tsk->mm;
+ int core_waiters = -EBUSY;
+
+ init_completion(&core_state->startup);
+ core_state->dumper.task = tsk;
+ core_state->dumper.next = NULL;
+
+ down_write(&mm->mmap_sem);
+ if (!mm->core_state)
+ core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+ up_write(&mm->mmap_sem);
+
+ if (core_waiters > 0) {
+ struct core_thread *ptr;
+
+ wait_for_completion(&core_state->startup);
+ /*
+ * Wait for all the threads to become inactive, so that
+ * all the thread context (extended register state, like
+ * fpu etc) gets copied to the memory.
+ */
+ ptr = core_state->dumper.next;
+ while (ptr != NULL) {
+ wait_task_inactive(ptr->task, 0);
+ ptr = ptr->next;
+ }
+ }
+
+ return core_waiters;
+}
+
+static void coredump_finish(struct mm_struct *mm, bool core_dumped)
+{
+ struct core_thread *curr, *next;
+ struct task_struct *task;
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (core_dumped && !__fatal_signal_pending(current))
+ current->signal->group_exit_code |= 0x80;
+ current->signal->group_exit_task = NULL;
+ current->signal->flags = SIGNAL_GROUP_EXIT;
+ spin_unlock_irq(&current->sighand->siglock);
+
+ next = mm->core_state->dumper.next;
+ while ((curr = next) != NULL) {
+ next = curr->next;
+ task = curr->task;
+ /*
+ * see exit_mm(), curr->task must not see
+ * ->task == NULL before we read ->next.
+ */
+ smp_mb();
+ curr->task = NULL;
+ wake_up_process(task);
+ }
+
+ mm->core_state = NULL;
+}
+
+static bool dump_interrupted(void)
+{
+ /*
+ * SIGKILL or freezing() interrupt the coredumping. Perhaps we
+ * can do try_to_freeze() and check __fatal_signal_pending(),
+ * but then we need to teach dump_write() to restart and clear
+ * TIF_SIGPENDING.
+ */
+ return signal_pending(current);
+}
+
+static void wait_for_dump_helpers(struct file *file)
+{
+ struct pipe_inode_info *pipe = file->private_data;
+
+ pipe_lock(pipe);
+ pipe->readers++;
+ pipe->writers--;
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ pipe_unlock(pipe);
+
+ /*
+ * We actually want wait_event_freezable() but then we need
+ * to clear TIF_SIGPENDING and improve dump_interrupted().
+ */
+ wait_event_interruptible(pipe->wait, pipe->readers == 1);
+
+ pipe_lock(pipe);
+ pipe->readers--;
+ pipe->writers++;
+ pipe_unlock(pipe);
+}
+
+/*
+ * umh_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace. Specifically
+ * it sets up a pipe and installs it as fd 0 (stdin)
+ * for the process. Returns 0 on success, or
+ * PTR_ERR on failure.
+ * Note that it also sets the core limit to 1. This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+ struct file *files[2];
+ struct coredump_params *cp = (struct coredump_params *)info->data;
+ int err = create_pipe_files(files, 0);
+ if (err)
+ return err;
+
+ cp->file = files[1];
+
+ err = replace_fd(0, files[0], 0);
+ fput(files[0]);
+ /* and disallow core files too */
+ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+ return err;
+}
+
+void do_coredump(const siginfo_t *siginfo)
+{
+ struct core_state core_state;
+ struct core_name cn;
+ struct mm_struct *mm = current->mm;
+ struct linux_binfmt * binfmt;
+ const struct cred *old_cred;
+ struct cred *cred;
+ int retval = 0;
+ int flag = 0;
+ int ispipe;
+ struct files_struct *displaced;
+ bool need_nonrelative = false;
+ bool core_dumped = false;
+ static atomic_t core_dump_count = ATOMIC_INIT(0);
+ struct coredump_params cprm = {
+ .siginfo = siginfo,
+ .regs = signal_pt_regs(),
+ .limit = rlimit(RLIMIT_CORE),
+ /*
+ * We must use the same mm->flags while dumping core to avoid
+ * inconsistency of bit flags, since this flag is not protected
+ * by any locks.
+ */
+ .mm_flags = mm->flags,
+ };
+
+ audit_core_dumps(siginfo->si_signo);
+
+ binfmt = mm->binfmt;
+ if (!binfmt || !binfmt->core_dump)
+ goto fail;
+ if (!__get_dumpable(cprm.mm_flags))
+ goto fail;
+
+ cred = prepare_creds();
+ if (!cred)
+ goto fail;
+ /*
+ * We cannot trust fsuid as being the "true" uid of the process
+ * nor do we know its entire history. We only know it was tainted
+ * so we dump it as root in mode 2, and only into a controlled
+ * environment (pipe handler or fully qualified path).
+ */
+ if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
+ /* Setuid core dump mode */
+ flag = O_EXCL; /* Stop rewrite attacks */
+ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
+ need_nonrelative = true;
+ }
+
+ retval = coredump_wait(siginfo->si_signo, &core_state);
+ if (retval < 0)
+ goto fail_creds;
+
+ old_cred = override_creds(cred);
+
+ ispipe = format_corename(&cn, &cprm);
+
+ if (ispipe) {
+ int dump_count;
+ char **helper_argv;
+ struct subprocess_info *sub_info;
+
+ if (ispipe < 0) {
+ printk(KERN_WARNING "format_corename failed\n");
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_unlock;
+ }
+
+ if (cprm.limit == 1) {
+ /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
+ *
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+ * cprm.limit of 1 here as a special value; this is a
+ * consistent way to catch recursive crashes.
+ * We can still crash if the core_pattern binary sets
+ * RLIM_CORE = !1, but it runs as root, and can do
+ * lots of stupid things.
+ *
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
+ */
+ printk(KERN_WARNING
+ "Process %d(%s) has RLIMIT_CORE set to 1\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_unlock;
+ }
+ cprm.limit = RLIM_INFINITY;
+
+ dump_count = atomic_inc_return(&core_dump_count);
+ if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+ printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Skipping core dump\n");
+ goto fail_dropcount;
+ }
+
+ helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
+ if (!helper_argv) {
+ printk(KERN_WARNING "%s failed to allocate memory\n",
+ __func__);
+ goto fail_dropcount;
+ }
+
+ retval = -ENOMEM;
+ sub_info = call_usermodehelper_setup(helper_argv[0],
+ helper_argv, NULL, GFP_KERNEL,
+ umh_pipe_setup, NULL, &cprm);
+ if (sub_info)
+ retval = call_usermodehelper_exec(sub_info,
+ UMH_WAIT_EXEC);
+
+ argv_free(helper_argv);
+ if (retval) {
+ printk(KERN_INFO "Core dump to |%s pipe failed\n",
+ cn.corename);
+ goto close_fail;
+ }
+ } else {
+ struct inode *inode;
+
+ if (cprm.limit < binfmt->min_coredump)
+ goto fail_unlock;
+
+ if (need_nonrelative && cn.corename[0] != '/') {
+ printk(KERN_WARNING "Pid %d(%s) can only dump core "\
+ "to fully qualified path!\n",
+ task_tgid_vnr(current), current->comm);
+ printk(KERN_WARNING "Skipping core dump\n");
+ goto fail_unlock;
+ }
+
+ cprm.file = filp_open(cn.corename,
+ O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+ 0600);
+ if (IS_ERR(cprm.file))
+ goto fail_unlock;
+
+ inode = file_inode(cprm.file);
+ if (inode->i_nlink > 1)
+ goto close_fail;
+ if (d_unhashed(cprm.file->f_path.dentry))
+ goto close_fail;
+ /*
+ * AK: actually i see no reason to not allow this for named
+ * pipes etc, but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ goto close_fail;
+ /*
+ * Don't allow local users to get cute and trick others into
+ * coredumping into their pre-created files.
+ */
+ if (!uid_eq(inode->i_uid, current_fsuid()))
+ goto close_fail;
+ if (!cprm.file->f_op->write)
+ goto close_fail;
+ if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ goto close_fail;
+ }
+
+ /* get us an unshared descriptor table; almost always a no-op */
+ retval = unshare_files(&displaced);
+ if (retval)
+ goto close_fail;
+ if (displaced)
+ put_files_struct(displaced);
+ if (!dump_interrupted()) {
+ file_start_write(cprm.file);
+ core_dumped = binfmt->core_dump(&cprm);
+ file_end_write(cprm.file);
+ }
+ if (ispipe && core_pipe_limit)
+ wait_for_dump_helpers(cprm.file);
+close_fail:
+ if (cprm.file)
+ filp_close(cprm.file, NULL);
+fail_dropcount:
+ if (ispipe)
+ atomic_dec(&core_dump_count);
+fail_unlock:
+ kfree(cn.corename);
+ coredump_finish(mm, core_dumped);
+ revert_creds(old_cred);
+fail_creds:
+ put_cred(cred);
+fail:
+ return;
+}
+
+/*
+ * Core dumping helper functions. These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ */
+int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
+{
+ struct file *file = cprm->file;
+ loff_t pos = file->f_pos;
+ ssize_t n;
+ if (cprm->written + nr > cprm->limit)
+ return 0;
+ while (nr) {
+ if (dump_interrupted())
+ return 0;
+ n = __kernel_write(file, addr, nr, &pos);
+ if (n <= 0)
+ return 0;
+ file->f_pos = pos;
+ cprm->written += n;
+ nr -= n;
+ }
+ return 1;
+}
+EXPORT_SYMBOL(dump_emit);
+
+int dump_skip(struct coredump_params *cprm, size_t nr)
+{
+ static char zeroes[PAGE_SIZE];
+ struct file *file = cprm->file;
+ if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+ if (cprm->written + nr > cprm->limit)
+ return 0;
+ if (dump_interrupted() ||
+ file->f_op->llseek(file, nr, SEEK_CUR) < 0)
+ return 0;
+ cprm->written += nr;
+ return 1;
+ } else {
+ while (nr > PAGE_SIZE) {
+ if (!dump_emit(cprm, zeroes, PAGE_SIZE))
+ return 0;
+ nr -= PAGE_SIZE;
+ }
+ return dump_emit(cprm, zeroes, nr);
+ }
+}
+EXPORT_SYMBOL(dump_skip);
+
+int dump_align(struct coredump_params *cprm, int align)
+{
+ unsigned mod = cprm->written & (align - 1);
+ if (align & (align - 1))
+ return 0;
+ return mod ? dump_skip(cprm, align - mod) : 1;
+}
+EXPORT_SYMBOL(dump_align);
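dump_emit(), dump_skip() and dump_align() are the whole interface a binfmt core writer is expected to use: dump_emit() writes bytes and charges them against cprm->limit, dump_skip() produces a hole (an llseek() when the target supports seeking, zero bytes when dumping to a pipe), and dump_align() pads to a power-of-two boundary. A kernel-style sketch of a consumer, with a made-up format (not an actual binfmt, and not compilable on its own):

/* Hypothetical core_dump() method showing the intended call pattern;
 * 'struct toy_header' is invented for this sketch. */
static int toyfmt_core_dump(struct coredump_params *cprm)
{
	struct toy_header hdr = { .magic = 0x544f59 };

	if (!dump_emit(cprm, &hdr, sizeof(hdr)))	/* counted against cprm->limit */
		return 0;
	if (!dump_align(cprm, 4096))			/* pad to a page-sized boundary */
		return 0;
	if (!dump_skip(cprm, 2 * 4096))			/* hole: seek on files, zeroes on pipes */
		return 0;
	return 1;					/* non-zero means the dump was written */
}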
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
index cd06466f365..11b29d491b7 100644
--- a/fs/cramfs/Kconfig
+++ b/fs/cramfs/Kconfig
@@ -1,5 +1,5 @@
config CRAMFS
- tristate "Compressed ROM file system support (cramfs)"
+ tristate "Compressed ROM file system support (cramfs) (OBSOLETE)"
depends on BLOCK
select ZLIB_INFLATE
help
@@ -16,4 +16,7 @@ config CRAMFS
cramfs. Note that the root file system (the one containing the
directory /) cannot be compiled as a module.
+ This filesystem is obsoleted by SquashFS, which is much better
+ in terms of performance and features.
+
If unsure, say N.
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index e141939080f..ddcfe590b8a 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -17,15 +17,30 @@
#include <linux/init.h>
#include <linux/string.h>
#include <linux/blkdev.h>
-#include <linux/cramfs_fs.h>
#include <linux/slab.h>
-#include <linux/cramfs_fs_sb.h>
-#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/mutex.h>
-
+#include <uapi/linux/cramfs_fs.h>
#include <asm/uaccess.h>
+#include "internal.h"
+
+/*
+ * cramfs super-block data in memory
+ */
+struct cramfs_sb_info {
+ unsigned long magic;
+ unsigned long size;
+ unsigned long blocks;
+ unsigned long files;
+ unsigned long flags;
+};
+
+static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
static const struct super_operations cramfs_ops;
static const struct inode_operations cramfs_dir_inode_operations;
static const struct file_operations cramfs_directory_operations;
@@ -37,7 +52,7 @@ static DEFINE_MUTEX(read_mutex);
/* These macros may change in future, to provide better st_ino semantics. */
#define OFFSET(x) ((x)->i_ino)
-static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset)
+static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset)
{
if (!cino->offset)
return offset + 1;
@@ -61,7 +76,7 @@ static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset)
}
static struct inode *get_cramfs_inode(struct super_block *sb,
- struct cramfs_inode *cramfs_inode, unsigned int offset)
+ const struct cramfs_inode *cramfs_inode, unsigned int offset)
{
struct inode *inode;
static struct timespec zerotime;
@@ -91,8 +106,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
}
inode->i_mode = cramfs_inode->mode;
- inode->i_uid = cramfs_inode->uid;
- inode->i_gid = cramfs_inode->gid;
+ i_uid_write(inode, cramfs_inode->uid);
+ i_gid_write(inode, cramfs_inode->gid);
/* if the lower 2 bits are zero, the inode contains data */
if (!(inode->i_ino & 3)) {
@@ -180,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
struct page *page = NULL;
if (blocknr + i < devsize) {
- page = read_mapping_page_async(mapping, blocknr + i,
- NULL);
+ page = read_mapping_page(mapping, blocknr + i, NULL);
/* synchronous error? */
if (IS_ERR(page))
page = NULL;
@@ -220,14 +234,16 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
return read_buffers[buffer] + offset;
}
-static void cramfs_put_super(struct super_block *sb)
+static void cramfs_kill_sb(struct super_block *sb)
{
- kfree(sb->s_fs_info);
- sb->s_fs_info = NULL;
+ struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+ kill_block_super(sb);
+ kfree(sbi);
}
static int cramfs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_RDONLY;
return 0;
}
@@ -258,11 +274,11 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
/* Do sanity checks on the superblock */
if (super.magic != CRAMFS_MAGIC) {
- /* check for wrong endianess */
+ /* check for wrong endianness */
if (super.magic == CRAMFS_MAGIC_WEND) {
if (!silent)
- printk(KERN_ERR "cramfs: wrong endianess\n");
- goto out;
+ printk(KERN_ERR "cramfs: wrong endianness\n");
+ return -EINVAL;
}
/* check at 512 byte offset */
@@ -271,23 +287,23 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
mutex_unlock(&read_mutex);
if (super.magic != CRAMFS_MAGIC) {
if (super.magic == CRAMFS_MAGIC_WEND && !silent)
- printk(KERN_ERR "cramfs: wrong endianess\n");
+ printk(KERN_ERR "cramfs: wrong endianness\n");
else if (!silent)
printk(KERN_ERR "cramfs: wrong magic\n");
- goto out;
+ return -EINVAL;
}
}
/* get feature flags first */
if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
printk(KERN_ERR "cramfs: unsupported filesystem features\n");
- goto out;
+ return -EINVAL;
}
/* Check that the root inode is in a sane state */
if (!S_ISDIR(super.root.mode)) {
printk(KERN_ERR "cramfs: root is not a directory\n");
- goto out;
+ return -EINVAL;
}
/* correct strange, hard-coded permissions of mkcramfs */
super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
@@ -311,24 +327,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
(root_offset != 512 + sizeof(struct cramfs_super))))
{
printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset);
- goto out;
+ return -EINVAL;
}
/* Set it all up.. */
sb->s_op = &cramfs_ops;
root = get_cramfs_inode(sb, &super.root, 0);
- if (!root)
- goto out;
- sb->s_root = d_alloc_root(root);
- if (!sb->s_root) {
- iput(root);
- goto out;
- }
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root)
+ return -ENOMEM;
return 0;
-out:
- kfree(sbi);
- sb->s_fs_info = NULL;
- return -EINVAL;
}
static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -352,18 +362,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
/*
* Read a cramfs directory entry.
*/
-static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int cramfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
char *buf;
unsigned int offset;
- int copied;
/* Offset within the thing. */
- offset = filp->f_pos;
- if (offset >= inode->i_size)
+ if (ctx->pos >= inode->i_size)
return 0;
+ offset = ctx->pos;
/* Directory entries are always 4-byte aligned */
if (offset & 3)
return -EINVAL;
@@ -372,14 +381,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (!buf)
return -ENOMEM;
- copied = 0;
while (offset < inode->i_size) {
struct cramfs_inode *de;
unsigned long nextoffset;
char *name;
ino_t ino;
- mode_t mode;
- int namelen, error;
+ umode_t mode;
+ int namelen;
mutex_lock(&read_mutex);
de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
@@ -405,13 +413,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
break;
namelen--;
}
- error = filldir(dirent, buf, namelen, offset, ino, mode >> 12);
- if (error)
+ if (!dir_emit(ctx, buf, namelen, ino, mode >> 12))
break;
- offset = nextoffset;
- filp->f_pos = offset;
- copied++;
+ ctx->pos = offset = nextoffset;
}
kfree(buf);
return 0;
@@ -420,9 +425,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
/*
* Lookup and fill in the inode data..
*/
-static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
unsigned int offset = 0;
+ struct inode *inode = NULL;
int sorted;
mutex_lock(&read_mutex);
@@ -449,8 +455,8 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
for (;;) {
if (!namelen) {
- mutex_unlock(&read_mutex);
- return ERR_PTR(-EIO);
+ inode = ERR_PTR(-EIO);
+ goto out;
}
if (name[namelen-1])
break;
@@ -462,17 +468,18 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
if (retval > 0)
continue;
if (!retval) {
- struct cramfs_inode entry = *de;
- mutex_unlock(&read_mutex);
- d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off));
- return NULL;
+ inode = get_cramfs_inode(dir->i_sb, de, dir_off);
+ break;
}
/* else (retval < 0) */
if (sorted)
break;
}
+out:
mutex_unlock(&read_mutex);
- d_add(dentry, NULL);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ d_add(dentry, inode);
return NULL;
}
@@ -548,7 +555,7 @@ static const struct address_space_operations cramfs_aops = {
static const struct file_operations cramfs_directory_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = cramfs_readdir,
+ .iterate = cramfs_readdir,
};
static const struct inode_operations cramfs_dir_inode_operations = {
@@ -556,7 +563,6 @@ static const struct inode_operations cramfs_dir_inode_operations = {
};
static const struct super_operations cramfs_ops = {
- .put_super = cramfs_put_super,
.remount_fs = cramfs_remount,
.statfs = cramfs_statfs,
};
@@ -571,9 +577,10 @@ static struct file_system_type cramfs_fs_type = {
.owner = THIS_MODULE,
.name = "cramfs",
.mount = cramfs_mount,
- .kill_sb = kill_block_super,
+ .kill_sb = cramfs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("cramfs");
static int __init init_cramfs_fs(void)
{
diff --git a/fs/cramfs/internal.h b/fs/cramfs/internal.h
new file mode 100644
index 00000000000..349d7127215
--- /dev/null
+++ b/fs/cramfs/internal.h
@@ -0,0 +1,4 @@
+/* Uncompression interfaces to the underlying zlib */
+int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen);
+int cramfs_uncompress_init(void);
+void cramfs_uncompress_exit(void);
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c